mobilize-hive 1.0.07
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +201 -0
- data/README.md +229 -0
- data/Rakefile +19 -0
- data/lib/mobilize-hive/handlers/hive.rb +530 -0
- data/lib/mobilize-hive/tasks.rb +38 -0
- data/lib/mobilize-hive/version.rb +5 -0
- data/lib/mobilize-hive.rb +8 -0
- data/lib/samples/hive.yml +19 -0
- data/mobilize-hive.gemspec +20 -0
- data/test/hive_job_rows.yml +26 -0
- data/test/hive_test_1.hql +1 -0
- data/test/hive_test_1_in.yml +41 -0
- data/test/hive_test_1_schema.yml +3 -0
- data/test/mobilize-hive_test.rb +71 -0
- data/test/redis-test.conf +540 -0
- data/test/test_helper.rb +10 -0
- metadata +87 -0
data/lib/mobilize-hive/handlers/hive.rb
@@ -0,0 +1,530 @@
module Mobilize
  module Hive
    def Hive.config
      Base.config('hive')
    end

    def Hive.exec_path(cluster)
      Hive.clusters[cluster]['exec_path']
    end

    def Hive.output_db(cluster)
      Hive.clusters[cluster]['output_db']
    end

    def Hive.output_db_user(cluster)
      output_db_node = Hadoop.gateway_node(cluster)
      output_db_user = Ssh.host(output_db_node)['user']
      output_db_user
    end

    def Hive.clusters
      Hive.config['clusters']
    end

    def Hive.slot_ids(cluster)
      (1..Hive.clusters[cluster]['max_slots']).to_a.map{|s| "#{cluster}_#{s.to_s}"}
    end

    def Hive.slot_worker_by_cluster_and_path(cluster,path)
      working_slots = Mobilize::Resque.jobs('working').map{|j| j['hive_slot'] if (j and j['hive_slot'])}.compact
      Hive.slot_ids(cluster).each do |slot_id|
        unless working_slots.include?(slot_id)
          Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>slot_id})
          return slot_id
        end
      end
      #return false if none are available
      return false
    end

    def Hive.unslot_worker_by_path(path)
      begin
        Mobilize::Resque.set_worker_args_by_path(path,{'hive_slot'=>nil})
        return true
      rescue
        return false
      end
    end

    #get field names and partition datatypes and size of a hive table
    def Hive.table_stats(db,table,cluster,user)
      describe_sql = "use #{db};describe extended #{table}"
      describe_output = Hive.run(describe_sql,cluster,user)
      describe_output.split("location:").last.split(",").first
      #get location, fields, partitions
      result_hash = {}
      result_hash['location'] = describe_output.split("location:").last.split(",").first
      #get fields
      field_defs = describe_output.split(" \nDetailed Table Information").first.split(
                   "\n").map{|f|
                   f.strip.split("\t").ie{|fa|
                   {"name"=>fa.first,"datatype"=>fa.second} if fa.first}}.compact
      #check for partitions
      if describe_output.index("partitionKeys:[FieldSchema")
        part_field_string = describe_output.split("partitionKeys:[").last.split("]").first
        #parse weird schema using yaml plus gsubs
        yaml_fields = "---" + part_field_string.gsub("FieldSchema","\n").gsub(
                      ")","").gsub(
                      ",","\n ").gsub(
                      "(","- ").gsub(
                      "null","").gsub(
                      ":",": ")
        #return partitions without the comment part
        result_hash['partitions'] = YAML.load(yaml_fields).map{|ph| ph.delete('comment');ph}
        #get rid of fields in fields section that are also partitions
        result_hash['partitions'].map{|p| p['name']}.each{|n| field_defs.delete_if{|f| f['name']==n}}
      end
      #assign field defs after removing partitions
      result_hash['field_defs'] = field_defs
      #get size
      result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",cluster,user).split("\t").last.strip.to_i
      return result_hash
    end

    #run a generic hive command, with the option of passing a file hash to be locally available
    def Hive.run(hql,cluster,user,file_hash=nil)
      filename = hql.to_md5
      file_hash||= {}
      file_hash[filename] = hql
      #silent mode so we don't have logs in stderr; clip output
      #at hadoop read limit
      command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
      gateway_node = Hadoop.gateway_node(cluster)
      Ssh.run(gateway_node,command,user,file_hash)
    end

    def Hive.run_by_stage_path(stage_path)
      s = Stage.where(:path=>stage_path).first
      u = s.job.runner.user
      params = s.params
      user = params['user']
      cluster = params['cluster'] || Hive.clusters.keys.first
      node = Hadoop.gateway_node(cluster)
      node_user = Ssh.host(node)['user']
      if user and !Ssh.sudoers(node).include?(u.name)
        raise "#{u.name} does not have su permissions for #{node}"
      elsif user.nil? and Ssh.su_all_users(node)
        user = u.name
      end

      #slot Hive worker if available
      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
      return false unless slot_id

      #output table stores stage output
      output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
      output_path = [output_db,output_table].join(".")
      out_url = "hive://#{cluster}/#{output_db}/#{output_table}"

      #get hql
      if params['hql']
        hql = params['hql']
      else
        #user has passed in a gsheet hql
        gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
        #return blank response if there are no slots available
        return nil unless gdrive_slot
        source_dst = s.source_dsts(gdrive_slot).first
        Gdrive.unslot_worker_by_path(stage_path)
        hql = source_dst.read(user)
      end

      #check for select at end
      hql_array = hql.split(";").map{|hc| hc.strip}.reject{|hc| hc.length==0}
      if hql_array.last.downcase.starts_with?("select")
        #nil if no prior commands
        prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
        select_hql = hql_array.last
        output_table_hql = ["drop table if exists #{output_path}",
                            "create table #{output_path} as #{select_hql};"].join(";")
        full_hql = [prior_hql, output_table_hql].compact.join(";")
        Hive.run(full_hql, cluster, user)
        #make sure node user owns the stage result directory
        output_table_stats = Hive.table_stats(output_db,output_table,cluster,node_user)
        output_table_location = output_table_stats['location']
        chown_command = "#{Hadoop.exec_path(cluster)} fs -chown -R #{node_user} '#{output_table_location}'"
        Ssh.run(node,chown_command,node_user)
        #already populated, make sure dataset exists
        Dataset.find_or_create_by_url(out_url)
      else
        out_string = Hive.run(hql, cluster, user)
        out_string = "result\n#{out_string}"
        Dataset.write_by_url(out_url,out_string,node_user)
      end
      #unslot worker
      Hive.unslot_worker_by_path(stage_path)
      out_url
    end

    def Hive.schema_hash(schema_path,user,gdrive_slot)
      if schema_path.index("/")
        #slashes mean sheets
        out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user)
      else
        u = User.where(:name=>user).first
        #check sheets in runner
        r = u.runner
        runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
        out_tsv = if runner_sheet
                    runner_sheet.read(user)
                  else
                    #check for gfile. will fail if there isn't one.
                    Gfile.find_by_path(schema_path).read(user)
                  end
        #use Gridfs to cache gdrive results
        file_name = schema_path.split("/").last
        out_url = "gridfs://#{schema_path}/#{file_name}"
        Dataset.write_by_url(out_url,out_tsv,user)
        schema_tsv = Dataset.find_by_url(out_url).read(user)
        schema_hash = {}
        schema_tsv.tsv_to_hash_array.each do |ha|
          schema_hash[ha['name']] = ha['datatype']
        end
        schema_hash
      end
    end

    def Hive.path_params(cluster, path, user)
      db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
      #get existing table stats if any
      curr_stats = begin
                     Hive.table_stats(db, table, cluster, user)
                   rescue
                     nil
                   end
      {"db"=>db,
       "table"=>table,
       "partitions"=>partitions,
       "curr_stats"=>curr_stats}
    end

    def Hive.hql_to_table(cluster, source_hql, target_path, user, drop=false, schema_hash=nil)
      target_params = Hive.path_params(cluster, target_path, user)
      target_table_path = ['db','table'].map{|k| target_params[k]}.join(".")
      target_partitions = target_params['partitions'].to_a
      target_table_stats = target_params['curr_stats']

      #create temporary table so we can identify fields etc.
      temp_db = Hive.output_db(cluster)
      temp_table_name = (source_hql+target_path).to_md5
      temp_table_path = [temp_db,temp_table_name].join(".")
      temp_drop_hql = "drop table if exists #{temp_table_path};"
      temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{source_hql}"
      Hive.run(temp_create_hql,cluster,user)

      source_params = Hive.path_params(cluster, temp_table_path, user)
      source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
      source_table_stats = source_params['curr_stats']
      source_fields = source_table_stats['field_defs']

      if target_partitions.length == 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
        #no partitions in either user params or the target table

        target_headers = source_fields.map{|f| f['name']}

        target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")

        field_defs = {}
        target_headers.each do |name|
          datatype = schema_hash[name] || "string"
          field_defs[name]=datatype
        end

        field_def_stmt = "(#{field_defs.map do |name,datatype|
                             "`#{name}` #{datatype}"
                           end.join(",")})"

        #always drop when no partitions
        target_drop_hql = "drop table if exists #{target_table_path};"

        target_create_hql = "create table if not exists #{target_table_path} #{field_def_stmt};"

        target_insert_hql = "insert overwrite table #{target_table_path} select #{target_field_stmt} from #{source_table_path};"

        target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql,temp_drop_hql].join

        Hive.run(target_full_hql, cluster, user)

      elsif target_partitions.length > 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
        #partitions and no target table or same partitions in both target table and user params

        target_headers = source_fields.map{|f| f['name']}.reject{|h| target_partitions.include?(h)}

        field_defs = {}
        target_headers.each do |name|
          datatype = schema_hash[name] || "string"
          field_defs[name]=datatype
        end

        field_def_stmt = "(#{field_defs.map do |name,datatype|
                             "`#{name}` #{datatype}"
                           end.join(",")})"

        part_defs = {}
        target_partitions.each do |name|
          datatype = schema_hash[name] || "string"
          part_defs[name] = datatype
        end

        part_def_stmt = "(#{part_defs.map do |name,datatype|
                            "`#{name}` #{datatype}"
                          end.join(",")})"

        target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")

        target_part_stmt = target_partitions.map{|h| "`#{h}`"}.join(",")

        target_set_hql = ["set hive.exec.dynamic.partition.mode=nonstrict;",
                          "set hive.exec.max.dynamic.partitions.pernode=1000;",
                          "set hive.exec.dynamic.partition=true;",
                          "set hive.exec.max.created.files = 200000;",
                          "set hive.max.created.files = 200000;"].join

        if drop or target_table_stats.nil?
          target_drop_hql = "drop table if exists #{target_table_path};"
          target_create_hql = target_drop_hql +
                              "create table if not exists #{target_table_path} #{field_def_stmt} " +
                              "partitioned by #{part_def_stmt};"

        else
          target_db,target_table = target_table_path.split(".")
          #get all the permutations of possible partitions
          part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
          part_perm_tsv = Hive.run(part_perm_hql, cluster, user)
          #having gotten the permutations, ensure they are dropped
          part_hash_array = part_perm_tsv.tsv_to_hash_array
          part_drop_hql = part_hash_array.map do |h|
            part_drop_stmt = h.map do |name,value|
              part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
            end.join(",")
            "use #{target_db};alter table #{target_table} drop if exists partition (#{part_drop_stmt});"
          end.join
          target_create_hql = part_drop_hql
        end

        target_insert_hql = "insert overwrite table #{target_table_path} " +
                            "partition (#{target_part_stmt}) " +
                            "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

        target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

        Hive.run(target_full_hql, cluster, user)
      else
        error_msg = "Incompatible partition specs"
        raise error_msg
      end
      return target_path
    end

    #turn a tsv into a hive table.
    #Accepts options to drop existing target if any
    #also schema with column datatype overrides
    def Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop=false, schema_hash=nil)
      source_headers = source_tsv.tsv_header_array

      target_params = Hive.path_params(cluster, target_path, user)
      target_db,target_table = ['db','table'].map{|k| target_params[k]}
      target_table_path = [target_db,target_table].join(".")
      target_partitions = target_params['partitions'].to_a
      target_table_stats = target_params['curr_stats']

      schema_hash ||= {}

      if target_partitions.length == 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
        #no partitions in either user params or the target table
        #or drop and start fresh

        #one file only, strip headers, replace tab with ctrl-a for hive
        source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
        source_tsv_filename = "000000_0"
        file_hash = {source_tsv_filename=>source_rows}

        field_defs = source_headers.map do |name|
          datatype = schema_hash[name] || "string"
          "`#{name}` #{datatype}"
        end.ie{|fs| "(#{fs.join(",")})"}

        #for single insert, use drop table and create table always
        target_drop_hql = "drop table if exists #{target_table_path}"

        target_create_hql = "create table #{target_table_path} #{field_defs}"

        #load source data
        target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{target_table_path};"

        target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

        Hive.run(target_full_hql, cluster, user, file_hash)

      elsif target_partitions.length > 0 and
        target_table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == target_partitions}
        #partitions and no target table
        #or same partitions in both target table and user params
        #or drop and start fresh

        target_headers = source_headers.reject{|h| target_partitions.include?(h)}

        field_defs = "(#{target_headers.map do |name|
                         datatype = schema_hash[name] || "string"
                         "`#{name}` #{datatype}"
                       end.join(",")})"

        partition_defs = "(#{target_partitions.map do |name|
                             datatype = schema_hash[name] || "string"
                             "#{name} #{datatype}"
                           end.join(",")})"

        target_drop_hql = drop ? "drop table if exists #{target_table_path};" : ""

        target_create_hql = target_drop_hql +
                            "create table if not exists #{target_table_path} #{field_defs} " +
                            "partitioned by #{partition_defs}"

        #create target table early if not here
        Hive.run(target_create_hql, cluster, user)

        target_table_stats = Hive.table_stats(target_db, target_table, cluster, user)

        #create data hash from source hash array
        data_hash = {}
        source_hash_array = source_tsv.tsv_to_hash_array
        source_hash_array.each do |ha|
          tpmk = target_partitions.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
          tpmv = ha.reject{|k,v| target_partitions.include?(k)}.values.join("\001")
          if data_hash[tpmk]
            data_hash[tpmk] += "\n#{tpmv}"
          else
            data_hash[tpmk] = tpmv
          end
        end

        #go through completed data hash and write each key value to the table in question
        data_hash.each do |tpmk,tpmv|
          base_filename = "000000_0"
          part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
          part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
          part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
          hdfs_dir = "#{target_table_stats['location']}/#{part_dir}"
          hdfs_source_path = "/#{hdfs_dir.split("/")[3..-2].join("/")}/#{base_filename}"
          hdfs_target_path = "/#{hdfs_dir.split("/")[3..-1].join("/")}/#{base_filename}"
          #load partition into source path
          puts "Writing to #{hdfs_source_path} for #{user} at #{Time.now.utc}"
          Hdfs.write(hdfs_source_path,tpmv,user)
          #let Hive know where the partition is
          target_add_part_hql = "use #{target_db};alter table #{target_table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_path}'"
          target_insert_part_hql = "load data inpath '#{hdfs_source_path}' overwrite into table #{target_table} partition (#{part_stmt});"
          target_part_hql = [target_add_part_hql,target_insert_part_hql].join(";")
          puts "Adding partition #{tpmk} to #{target_table_path} for #{user} at #{Time.now.utc}"
          Hive.run(target_part_hql, cluster, user)
        end
      else
        error_msg = "Incompatible partition specs: " +
                    "target table:#{target_table_stats['partitions'].to_s}, " +
                    "user_params:#{target_partitions.to_s}"
        raise error_msg
      end
      return target_path
    end

    def Hive.write_by_stage_path(stage_path)
      s = Stage.where(:path=>stage_path).first
      u = s.job.runner.user
      params = s.params
      user = params['user']
      cluster = params['cluster'] || Hive.clusters.keys.first

      #slot Hive worker if available
      slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
      return false unless slot_id

      node = Hadoop.gateway_node(cluster)
      node_user = Ssh.host(node)['user']
      if user and !Ssh.sudoers(node).include?(u.name)
        raise "#{u.name} does not have su permissions for #{node}"
      elsif user.nil? and Ssh.su_all_users(node)
        user = u.name
      end

      #determine path for target
      target_path = params['target']

      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
      #return blank response if there are no slots available
      return nil unless gdrive_slot
      source_dst = s.source_dsts(gdrive_slot).first
      schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
      Gdrive.unslot_worker_by_path(stage_path)

      #drop target before create/insert?
      drop = params['drop']

      #determine source
      source_tsv,source_hql = [nil]*2
      if params['hql']
        source_hql = params['hql']
      elsif source_dst
        if source_dst.handler == 'hive'
          #source table
          cluster,source_path = source_dst.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
          source_hql = "select * from #{source_path};"
        elsif ['gridfs','hdfs'].include?(source_dst.handler)
          if source_dst.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
            source_hql = source_dst.read(user)
          else
            #tsv from sheet
            source_tsv = source_dst.read(user)
          end
        end
      end

      out_string = if source_hql
                     Hive.hql_to_table(cluster, source_hql, target_path, user, drop, schema_hash)
                   elsif source_tsv
                     Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop, schema_hash)
                   else
                     raise "Unable to determine source tsv or source hql"
                   end

      #unslot worker and write result
      Hive.unslot_worker_by_path(stage_path)

      #output table stores stage output
      out_string = "result\n#{out_string}"
      output_db,output_table = [Hive.output_db(cluster),stage_path.gridsafe]
      out_url = "hive://#{cluster}/#{output_db}/#{output_table}"
      Dataset.write_by_url(out_url,out_string,node_user)
      out_url
    end

    def Hive.read_by_dataset_path(dst_path,user)
      cluster,source_path = dst_path.split("/").ie do |sp|
        if sp.length == 2
          [Hive.clusters.first.first,sp.join(".")]
        else
          [sp.first, sp[1..-1].join(".")]
        end
      end
      hql = "set hive.cli.print.header=true;select * from #{source_path};"
      Hive.run(hql,cluster,user)
    end

    def Hive.write_by_dataset_path(dst_path,source_tsv,user)
      cluster,target_path = dst_path.split("/").ie do |sp|
        if sp.length == 2
          [Hive.clusters.first.first,sp.join(".")]
        else
          [sp.first, sp[1..-1].join(".")]
        end
      end
      drop = true
      Hive.tsv_to_table(cluster, source_tsv, target_path, user, drop)
    end
  end

end
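Hive.run above is the primitive every other method builds on: the HQL is keyed by its MD5 digest into a file hash, shipped to the cluster's gateway node via Ssh.run, and executed with hive -S -f, with output clipped at the Hadoop read limit. The standalone Ruby sketch below mirrors only that command assembly outside of Mobilize; exec_path and read_limit are illustrative placeholders standing in for Hive.exec_path(cluster) and Hadoop.read_limit, and the SSH dispatch is omitted.

require 'digest/md5'

# Simplified mirror of Hive.run's command assembly (illustration only,
# not part of the gem). The gem stages file_hash on the gateway node
# via Ssh.run; here we just return the pieces.
def build_hive_command(hql, exec_path: "/path/to/hive", read_limit: 1_000_000)
  filename = Digest::MD5.hexdigest(hql)   # stands in for hql.to_md5
  file_hash = { filename => hql }         # file made locally available to the hive CLI
  command = "#{exec_path} -S -f #{filename} | head -c #{read_limit}"
  [command, file_hash]
end

command, files = build_hive_command("show databases;")
puts command            # => /path/to/hive -S -f <md5-of-hql> | head -c 1000000
puts files.keys.first   # => the md5-named file holding the HQL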
data/lib/mobilize-hive/tasks.rb
@@ -0,0 +1,38 @@
namespace :mobilize_hive do
  desc "Set up config and log folders and files"
  task :setup do
    sample_dir = File.dirname(__FILE__) + '/../samples/'
    sample_files = Dir.entries(sample_dir)
    config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
    log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
    full_config_dir = "#{ENV['PWD']}/#{config_dir}"
    full_log_dir = "#{ENV['PWD']}/#{log_dir}"
    unless File.exists?(full_config_dir)
      puts "creating #{config_dir}"
      `mkdir -p #{full_config_dir}`
    end
    unless File.exists?(full_log_dir)
      puts "creating #{log_dir}"
      `mkdir -p #{full_log_dir}`
    end
    sample_files.each do |fname|
      unless File.exists?("#{full_config_dir}#{fname}")
        puts "creating #{config_dir}#{fname}"
        `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
      end
    end
    #make sure that the jobtracker.yml is updated to include the
    #mobilize-hive extension
    jt_config_file = "#{config_dir}jobtracker.yml"
    if File.exists?(jt_config_file)
      yml_hash = YAML.load_file(jt_config_file)
      yml_hash.keys.each do |k|
        if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hive')
          puts "adding mobilize-hive to jobtracker.yml/#{k}/extensions"
          yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hive']
        end
      end
      File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
    end
  end
end
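The last step of the setup task merges 'mobilize-hive' into each environment's extensions list in jobtracker.yml. Below is a minimal self-contained sketch of that merge run against an in-memory hash instead of the real config file; the environment names and pre-existing extensions are assumptions for illustration only.

require 'yaml'

# Stand-in for YAML.load_file("#{config_dir}jobtracker.yml") in the task.
yml_hash = {
  'development' => {'extensions' => ['mobilize-ssh','mobilize-hdfs']},
  'test'        => {'extensions' => ['mobilize-ssh','mobilize-hdfs']}
}

yml_hash.keys.each do |k|
  exts = yml_hash[k]['extensions']
  if exts and !exts.include?('mobilize-hive')
    puts "adding mobilize-hive to jobtracker.yml/#{k}/extensions"
    yml_hash[k]['extensions'] = exts.to_a + ['mobilize-hive']
  end
end

puts yml_hash.to_yaml   # the task writes this back to jobtracker.yml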
data/lib/samples/hive.yml
@@ -0,0 +1,19 @@
---
development:
  clusters:
    dev_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
test:
  clusters:
    test_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
production:
  clusters:
    prod_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
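Hive.config reads this file for the current environment via Base.config('hive'), Hive.clusters returns the clusters map, and Hive.slot_ids expands each cluster's max_slots into named worker slots. Below is a small standalone sketch of that derivation against the sample test block, parsing the YAML directly rather than going through Base.config.

require 'yaml'

config = YAML.load(<<-YML)
test:
  clusters:
    test_cluster:
      max_slots: 5
      temp_table_db: mobilize
      exec_path: /path/to/hive
YML

clusters = config['test']['clusters']            # what Hive.clusters returns
cluster  = clusters.keys.first                   # "test_cluster"
slot_ids = (1..clusters[cluster]['max_slots']).map{|s| "#{cluster}_#{s}"}

puts slot_ids.inspect
# => ["test_cluster_1", "test_cluster_2", "test_cluster_3", "test_cluster_4", "test_cluster_5"]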
data/mobilize-hive.gemspec
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'mobilize-hive/version'

Gem::Specification.new do |gem|
  gem.name          = "mobilize-hive"
  gem.version       = Mobilize::Hive::VERSION
  gem.authors       = ["Cassio Paes-Leme"]
  gem.email         = ["cpaesleme@ngmoco.com"]
  gem.description   = %q{Adds hive read, write, and run support to mobilize-hdfs}
  gem.summary       = %q{Adds hive read, write, and run support to mobilize-hdfs}
  gem.homepage      = "http://github.com/dena/mobilize-hive"

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  gem.add_runtime_dependency "mobilize-hdfs","1.0.07"
end
data/test/hive_job_rows.yml
@@ -0,0 +1,26 @@
---
- name: hive_test_1
  active: true
  trigger: once
  status: ""
  stage1: hive.write target:"mobilize/hive_test_1/act_date", drop:true,
    source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
  stage2: hive.run source:"hive_test_1.hql"
  stage3: hive.run hql:"show databases;"
  stage4: gsheet.write source:"stage2", target:"hive_test_1_stage_2.out"
  stage5: gsheet.write source:"stage3", target:"hive_test_1_stage_3.out"
- name: hive_test_2
  active: true
  trigger: after hive_test_1
  status: ""
  stage1: hive.write source:"hdfs://user/mobilize/test/test_hdfs_1.out", target:"mobilize.hive_test_2", drop:true
  stage2: hive.run hql:"select * from mobilize.hive_test_2;"
  stage3: gsheet.write source:"stage2", target:"hive_test_2.out"
- name: hive_test_3
  active: true
  trigger: after hive_test_2
  status: ""
  stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3/date/product", drop:true
  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3/date/product", drop:false
  stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
data/test/hive_test_1.hql
@@ -0,0 +1 @@
select act_date,product, sum(value) as sum from mobilize.hive_test_1 group by act_date,product;