mobilize-hive 1.0.11 → 1.2
- data/README.md +4 -2
- data/lib/mobilize-hive/handlers/hive.rb +249 -171
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +2 -2
- data/test/hive_job_rows.yml +3 -3
- data/test/mobilize-hive_test.rb +28 -3
- metadata +5 -5
data/README.md CHANGED

@@ -146,7 +146,7 @@ Start
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
   returned as well.
-* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
+* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, partitions:<partition_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
   which writes the source or query result to the selected hive table.
   * hive_path
     * should be of the form `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
@@ -156,8 +156,10 @@ Start
     * if the file ends in .*ql, it's treated the same as passing hql
     * otherwise it is treated as a tsv with the first row as column headers
   * target:
-    *
+    * Should be a hive_path, as in `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
+  * partitions:
     * Due to Hive limitation, partition names CANNOT be reserved keywords when writing from tsv (gsheet or hdfs source)
+    * Partitions should be specified as a path, as in partitions:`<partition1>/<partition2>`.
   * schema:
     * optional. gsheet_path to column schema.
     * two columns: name, datatype
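For illustration only, a minimal Ruby sketch of how a hive_path and a partitions path from a hive.write stage break down under the conventions above (the helper name is made up and is not part of the gem; the values come from the bundled test jobs):

```ruby
# Hypothetical helper mirroring the README conventions; not shipped with mobilize-hive.
def parse_hive_write_params(target, partitions)
  # "<hive_db>/<table_name>" and "<hive_db>.<table_name>" are treated the same
  db, table = target.gsub(".", "/").split("/")
  # partitions come in as a path, e.g. "date/product"
  part_array = partitions.to_s.split("/")
  { db: db, table: table, partitions: part_array }
end

parse_hive_write_params("mobilize/hive_test_1", "act_date")
# => {:db=>"mobilize", :table=>"hive_test_1", :partitions=>["act_date"]}
```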
data/lib/mobilize-hive/handlers/hive.rb CHANGED

@@ -47,10 +47,69 @@ module Mobilize
       end
     end

+    def Hive.databases(cluster,user_name)
+      Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+    end
+
+    # converts a source path or target path to a dst in the context of handler and stage
+    def Hive.path_to_dst(path,stage_path)
+      has_handler = true if path.index("://")
+      s = Stage.where(:path=>stage_path).first
+      params = s.params
+      target_path = params['target']
+      cluster = params['cluster'] if Hadoop.clusters.include?(params['cluster'].to_s)
+      is_target = true if path == target_path
+      red_path = path.split("://").last
+      first_path_node = red_path.gsub(".","/").split("/").first
+      cluster ||= Hadoop.clusters.include?(first_path_node) ? first_path_node : Hadoop.default_cluster
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      #save some time on targets
+      databases = Hive.databases(cluster,user_name) unless is_target
+      #is user has a handler, is specifying a target,
+      #or their first path node is a cluster name
+      #or their first path node is actually a database
+      #assume it's a hive pointer
+      if is_target or
+        has_handler or
+        Hadoop.clusters.include?(first_path_node) or
+        databases.include?(first_path_node)
+        #make sure cluster is legit
+        hive_url = Hive.url_by_path(red_path,user_name,is_target)
+        return Dataset.find_or_create_by_url(hive_url)
+      end
+      #otherwise, use hdfs convention
+      return Ssh.path_to_dst(path,stage_path)
+    end
+
+    def Hive.url_by_path(path,user_name,is_target=false)
+      red_path = path.gsub(".","/")
+      cluster = red_path.split("/").first.to_s
+      if Hadoop.clusters.include?(cluster)
+        #cut node out of path
+        red_path = red_path.split("/")[1..-1].join("/")
+      else
+        cluster = Hadoop.default_cluster
+      end
+      db, table = red_path.split("/")[0..-1]
+      url = "hive://#{cluster}/#{db}/#{table}"
+      begin
+        #add table stats check only if not target
+        if is_target or Hive.table_stats(cluster, db, table, user_name)['stderr'].to_s.length == 0
+          return url
+        else
+          raise "Unable to find #{url} with error: #{stat_response['stderr']}"
+        end
+      rescue => exc
+        raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
+      end
+    end
+
     #get field names and partition datatypes and size of a hive table
-    def Hive.table_stats(db,table,
-      describe_sql = "use #{db};describe extended #{table}"
-
+    def Hive.table_stats(cluster,db,table,user_name)
+      describe_sql = "use #{db};describe extended #{table};"
+      describe_response = Hive.run(cluster, describe_sql,user_name)
+      return describe_response if describe_response['stdout'].length==0
+      describe_output = describe_response['stdout']
       describe_output.split("location:").last.split(",").first
       #get location, fields, partitions
       result_hash = {}
@@ -78,12 +137,12 @@ module Mobilize
       #assign field defs after removing partitions
       result_hash['field_defs'] = field_defs
       #get size
-      result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",
+      result_hash['size'] = Hadoop.run(cluster,"fs -dus #{result_hash['location']}",user_name)['stdout'].split("\t").last.strip.to_i
       return result_hash
     end

     #run a generic hive command, with the option of passing a file hash to be locally available
-    def Hive.run(hql,
+    def Hive.run(cluster,hql,user_name,file_hash=nil)
       # no TempStatsStore
       hql = "set hive.stats.autogather=false;#{hql}"
       filename = hql.to_md5
@@ -93,22 +152,15 @@ module Mobilize
       #at hadoop read limit
       command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
       gateway_node = Hadoop.gateway_node(cluster)
-      Ssh.run(gateway_node,command,
+      Ssh.run(gateway_node,command,user_name,file_hash)
     end

     def Hive.run_by_stage_path(stage_path)
       s = Stage.where(:path=>stage_path).first
-      u = s.job.runner.user
       params = s.params
-      user = params['user']
       cluster = params['cluster'] || Hive.clusters.keys.first
-
-
-        raise "#{u.name} does not have su permissions for #{node}"
-      elsif user.nil? and Ssh.su_all_users(node)
-        user = u.name
-      end
-
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      job_name = s.path.sub("Runner_","")
       #slot Hive worker if available
       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
       return false unless slot_id
@@ -122,13 +174,8 @@ module Mobilize
       if params['hql']
         hql = params['hql']
       else
-
-
-        #return blank response if there are no slots available
-        return nil unless gdrive_slot
-        source_dst = s.source_dsts(gdrive_slot).first
-        Gdrive.unslot_worker_by_path(stage_path)
-        hql = source_dst.read(user)
+        source = s.sources.first
+        hql = source.read(user_name)
       end

       #check for select at end
@@ -137,55 +184,59 @@ module Mobilize
         #nil if no prior commands
         prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
         select_hql = hql_array.last
-        output_table_hql = ["
+        output_table_hql = ["set mapred.job.name=#{job_name};",
+                            "drop table if exists #{output_path}",
                             "create table #{output_path} as #{select_hql};"].join(";")
         full_hql = [prior_hql, output_table_hql].compact.join(";")
-        Hive.run(full_hql,
-        #already populated, make sure dataset exists
+        result = Hive.run(cluster,full_hql, user_name)
         Dataset.find_or_create_by_url(out_url)
       else
-
-
-        Dataset.write_by_url(out_url,
+        result = Hive.run(cluster, hql, user_name)
+        Dataset.find_or_create_by_url(out_url)
+        Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
       end
       #unslot worker
       Hive.unslot_worker_by_path(stage_path)
-
+      response = {}
+      response['out_url'] = out_url
+      response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+      response['signal'] = result['exit_code']
+      response
     end

-    def Hive.schema_hash(schema_path,
+    def Hive.schema_hash(schema_path,user_name,gdrive_slot)
       if schema_path.index("/")
         #slashes mean sheets
-        out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(
+        out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user_name)
       else
-        u = User.where(:name=>
+        u = User.where(:name=>user_name).first
         #check sheets in runner
         r = u.runner
         runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
         out_tsv = if runner_sheet
-                    runner_sheet.read(
+                    runner_sheet.read(user_name)
                   else
                     #check for gfile. will fail if there isn't one.
-                    Gfile.find_by_path(schema_path).read(
+                    Gfile.find_by_path(schema_path).read(user_name)
                   end
-        #use Gridfs to cache gdrive results
-        file_name = schema_path.split("/").last
-        out_url = "gridfs://#{schema_path}/#{file_name}"
-        Dataset.write_by_url(out_url,out_tsv,user)
-        schema_tsv = Dataset.find_by_url(out_url).read(user)
-        schema_hash = {}
-        schema_tsv.tsv_to_hash_array.each do |ha|
-          schema_hash[ha['name']] = ha['datatype']
-        end
-        schema_hash
       end
+      #use Gridfs to cache gdrive results
+      file_name = schema_path.split("/").last
+      out_url = "gridfs://#{schema_path}/#{file_name}"
+      Dataset.write_by_url(out_url,out_tsv,user_name)
+      schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+      schema_hash = {}
+      schema_tsv.tsv_to_hash_array.each do |ha|
+        schema_hash[ha['name']] = ha['datatype']
+      end
+      schema_hash
     end

-    def Hive.path_params(cluster, path,
+    def Hive.path_params(cluster, path, user_name)
       db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
       #get existing table stats if any
       curr_stats = begin
-        Hive.table_stats(db, table,
+        Hive.table_stats(cluster, db, table, user_name)
       rescue
         nil
       end
@@ -195,27 +246,34 @@ module Mobilize
        "curr_stats"=>curr_stats}
     end

-    def Hive.hql_to_table(cluster, source_hql,
-
-
-
-
+    def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+      table_path = [db,table].join(".")
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']
+
+      source_hql_array = source_hql.split(";")
+      last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
+      #find the last select query -- it should be used for the temp table creation
+      last_select_hql = (source_hql_array[last_select_i..-1].join(";")+";")
+      #if there is anything prior to the last select, add it in prior to table creation
+      prior_hql = ((source_hql_array[0..(last_select_i-1)].join(";")+";") if last_select_i and last_select_i>=1).to_s

       #create temporary table so we can identify fields etc.
       temp_db = Hive.output_db(cluster)
-      temp_table_name = (source_hql+
+      temp_table_name = (source_hql+table_path).to_md5
       temp_table_path = [temp_db,temp_table_name].join(".")
+      temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
       temp_drop_hql = "drop table if exists #{temp_table_path};"
-      temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{
-      Hive.run(temp_create_hql,
+      temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
+      Hive.run(cluster,temp_create_hql,user_name)

-      source_params = Hive.path_params(cluster, temp_table_path,
+      source_params = Hive.path_params(cluster, temp_table_path, user_name)
       source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
       source_table_stats = source_params['curr_stats']
       source_fields = source_table_stats['field_defs']

-      if
-
+      if part_array.length == 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
         #no partitions in either user params or the target table

         target_headers = source_fields.map{|f| f['name']}
@@ -233,21 +291,27 @@ module Mobilize
         end.join(",")})"

         #always drop when no partititons
-
+        target_name_hql = "set mapred.job.name=#{job_name};"
+
+        target_drop_hql = "drop table if exists #{table_path};"

-        target_create_hql = "create table if not exists #{
+        target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"

-        target_insert_hql = "insert overwrite table #{
+        target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"

-        target_full_hql = [
+        target_full_hql = [target_name_hql,
+                           target_drop_hql,
+                           target_create_hql,
+                           target_insert_hql,
+                           temp_drop_hql].join

-        Hive.run(
+        Hive.run(cluster, target_full_hql, user_name)

-      elsif
-
+      elsif part_array.length > 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table or same partitions in both target table and user params

         target_headers = source_fields.map{|f| f['name']}.reject{|h| part_array.include?(h)}

         field_defs = {}
         target_headers.each do |name|
@@ -260,7 +324,7 @@ module Mobilize
         end.join(",")})"

         part_defs = {}
-
+        part_array.each do |name|
           datatype = schema_hash[name] || "string"
           part_defs[name] = datatype
         end
@@ -271,70 +335,70 @@ module Mobilize

         target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")

-        target_part_stmt =
+        target_part_stmt = part_array.map{|h| "`#{h}`"}.join(",")

-        target_set_hql = ["set
+        target_set_hql = ["set mapred.job.name=#{job_name};",
+                          "set hive.exec.dynamic.partition.mode=nonstrict;",
                           "set hive.exec.max.dynamic.partitions.pernode=1000;",
                           "set hive.exec.dynamic.partition=true;",
                           "set hive.exec.max.created.files = 200000;",
                           "set hive.max.created.files = 200000;"].join

-        if drop or
-          target_drop_hql = "drop table if exists #{
+        if drop or table_stats.nil?
+          target_drop_hql = "drop table if exists #{table_path};"
           target_create_hql = target_drop_hql +
-                              "create table if not exists #{
+                              "create table if not exists #{table_path} #{field_def_stmt} " +
                               "partitioned by #{part_def_stmt};"

         else
-          target_db,target_table = target_table_path.split(".")
           #get all the permutations of possible partititons
           part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
-          part_perm_tsv = Hive.run(
+          part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
           #having gotten the permutations, ensure they are dropped
           part_hash_array = part_perm_tsv.tsv_to_hash_array
           part_drop_hql = part_hash_array.map do |h|
             part_drop_stmt = h.map do |name,value|
               part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
             end.join(",")
-            "use #{
+            "use #{db};alter table #{table} drop if exists partition (#{part_drop_stmt});"
           end.join
           target_create_hql = part_drop_hql
         end

-        target_insert_hql = "insert overwrite table #{
+        target_insert_hql = "insert overwrite table #{table_path} " +
                             "partition (#{target_part_stmt}) " +
                             "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"

         target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join

-        Hive.run(
+        Hive.run(cluster, target_full_hql, user_name)
       else
         error_msg = "Incompatible partition specs"
         raise error_msg
       end
-
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+      return url
     end

     #turn a tsv into a hive table.
     #Accepts options to drop existing target if any
     #also schema with column datatype overrides
-    def Hive.tsv_to_table(cluster,
+    def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
       source_headers = source_tsv.tsv_header_array

-
-
-
-      target_partitions = target_params['partitions'].to_a
-      target_table_stats = target_params['curr_stats']
+      table_path = [db,table].join(".")
+      target_params = Hive.path_params(cluster, table_path, user_name)
+      table_stats = target_params['curr_stats']

       schema_hash ||= {}

-      if
-
+      if part_array.length == 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
         #no partitions in either user params or the target table
         #or drop and start fresh

         #one file only, strip headers, replace tab with ctrl-a for hive
+        #get rid of freaking carriage return characters
         source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
         source_tsv_filename = "000000_0"
         file_hash = {source_tsv_filename=>source_rows}
@@ -345,52 +409,52 @@ module Mobilize
         end.ie{|fs| "(#{fs.join(",")})"}

         #for single insert, use drop table and create table always
-        target_drop_hql = "drop table if exists #{
+        target_drop_hql = "drop table if exists #{table_path}"

-        target_create_hql = "create table #{
+        target_create_hql = "create table #{table_path} #{field_defs}"

         #load source data
-        target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{
+        target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{table_path};"

         target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")

-        Hive.run(
+        Hive.run(cluster, target_full_hql, user_name, file_hash)

-      elsif
-
+      elsif part_array.length > 0 and
+        table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
         #partitions and no target table
         #or same partitions in both target table and user params
         #or drop and start fresh

-        target_headers = source_headers.reject{|h|
+        target_headers = source_headers.reject{|h| part_array.include?(h)}

         field_defs = "(#{target_headers.map do |name|
           datatype = schema_hash[name] || "string"
           "`#{name}` #{datatype}"
         end.join(",")})"

-        partition_defs = "(#{
+        partition_defs = "(#{part_array.map do |name|
           datatype = schema_hash[name] || "string"
           "#{name} #{datatype}"
         end.join(",")})"

-        target_drop_hql = drop ? "drop table if exists #{
+        target_drop_hql = drop ? "drop table if exists #{table_path};" : ""

         target_create_hql = target_drop_hql +
-                            "create table if not exists #{
+                            "create table if not exists #{table_path} #{field_defs} " +
                             "partitioned by #{partition_defs}"

         #create target table early if not here
-        Hive.run(
+        Hive.run(cluster, target_create_hql, user_name)

-
+        table_stats = Hive.table_stats(cluster, db, table, user_name)

         #create data hash from source hash array
         data_hash = {}
         source_hash_array = source_tsv.tsv_to_hash_array
         source_hash_array.each do |ha|
-          tpmk =
-          tpmv = ha.reject{|k,v|
+          tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+          tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
           if data_hash[tpmk]
             data_hash[tpmk] += "\n#{tpmv}"
           else
@@ -399,61 +463,62 @@ module Mobilize
         end

         #go through completed data hash and write each key value to the table in question
+        target_part_hql = ""
         data_hash.each do |tpmk,tpmv|
           base_filename = "000000_0"
           part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
           part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
           part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
-          hdfs_dir = "#{
-
-
+          hdfs_dir = "#{table_stats['location']}/#{part_dir}"
+          #source the partitions from a parallel load folder since filenames are all named the same
+          hdfs_source_url = "#{table_stats['location']}/part_load/#{part_dir}/#{base_filename}"
+          hdfs_target_url = hdfs_dir
           #load partition into source path
-          puts "Writing to #{
-          Hdfs.write(
+          puts "Writing to #{hdfs_source_url} for #{user_name} at #{Time.now.utc}"
+          Hdfs.write(cluster,hdfs_source_url,tpmv,user_name)
           #let Hive know where the partition is
-          target_add_part_hql = "use #{
-          target_insert_part_hql
-          target_part_hql
-
-
+          target_add_part_hql = "use #{db};alter table #{table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_url}'"
+          target_insert_part_hql = "load data inpath '#{hdfs_source_url}' overwrite into table #{table} partition (#{part_stmt});"
+          target_part_hql += [target_add_part_hql,target_insert_part_hql].join(";")
+        end
+        #run actual partition adds all at once
+        if target_part_hql.length>0
+          puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
+          Hive.run(cluster, target_part_hql, user_name)
         end
       else
         error_msg = "Incompatible partition specs: " +
-                    "target table:#{
-                    "user_params:#{
+                    "target table:#{table_stats['partitions'].to_s}, " +
+                    "user_params:#{part_array.to_s}"
         raise error_msg
       end
-
+      url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+      return url
     end

     def Hive.write_by_stage_path(stage_path)
       s = Stage.where(:path=>stage_path).first
-      u = s.job.runner.user
       params = s.params
-
-
+      source = s.sources.first
+      target = s.target
+      cluster, db, table = target.url.split("://").last.split("/")
+      #update stage with the node so we can use it
+      user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+      job_name = s.path.sub("Runner_","")

       #slot Hive worker if available
       slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
       return false unless slot_id

-
-
-
-
-
-
-
-
-      target_path = params['target']
-
-      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
-      source_dst = s.source_dsts(gdrive_slot).first
-      schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+      schema_hash = if params['schema']
+                      gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+                      #return blank response if there are no slots available
+                      return nil unless gdrive_slot
+                      Hive.schema_hash(params['schema'],user_name,gdrive_slot)
+                    else
+                      {}
+                    end
       Gdrive.unslot_worker_by_path(stage_path)
-
       #drop target before create/insert?
       drop = params['drop']

@@ -461,64 +526,77 @@ module Mobilize
       source_tsv,source_hql = [nil]*2
       if params['hql']
         source_hql = params['hql']
-      elsif
-        if
+      elsif source
+        if source.handler == 'hive'
           #source table
-          cluster,source_path =
+          cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
           source_hql = "select * from #{source_path};"
-        elsif ['gridfs','hdfs'].include?(
-          if
-            source_hql =
+        elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+          if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+            source_hql = source.read(user_name)
           else
             #tsv from sheet
-            source_tsv =
+            source_tsv = source.read(user_name)
           end
         end
       end

-
-
-
-
-
-
-
-
+      part_array = if params['partitions']
+                     params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
+                   elsif params['target']
+                     #take the end parts of the target, that are not the cluster, db, table
+                     target_array = params['target'].gsub(".","/").split("/")
+                     [cluster,db,table].each do |term|
+                       target_array = target_array[1..-1] if target_array.first == term
+                     end
+                     target_array
+                   else
+                     []
+                   end

+      result = begin
+                 url = if source_hql
+                         Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
+                       elsif source_tsv
+                         Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+                       else
+                         raise "Unable to determine source tsv or source hql"
+                       end
+                 {'stdout'=>url,'exit_code'=>0}
+               rescue => exc
+                 {'stderr'=>exc.to_s, 'exit_code'=>500}
+               end

       #unslot worker and write result
       Hive.unslot_worker_by_path(stage_path)

-
-
-
-
-
-      out_url
+      response = {}
+      response['out_url'] = Dataset.write_by_url("gridfs://#{s.path}/out",result['stdout'].to_s,Gdrive.owner_name) if result['stdout'].to_s.length>0
+      response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+      response['signal'] = result['exit_code']
+      response
     end

-    def Hive.read_by_dataset_path(dst_path,
-      cluster,
-
-
-
-
-
-
-
+    def Hive.read_by_dataset_path(dst_path,user_name,*args)
+      cluster, db, table = dst_path.split("/")
+      source_path = [db,table].join(".")
+      job_name = "read #{cluster}/#{db}/#{table}"
+      set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name};"
+      select_hql = "select * from #{source_path};"
+      hql = [set_hql,select_hql].join
+      response = Hive.run(cluster, hql,user_name)
+      if response['exit_code']==0
+        return response['stdout']
+      else
+        raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+      end
     end

-    def Hive.write_by_dataset_path(dst_path,source_tsv,
-      cluster,
-
-        [Hive.clusters.first.first,sp.join(".")]
-      else
-        [sp.first, sp[1..-1].join(".")]
-      end
-      end
+    def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
+      cluster,db,table = dst_path.split("/")
+      part_array = []
       drop = true
-      Hive.tsv_to_table(cluster,
+      Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop)
     end
   end
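A quick usage sketch of the reworked interface (assumptions: a cluster key "dev" present in Hadoop.clusters and a Unix user "etl_user"; both are placeholders, not values from this release). Hive.run now takes the cluster and user explicitly and returns a hash, which is how Hive.databases and Hive.read_by_dataset_path consume it:

```ruby
# Sketch only; "dev" and "etl_user" are placeholder values.
response = Mobilize::Hive.run("dev", "show databases;", "etl_user")

if response['exit_code'] == 0
  # Hive.databases performs exactly this split on stdout
  databases = response['stdout'].split("\n")
  puts databases.inspect
else
  raise "hive command failed: #{response['stderr']}"
end
```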
data/mobilize-hive.gemspec CHANGED

@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
   gem.name = "mobilize-hive"
   gem.version = Mobilize::Hive::VERSION
   gem.authors = ["Cassio Paes-Leme"]
-  gem.email = ["cpaesleme@
+  gem.email = ["cpaesleme@dena.com"]
   gem.description = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.summary = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.homepage = "http://github.com/dena/mobilize-hive"
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.2"
 end
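To consume this release from an application, a Gemfile pin along these lines would do (sketch; bundler resolves mobilize-hdfs 1.2 automatically via the runtime dependency above):

```ruby
# Gemfile
gem "mobilize-hive", "1.2"
```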
data/test/hive_job_rows.yml CHANGED

@@ -3,7 +3,7 @@
   active: true
   trigger: once
   status: ""
-  stage1: hive.write target:"mobilize/hive_test_1
+  stage1: hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true,
           source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
   stage2: hive.run source:"hive_test_1.hql"
   stage3: hive.run hql:"show databases;"
@@ -21,6 +21,6 @@
   trigger: after hive_test_2
   status: ""
   stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
-  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3
-  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3
+  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
+  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
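As a sketch of what these job rows resolve to: the handler turns the partitions path into an array and appends it to the hive URL it returns, per the part_array and url logic in hive.rb above. The cluster name below is a placeholder:

```ruby
partitions = "date/product"          # as written in stage2/stage3 above
part_array = partitions.split("/")   # => ["date", "product"]

# URL shape returned by Hive.tsv_to_table / Hive.hql_to_table for this write;
# "dev" stands in for whichever cluster the job actually runs on.
url = "hive://" + ["dev", "mobilize", "hive_test_3", part_array.join("/")].join("/")
# => "hive://dev/mobilize/hive_test_3/date/product"
```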
data/test/mobilize-hive_test.rb CHANGED

@@ -52,9 +52,9 @@ describe "Mobilize" do
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}

-    puts "job row added, force enqueued requestor, wait
+    puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-
+    wait_for_stages(1200)

     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
@@ -63,9 +63,34 @@ describe "Mobilize" do
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)

     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
-    assert hive_1_stage_3_target_sheet.read(u.name).length
+    assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
   end

+  def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
+    time = 0
+    time_since_stage = 0
+    #check for 10 min
+    while time < time_limit and time_since_stage < stage_limit
+      sleep wait_length
+      job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
+      if job_classes.include?("Mobilize::Stage")
+        time_since_stage = 0
+        puts "saw stage at #{time.to_s} seconds"
+      else
+        time_since_stage += wait_length
+        puts "#{time_since_stage.to_s} seconds since stage seen"
+      end
+      time += wait_length
+      puts "total wait time #{time.to_s} seconds"
+    end
+
+    if time >= time_limit
+      raise "Timed out before stage completion"
+    end
+  end
+
+
+
 end
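The new wait_for_stages helper polls Resque instead of sleeping for a fixed interval; its three arguments are the overall timeout, the allowed quiet period with no Stage jobs seen, and the polling interval. For example (values other than the test's own 1200 are illustrative):

```ruby
# as used in the test: up to 1200s total, default 120s quiet limit, poll every 10s
wait_for_stages(1200)

# a tighter variant for a quick local run (illustrative values)
wait_for_stages(300, 60, 5)
```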
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: '1.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-
+date: 2013-03-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 1.
+        version: '1.2'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,10 +26,10 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 1.
+        version: '1.2'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
-- cpaesleme@
+- cpaesleme@dena.com
 executables: []
 extensions: []
 extra_rdoc_files: []