mobilize-hive 1.0.11 → 1.2
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries; it is provided for informational purposes only.
- data/README.md +4 -2
- data/lib/mobilize-hive/handlers/hive.rb +249 -171
- data/lib/mobilize-hive/version.rb +1 -1
- data/mobilize-hive.gemspec +2 -2
- data/test/hive_job_rows.yml +3 -3
- data/test/mobilize-hive_test.rb +28 -3
- metadata +5 -5
data/README.md
CHANGED

@@ -146,7 +146,7 @@ Start
   script in the hql or source sheet and returns any output specified at the
   end. If the cmd or last query in source is a select statement, column headers will be
   returned as well.
-* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
+* hive.write `hql:<hql> || source:<source_path>, target:<hive_path>, partitions:<partition_path>, user:<user>, cluster:<cluster>, schema:<gsheet_path>, drop:<true/false>`,
   which writes the source or query result to the selected hive table.
   * hive_path
     * should be of the form `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
@@ -156,8 +156,10 @@ Start
     * if the file ends in .*ql, it's treated the same as passing hql
     * otherwise it is treated as a tsv with the first row as column headers
   * target:
-    *
+    * Should be a hive_path, as in `<hive_db>/<table_name>` or `<hive_db>.<table_name>`.
+  * partitions:
     * Due to Hive limitation, partition names CANNOT be reserved keywords when writing from tsv (gsheet or hdfs source)
+    * Partitions should be specified as a path, as in partitions:`<partition1>/<partition2>`.
   * schema:
     * optional. gsheet_path to column schema.
     * two columns: name, datatype
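For illustration only, a hive.write stage that exercises the new target, partitions, and drop parameters together (copied from the test fixture data/test/hive_job_rows.yml later in this diff) looks like:

    hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true, source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"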
data/lib/mobilize-hive/handlers/hive.rb
CHANGED

@@ -47,10 +47,69 @@ module Mobilize
     end
   end
 
+  def Hive.databases(cluster,user_name)
+    Hive.run(cluster,"show databases",user_name)['stdout'].split("\n")
+  end
+
+  # converts a source path or target path to a dst in the context of handler and stage
+  def Hive.path_to_dst(path,stage_path)
+    has_handler = true if path.index("://")
+    s = Stage.where(:path=>stage_path).first
+    params = s.params
+    target_path = params['target']
+    cluster = params['cluster'] if Hadoop.clusters.include?(params['cluster'].to_s)
+    is_target = true if path == target_path
+    red_path = path.split("://").last
+    first_path_node = red_path.gsub(".","/").split("/").first
+    cluster ||= Hadoop.clusters.include?(first_path_node) ? first_path_node : Hadoop.default_cluster
+    user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+    #save some time on targets
+    databases = Hive.databases(cluster,user_name) unless is_target
+    #is user has a handler, is specifying a target,
+    #or their first path node is a cluster name
+    #or their first path node is actually a database
+    #assume it's a hive pointer
+    if is_target or
+      has_handler or
+      Hadoop.clusters.include?(first_path_node) or
+      databases.include?(first_path_node)
+      #make sure cluster is legit
+      hive_url = Hive.url_by_path(red_path,user_name,is_target)
+      return Dataset.find_or_create_by_url(hive_url)
+    end
+    #otherwise, use hdfs convention
+    return Ssh.path_to_dst(path,stage_path)
+  end
+
+  def Hive.url_by_path(path,user_name,is_target=false)
+    red_path = path.gsub(".","/")
+    cluster = red_path.split("/").first.to_s
+    if Hadoop.clusters.include?(cluster)
+      #cut node out of path
+      red_path = red_path.split("/")[1..-1].join("/")
+    else
+      cluster = Hadoop.default_cluster
+    end
+    db, table = red_path.split("/")[0..-1]
+    url = "hive://#{cluster}/#{db}/#{table}"
+    begin
+      #add table stats check only if not target
+      if is_target or Hive.table_stats(cluster, db, table, user_name)['stderr'].to_s.length == 0
+        return url
+      else
+        raise "Unable to find #{url} with error: #{stat_response['stderr']}"
+      end
+    rescue => exc
+      raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
+    end
+  end
+
   #get field names and partition datatypes and size of a hive table
-  def Hive.table_stats(db,table,
-    describe_sql = "use #{db};describe extended #{table}"
-
+  def Hive.table_stats(cluster,db,table,user_name)
+    describe_sql = "use #{db};describe extended #{table};"
+    describe_response = Hive.run(cluster, describe_sql,user_name)
+    return describe_response if describe_response['stdout'].length==0
+    describe_output = describe_response['stdout']
     describe_output.split("location:").last.split(",").first
     #get location, fields, partitions
     result_hash = {}
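The new methods above thread the cluster and the resolved user name through every call. A minimal sketch of how the 1.2 signatures compose, assuming a configured Mobilize deployment; the cluster name "dev_cluster" and user "etl_user" below are hypothetical placeholders:

    # "dev_cluster" and "etl_user" are hypothetical placeholder values
    # list the databases visible on the cluster
    dbs = Mobilize::Hive.databases("dev_cluster", "etl_user")
    # field defs, partitions, location, and size for one table
    stats = Mobilize::Hive.table_stats("dev_cluster", "mobilize", "hive_test_1", "etl_user")
    # run arbitrary HQL; the result is a hash with 'stdout', 'stderr', and 'exit_code'
    result = Mobilize::Hive.run("dev_cluster", "show tables;", "etl_user")
    puts result['stdout'] if result['exit_code'] == 0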
@@ -78,12 +137,12 @@ module Mobilize
     #assign field defs after removing partitions
     result_hash['field_defs'] = field_defs
     #get size
-    result_hash['size'] = Hadoop.run("fs -dus #{result_hash['location']}",
+    result_hash['size'] = Hadoop.run(cluster,"fs -dus #{result_hash['location']}",user_name)['stdout'].split("\t").last.strip.to_i
     return result_hash
   end
 
   #run a generic hive command, with the option of passing a file hash to be locally available
-  def Hive.run(hql,
+  def Hive.run(cluster,hql,user_name,file_hash=nil)
     # no TempStatsStore
     hql = "set hive.stats.autogather=false;#{hql}"
     filename = hql.to_md5
@@ -93,22 +152,15 @@ module Mobilize
     #at hadoop read limit
     command = "#{Hive.exec_path(cluster)} -S -f #{filename} | head -c #{Hadoop.read_limit}"
     gateway_node = Hadoop.gateway_node(cluster)
-    Ssh.run(gateway_node,command,
+    Ssh.run(gateway_node,command,user_name,file_hash)
   end
 
   def Hive.run_by_stage_path(stage_path)
     s = Stage.where(:path=>stage_path).first
-    u = s.job.runner.user
     params = s.params
-    user = params['user']
     cluster = params['cluster'] || Hive.clusters.keys.first
-
-
-      raise "#{u.name} does not have su permissions for #{node}"
-    elsif user.nil? and Ssh.su_all_users(node)
-      user = u.name
-    end
-
+    user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+    job_name = s.path.sub("Runner_","")
     #slot Hive worker if available
     slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
     return false unless slot_id
@@ -122,13 +174,8 @@ module Mobilize
     if params['hql']
       hql = params['hql']
     else
-
-
-      #return blank response if there are no slots available
-      return nil unless gdrive_slot
-      source_dst = s.source_dsts(gdrive_slot).first
-      Gdrive.unslot_worker_by_path(stage_path)
-      hql = source_dst.read(user)
+      source = s.sources.first
+      hql = source.read(user_name)
     end
 
     #check for select at end
@@ -137,55 +184,59 @@ module Mobilize
     #nil if no prior commands
     prior_hql = hql_array[0..-2].join(";") if hql_array.length > 1
     select_hql = hql_array.last
-    output_table_hql = ["
+    output_table_hql = ["set mapred.job.name=#{job_name};",
+                        "drop table if exists #{output_path}",
                         "create table #{output_path} as #{select_hql};"].join(";")
     full_hql = [prior_hql, output_table_hql].compact.join(";")
-    Hive.run(full_hql,
-    #already populated, make sure dataset exists
+    result = Hive.run(cluster,full_hql, user_name)
     Dataset.find_or_create_by_url(out_url)
   else
-
-
-    Dataset.write_by_url(out_url,
+    result = Hive.run(cluster, hql, user_name)
+    Dataset.find_or_create_by_url(out_url)
+    Dataset.write_by_url(out_url,result['stdout'],user_name) if result['stdout'].to_s.length>0
   end
   #unslot worker
   Hive.unslot_worker_by_path(stage_path)
-
+  response = {}
+  response['out_url'] = out_url
+  response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+  response['signal'] = result['exit_code']
+  response
   end
 
-  def Hive.schema_hash(schema_path,
+  def Hive.schema_hash(schema_path,user_name,gdrive_slot)
     if schema_path.index("/")
       #slashes mean sheets
-      out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(
+      out_tsv = Gsheet.find_by_path(schema_path,gdrive_slot).read(user_name)
     else
-      u = User.where(:name=>
+      u = User.where(:name=>user_name).first
       #check sheets in runner
       r = u.runner
       runner_sheet = r.gbook(gdrive_slot).worksheet_by_title(schema_path)
       out_tsv = if runner_sheet
-                  runner_sheet.read(
+                  runner_sheet.read(user_name)
                 else
                   #check for gfile. will fail if there isn't one.
-                  Gfile.find_by_path(schema_path).read(
+                  Gfile.find_by_path(schema_path).read(user_name)
                 end
-      #use Gridfs to cache gdrive results
-      file_name = schema_path.split("/").last
-      out_url = "gridfs://#{schema_path}/#{file_name}"
-      Dataset.write_by_url(out_url,out_tsv,user)
-      schema_tsv = Dataset.find_by_url(out_url).read(user)
-      schema_hash = {}
-      schema_tsv.tsv_to_hash_array.each do |ha|
-        schema_hash[ha['name']] = ha['datatype']
-      end
-      schema_hash
     end
+    #use Gridfs to cache gdrive results
+    file_name = schema_path.split("/").last
+    out_url = "gridfs://#{schema_path}/#{file_name}"
+    Dataset.write_by_url(out_url,out_tsv,user_name)
+    schema_tsv = Dataset.find_by_url(out_url).read(user_name)
+    schema_hash = {}
+    schema_tsv.tsv_to_hash_array.each do |ha|
+      schema_hash[ha['name']] = ha['datatype']
+    end
+    schema_hash
   end
 
-  def Hive.path_params(cluster, path,
+  def Hive.path_params(cluster, path, user_name)
     db, table, partitions = path.gsub(".","/").split("/").ie{|sp| [sp.first, sp.second, sp[2..-1]]}
     #get existing table stats if any
     curr_stats = begin
-      Hive.table_stats(db, table,
+      Hive.table_stats(cluster, db, table, user_name)
     rescue
       nil
     end
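Note the new return contract in the hunk above: a hive.run stage now reports a structured response instead of raw output. A rough illustration of how a caller might consume it; the stage path below is a hypothetical placeholder:

    # "Runner_mobilize(test)/jobs/hive_test_1/stage2" is a hypothetical stage path
    response = Mobilize::Hive.run_by_stage_path("Runner_mobilize(test)/jobs/hive_test_1/stage2")
    if response && response['signal'] == 0
      puts "output dataset: #{response['out_url']}"
    else
      puts "stage failed, error dataset: #{response['err_url']}"
    end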
@@ -195,27 +246,34 @@ module Mobilize
              "curr_stats"=>curr_stats}
   end
 
-  def Hive.hql_to_table(cluster, source_hql,
-
-
-
-
+  def Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop=false, schema_hash=nil)
+    table_path = [db,table].join(".")
+    target_params = Hive.path_params(cluster, table_path, user_name)
+    table_stats = target_params['curr_stats']
+
+    source_hql_array = source_hql.split(";")
+    last_select_i = source_hql_array.rindex{|hql| hql.downcase.strip.starts_with?("select")}
+    #find the last select query -- it should be used for the temp table creation
+    last_select_hql = (source_hql_array[last_select_i..-1].join(";")+";")
+    #if there is anything prior to the last select, add it in prior to table creation
+    prior_hql = ((source_hql_array[0..(last_select_i-1)].join(";")+";") if last_select_i and last_select_i>=1).to_s
 
     #create temporary table so we can identify fields etc.
     temp_db = Hive.output_db(cluster)
-    temp_table_name = (source_hql+
+    temp_table_name = (source_hql+table_path).to_md5
     temp_table_path = [temp_db,temp_table_name].join(".")
+    temp_set_hql = "set mapred.job.name=#{job_name} (temp table);"
     temp_drop_hql = "drop table if exists #{temp_table_path};"
-    temp_create_hql = "#{temp_drop_hql}create table #{temp_table_path} as #{
-    Hive.run(temp_create_hql,
+    temp_create_hql = "#{temp_set_hql}#{prior_hql}#{temp_drop_hql}create table #{temp_table_path} as #{last_select_hql}"
+    Hive.run(cluster,temp_create_hql,user_name)
 
-    source_params = Hive.path_params(cluster, temp_table_path,
+    source_params = Hive.path_params(cluster, temp_table_path, user_name)
     source_table_path = ['db','table'].map{|k| source_params[k]}.join(".")
     source_table_stats = source_params['curr_stats']
     source_fields = source_table_stats['field_defs']
 
-    if
-
+    if part_array.length == 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
       #no partitions in either user params or the target table
 
       target_headers = source_fields.map{|f| f['name']}
@@ -233,21 +291,27 @@ module Mobilize
       end.join(",")})"
 
       #always drop when no partititons
-
+      target_name_hql = "set mapred.job.name=#{job_name};"
+
+      target_drop_hql = "drop table if exists #{table_path};"
 
-      target_create_hql = "create table if not exists #{
+      target_create_hql = "create table if not exists #{table_path} #{field_def_stmt};"
 
-      target_insert_hql = "insert overwrite table #{
+      target_insert_hql = "insert overwrite table #{table_path} select #{target_field_stmt} from #{source_table_path};"
 
-      target_full_hql = [
+      target_full_hql = [target_name_hql,
+                         target_drop_hql,
+                         target_create_hql,
+                         target_insert_hql,
+                         temp_drop_hql].join
 
-      Hive.run(
+      Hive.run(cluster, target_full_hql, user_name)
 
-    elsif
-
+    elsif part_array.length > 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
       #partitions and no target table or same partitions in both target table and user params
 
       field_defs = {}
       target_headers.each do |name|
@@ -260,7 +324,7 @@ module Mobilize
       end.join(",")})"
 
       part_defs = {}
-
+      part_array.each do |name|
         datatype = schema_hash[name] || "string"
         part_defs[name] = datatype
       end
@@ -271,70 +335,70 @@ module Mobilize
 
       target_field_stmt = target_headers.map{|h| "`#{h}`"}.join(",")
 
-      target_part_stmt =
+      target_part_stmt = part_array.map{|h| "`#{h}`"}.join(",")
 
-      target_set_hql = ["set
+      target_set_hql = ["set mapred.job.name=#{job_name};",
+                        "set hive.exec.dynamic.partition.mode=nonstrict;",
                         "set hive.exec.max.dynamic.partitions.pernode=1000;",
                         "set hive.exec.dynamic.partition=true;",
                         "set hive.exec.max.created.files = 200000;",
                         "set hive.max.created.files = 200000;"].join
 
-      if drop or
-        target_drop_hql = "drop table if exists #{
+      if drop or table_stats.nil?
+        target_drop_hql = "drop table if exists #{table_path};"
         target_create_hql = target_drop_hql +
-                            "create table if not exists #{
+                            "create table if not exists #{table_path} #{field_def_stmt} " +
                             "partitioned by #{part_def_stmt};"
 
       else
-        target_db,target_table = target_table_path.split(".")
         #get all the permutations of possible partititons
         part_perm_hql = "set hive.cli.print.header=true;select distinct #{target_part_stmt} from #{source_table_path};"
-        part_perm_tsv = Hive.run(
+        part_perm_tsv = Hive.run(cluster, part_perm_hql, user_name)['stdout']
         #having gotten the permutations, ensure they are dropped
         part_hash_array = part_perm_tsv.tsv_to_hash_array
         part_drop_hql = part_hash_array.map do |h|
           part_drop_stmt = h.map do |name,value|
             part_defs[name[1..-2]]=="string" ? "#{name}='#{value}'" : "#{name}=#{value}"
           end.join(",")
-          "use #{
+          "use #{db};alter table #{table} drop if exists partition (#{part_drop_stmt});"
         end.join
         target_create_hql = part_drop_hql
       end
 
-      target_insert_hql = "insert overwrite table #{
+      target_insert_hql = "insert overwrite table #{table_path} " +
                           "partition (#{target_part_stmt}) " +
                           "select #{target_field_stmt},#{target_part_stmt} from #{source_table_path};"
 
       target_full_hql = [target_set_hql, target_create_hql, target_insert_hql, temp_drop_hql].join
 
-      Hive.run(
+      Hive.run(cluster, target_full_hql, user_name)
     else
       error_msg = "Incompatible partition specs"
       raise error_msg
     end
-
+    url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+    return url
   end
 
   #turn a tsv into a hive table.
   #Accepts options to drop existing target if any
   #also schema with column datatype overrides
-  def Hive.tsv_to_table(cluster,
+  def Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop=false, schema_hash=nil)
     source_headers = source_tsv.tsv_header_array
 
-
-
-
-    target_partitions = target_params['partitions'].to_a
-    target_table_stats = target_params['curr_stats']
+    table_path = [db,table].join(".")
+    target_params = Hive.path_params(cluster, table_path, user_name)
+    table_stats = target_params['curr_stats']
 
     schema_hash ||= {}
 
-    if
-
+    if part_array.length == 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].nil?}
       #no partitions in either user params or the target table
       #or drop and start fresh
 
       #one file only, strip headers, replace tab with ctrl-a for hive
+      #get rid of freaking carriage return characters
       source_rows = source_tsv.split("\n")[1..-1].join("\n").gsub("\t","\001")
       source_tsv_filename = "000000_0"
       file_hash = {source_tsv_filename=>source_rows}
@@ -345,52 +409,52 @@ module Mobilize
       end.ie{|fs| "(#{fs.join(",")})"}
 
       #for single insert, use drop table and create table always
-      target_drop_hql = "drop table if exists #{
+      target_drop_hql = "drop table if exists #{table_path}"
 
-      target_create_hql = "create table #{
+      target_create_hql = "create table #{table_path} #{field_defs}"
 
       #load source data
-      target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{
+      target_insert_hql = "load data local inpath '#{source_tsv_filename}' overwrite into table #{table_path};"
 
       target_full_hql = [target_drop_hql,target_create_hql,target_insert_hql].join(";")
 
-      Hive.run(
+      Hive.run(cluster, target_full_hql, user_name, file_hash)
 
-    elsif
-
+    elsif part_array.length > 0 and
+      table_stats.ie{|tts| tts.nil? || drop || tts['partitions'].to_a.map{|p| p['name']} == part_array}
       #partitions and no target table
       #or same partitions in both target table and user params
       #or drop and start fresh
 
-      target_headers = source_headers.reject{|h|
+      target_headers = source_headers.reject{|h| part_array.include?(h)}
 
       field_defs = "(#{target_headers.map do |name|
         datatype = schema_hash[name] || "string"
         "`#{name}` #{datatype}"
       end.join(",")})"
 
-      partition_defs = "(#{
+      partition_defs = "(#{part_array.map do |name|
         datatype = schema_hash[name] || "string"
         "#{name} #{datatype}"
       end.join(",")})"
 
-      target_drop_hql = drop ? "drop table if exists #{
+      target_drop_hql = drop ? "drop table if exists #{table_path};" : ""
 
       target_create_hql = target_drop_hql +
-                          "create table if not exists #{
+                          "create table if not exists #{table_path} #{field_defs} " +
                           "partitioned by #{partition_defs}"
 
       #create target table early if not here
-      Hive.run(
+      Hive.run(cluster, target_create_hql, user_name)
 
-
+      table_stats = Hive.table_stats(cluster, db, table, user_name)
 
       #create data hash from source hash array
       data_hash = {}
       source_hash_array = source_tsv.tsv_to_hash_array
       source_hash_array.each do |ha|
-        tpmk =
-        tpmv = ha.reject{|k,v|
+        tpmk = part_array.map{|pn| "#{pn}=#{ha[pn]}"}.join("/")
+        tpmv = ha.reject{|k,v| part_array.include?(k)}.values.join("\001")
        if data_hash[tpmk]
          data_hash[tpmk] += "\n#{tpmv}"
        else
@@ -399,61 +463,62 @@ module Mobilize
       end
 
       #go through completed data hash and write each key value to the table in question
+      target_part_hql = ""
       data_hash.each do |tpmk,tpmv|
         base_filename = "000000_0"
         part_pairs = tpmk.split("/").map{|p| p.split("=").ie{|pa| ["#{pa.first}","#{pa.second}"]}}
         part_dir = part_pairs.map{|pp| "#{pp.first}=#{pp.second}"}.join("/")
         part_stmt = part_pairs.map{|pp| "#{pp.first}='#{pp.second}'"}.join(",")
-        hdfs_dir = "#{
-
-
+        hdfs_dir = "#{table_stats['location']}/#{part_dir}"
+        #source the partitions from a parallel load folder since filenames are all named the same
+        hdfs_source_url = "#{table_stats['location']}/part_load/#{part_dir}/#{base_filename}"
+        hdfs_target_url = hdfs_dir
         #load partition into source path
-        puts "Writing to #{
-        Hdfs.write(
+        puts "Writing to #{hdfs_source_url} for #{user_name} at #{Time.now.utc}"
+        Hdfs.write(cluster,hdfs_source_url,tpmv,user_name)
         #let Hive know where the partition is
-        target_add_part_hql = "use #{
-        target_insert_part_hql
-        target_part_hql
-
-
+        target_add_part_hql = "use #{db};alter table #{table} add if not exists partition (#{part_stmt}) location '#{hdfs_target_url}'"
+        target_insert_part_hql = "load data inpath '#{hdfs_source_url}' overwrite into table #{table} partition (#{part_stmt});"
+        target_part_hql += [target_add_part_hql,target_insert_part_hql].join(";")
+      end
+      #run actual partition adds all at once
+      if target_part_hql.length>0
+        puts "Adding partitions to #{cluster}/#{db}/#{table} for #{user_name} at #{Time.now.utc}"
+        Hive.run(cluster, target_part_hql, user_name)
       end
     else
       error_msg = "Incompatible partition specs: " +
-                  "target table:#{
-                  "user_params:#{
+                  "target table:#{table_stats['partitions'].to_s}, " +
+                  "user_params:#{part_array.to_s}"
       raise error_msg
     end
-
+    url = "hive://" + [cluster,db,table,part_array.compact.join("/")].join("/")
+    return url
   end
 
   def Hive.write_by_stage_path(stage_path)
     s = Stage.where(:path=>stage_path).first
-    u = s.job.runner.user
     params = s.params
-
-
+    source = s.sources.first
+    target = s.target
+    cluster, db, table = target.url.split("://").last.split("/")
+    #update stage with the node so we can use it
+    user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+    job_name = s.path.sub("Runner_","")
 
     #slot Hive worker if available
     slot_id = Hive.slot_worker_by_cluster_and_path(cluster,stage_path)
     return false unless slot_id
 
-
-
-
-
-
-
-
-
-    target_path = params['target']
-
-    gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
-    #return blank response if there are no slots available
-    return nil unless gdrive_slot
-    source_dst = s.source_dsts(gdrive_slot).first
-    schema_hash = params['schema'] ? Hive.schema_hash(params['schema'],user,gdrive_slot) : {}
+    schema_hash = if params['schema']
+                    gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+                    #return blank response if there are no slots available
+                    return nil unless gdrive_slot
+                    Hive.schema_hash(params['schema'],user_name,gdrive_slot)
+                  else
+                    {}
+                  end
     Gdrive.unslot_worker_by_path(stage_path)
-
     #drop target before create/insert?
     drop = params['drop']
 
@@ -461,64 +526,77 @@ module Mobilize
     source_tsv,source_hql = [nil]*2
     if params['hql']
       source_hql = params['hql']
-    elsif
-      if
+    elsif source
+      if source.handler == 'hive'
         #source table
-        cluster,source_path =
+        cluster,source_path = source.path.split("/").ie{|sp| [sp.first, sp[1..-1].join(".")]}
         source_hql = "select * from #{source_path};"
-      elsif ['gridfs','hdfs'].include?(
-        if
-          source_hql =
+      elsif ['gsheet','gridfs','hdfs'].include?(source.handler)
+        if source.path.ie{|sdp| sdp.index(/\.[A-Za-z]ql$/) or sdp.ends_with?(".ql")}
+          source_hql = source.read(user_name)
         else
           #tsv from sheet
-          source_tsv =
+          source_tsv = source.read(user_name)
         end
       end
     end
 
-
-
-
-
-
-
-
-
+    part_array = if params['partitions']
+                   params['partitions'].to_a.map{|p| p.gsub(".","/").split("/")}.flatten
+                 elsif params['target']
+                   #take the end parts of the target, that are not the cluster, db, table
+                   target_array = params['target'].gsub(".","/").split("/")
+                   [cluster,db,table].each do |term|
+                     target_array = target_array[1..-1] if target_array.first == term
+                   end
+                   target_array
+                 else
+                   []
+                 end
 
+    result = begin
+               url = if source_hql
+                       Hive.hql_to_table(cluster, db, table, part_array, source_hql, user_name, job_name, drop, schema_hash)
+                     elsif source_tsv
+                       Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop, schema_hash)
+                     else
+                       raise "Unable to determine source tsv or source hql"
+                     end
+               {'stdout'=>url,'exit_code'=>0}
+             rescue => exc
+               {'stderr'=>exc.to_s, 'exit_code'=>500}
+             end
 
     #unslot worker and write result
     Hive.unslot_worker_by_path(stage_path)
 
-
-
-
-
-
-    out_url
+    response = {}
+    response['out_url'] = Dataset.write_by_url("gridfs://#{s.path}/out",result['stdout'].to_s,Gdrive.owner_name) if result['stdout'].to_s.length>0
+    response['err_url'] = Dataset.write_by_url("gridfs://#{s.path}/err",result['stderr'].to_s,Gdrive.owner_name) if result['stderr'].to_s.length>0
+    response['signal'] = result['exit_code']
+    response
   end
 
-  def Hive.read_by_dataset_path(dst_path,
-    cluster,
-
-
-
-
-
-
-
+  def Hive.read_by_dataset_path(dst_path,user_name,*args)
+    cluster, db, table = dst_path.split("/")
+    source_path = [db,table].join(".")
+    job_name = "read #{cluster}/#{db}/#{table}"
+    set_hql = "set hive.cli.print.header=true;set mapred.job.name=#{job_name};"
+    select_hql = "select * from #{source_path};"
+    hql = [set_hql,select_hql].join
+    response = Hive.run(cluster, hql,user_name)
+    if response['exit_code']==0
+      return response['stdout']
+    else
+      raise "Unable to read hive://#{dst_path} with error: #{response['stderr']}"
+    end
   end
 
-  def Hive.write_by_dataset_path(dst_path,source_tsv,
-    cluster,
-
-      [Hive.clusters.first.first,sp.join(".")]
-    else
-      [sp.first, sp[1..-1].join(".")]
-    end
-  end
+  def Hive.write_by_dataset_path(dst_path,source_tsv,user_name,*args)
+    cluster,db,table = dst_path.split("/")
+    part_array = []
     drop = true
-    Hive.tsv_to_table(cluster,
+    Hive.tsv_to_table(cluster, db, table, part_array, source_tsv, user_name, drop)
   end
 end
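Taken together, the dataset-path API now addresses tables as cluster/db/table and returns hive:// urls. A hedged round-trip sketch of the 1.2 signatures; the cluster, database, table, and user below are hypothetical placeholders:

    # hypothetical cluster/db/table and user
    dst_path = "dev_cluster/mobilize/hive_test_1"
    tsv = "act_date\tproduct\tvalue\n2013-03-21\twidget\t1"
    # drops and recreates the table, then loads the tsv into it
    Mobilize::Hive.write_by_dataset_path(dst_path, tsv, "etl_user")
    # reads it back as a headered tsv (select * with hive.cli.print.header=true)
    puts Mobilize::Hive.read_by_dataset_path(dst_path, "etl_user")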
data/mobilize-hive.gemspec
CHANGED

@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
   gem.name          = "mobilize-hive"
   gem.version       = Mobilize::Hive::VERSION
   gem.authors       = ["Cassio Paes-Leme"]
-  gem.email         = ["cpaesleme@
+  gem.email         = ["cpaesleme@dena.com"]
   gem.description   = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.summary       = %q{Adds hive read, write, and run support to mobilize-hdfs}
   gem.homepage      = "http://github.com/dena/mobilize-hive"
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_runtime_dependency "mobilize-hdfs","1.
+  gem.add_runtime_dependency "mobilize-hdfs","1.2"
 end
data/test/hive_job_rows.yml
CHANGED

@@ -3,7 +3,7 @@
   active: true
   trigger: once
   status: ""
-  stage1: hive.write target:"mobilize/hive_test_1
+  stage1: hive.write target:"mobilize/hive_test_1", partitions:"act_date", drop:true,
           source:"Runner_mobilize(test)/hive_test_1.in", schema:"hive_test_1.schema"
   stage2: hive.run source:"hive_test_1.hql"
   stage3: hive.run hql:"show databases;"
@@ -21,6 +21,6 @@
   trigger: after hive_test_2
   status: ""
   stage1: hive.run hql:"select act_date as `date`,product,category,value from mobilize.hive_test_1;"
-  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3
-  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3
+  stage2: hive.write source:"stage1",target:"mobilize/hive_test_3", partitions:"date/product", drop:true
+  stage3: hive.write hql:"select * from mobilize.hive_test_3;",target:"mobilize/hive_test_3", partitions:"date/product", drop:false
   stage4: gsheet.write source:"hive://mobilize/hive_test_3", target:"hive_test_3.out"
data/test/mobilize-hive_test.rb
CHANGED

@@ -52,9 +52,9 @@ describe "Mobilize" do
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
     [hive_3_target_sheet].each{|s| s.delete if s}
 
-    puts "job row added, force enqueued requestor, wait
+    puts "job row added, force enqueued requestor, wait for stages"
     r.enqueue!
-
+    wait_for_stages(1200)
 
     puts "jobtracker posted data to test sheet"
     hive_1_stage_2_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_1_stage_2.out",gdrive_slot)
@@ -63,9 +63,34 @@ describe "Mobilize" do
     hive_3_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/hive_test_3.out",gdrive_slot)
 
     assert hive_1_stage_2_target_sheet.read(u.name).length == 219
-    assert hive_1_stage_3_target_sheet.read(u.name).length
+    assert hive_1_stage_3_target_sheet.read(u.name).length > 3
     assert hive_2_target_sheet.read(u.name).length == 599
     assert hive_3_target_sheet.read(u.name).length == 347
   end
 
+  def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
+    time = 0
+    time_since_stage = 0
+    #check for 10 min
+    while time < time_limit and time_since_stage < stage_limit
+      sleep wait_length
+      job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
+      if job_classes.include?("Mobilize::Stage")
+        time_since_stage = 0
+        puts "saw stage at #{time.to_s} seconds"
+      else
+        time_since_stage += wait_length
+        puts "#{time_since_stage.to_s} seconds since stage seen"
+      end
+      time += wait_length
+      puts "total wait time #{time.to_s} seconds"
+    end
+
+    if time >= time_limit
+      raise "Timed out before stage completion"
+    end
+  end
+
+
+
 end
metadata
CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hive
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: '1.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-
+date: 2013-03-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mobilize-hdfs
@@ -18,7 +18,7 @@ dependencies:
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-      version: 1.
+      version: '1.2'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -26,10 +26,10 @@ dependencies:
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-      version: 1.
+      version: '1.2'
 description: Adds hive read, write, and run support to mobilize-hdfs
 email:
-- cpaesleme@
+- cpaesleme@dena.com
 executables: []
 extensions: []
 extra_rdoc_files: []