hodor 1.0.2

Files changed (73)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/Guardfile +11 -0
  10. data/README.md +105 -0
  11. data/Rakefile +105 -0
  12. data/bin/hodor +18 -0
  13. data/hodor.gemspec +47 -0
  14. data/lib/config/log4r_config.xml +35 -0
  15. data/lib/hodor.rb +83 -0
  16. data/lib/hodor/api/hdfs.rb +222 -0
  17. data/lib/hodor/api/oozie.rb +215 -0
  18. data/lib/hodor/api/oozie/action.rb +52 -0
  19. data/lib/hodor/api/oozie/bundle.rb +27 -0
  20. data/lib/hodor/api/oozie/coordinator.rb +53 -0
  21. data/lib/hodor/api/oozie/hadoop_job.rb +29 -0
  22. data/lib/hodor/api/oozie/job.rb +192 -0
  23. data/lib/hodor/api/oozie/materialization.rb +56 -0
  24. data/lib/hodor/api/oozie/query.rb +115 -0
  25. data/lib/hodor/api/oozie/session.rb +170 -0
  26. data/lib/hodor/api/oozie/workflow.rb +58 -0
  27. data/lib/hodor/cli.rb +146 -0
  28. data/lib/hodor/command.rb +164 -0
  29. data/lib/hodor/configuration.rb +80 -0
  30. data/lib/hodor/environment.rb +437 -0
  31. data/lib/hodor/ui/table.rb +130 -0
  32. data/lib/hodor/version.rb +3 -0
  33. data/lib/tasks/hdfs.thor +138 -0
  34. data/lib/tasks/master.thor +61 -0
  35. data/lib/tasks/oozie.thor +399 -0
  36. data/lib/tasks/sandbox.thor +87 -0
  37. data/spec/integration/api/oozie/action_spec.rb +69 -0
  38. data/spec/integration/api/oozie/bundle_spec.rb +33 -0
  39. data/spec/integration/api/oozie/coordinator_spec.rb +66 -0
  40. data/spec/integration/api/oozie/hadoop_job_spec.rb +29 -0
  41. data/spec/integration/api/oozie/job_spec.rb +15 -0
  42. data/spec/integration/api/oozie/materialization_spec.rb +66 -0
  43. data/spec/integration/api/oozie/query_spec.rb +43 -0
  44. data/spec/integration/api/oozie/session_spec.rb +18 -0
  45. data/spec/integration/api/oozie/workflow_spec.rb +65 -0
  46. data/spec/integration/api/oozie_spec.rb +198 -0
  47. data/spec/integration/fixtures/api/running_coordinators/req_resp_00.memo +6 -0
  48. data/spec/integration/fixtures/api/sample_action/req_resp_00.memo +5 -0
  49. data/spec/integration/fixtures/api/sample_action/req_resp_01.memo +7 -0
  50. data/spec/integration/fixtures/api/sample_bundle/req_resp_00.memo +6 -0
  51. data/spec/integration/fixtures/api/sample_coordinator/req_resp_00.memo +5 -0
  52. data/spec/integration/fixtures/api/sample_materialization/req_resp_00.memo +5 -0
  53. data/spec/integration/fixtures/api/sample_materialization/req_resp_01.memo +7 -0
  54. data/spec/integration/fixtures/api/sample_workflow/req_resp_00.memo +5 -0
  55. data/spec/spec_helper.rb +92 -0
  56. data/spec/support/d_v_r.rb +125 -0
  57. data/spec/support/hodor_api.rb +15 -0
  58. data/spec/unit/hodor/api/hdfs_spec.rb +63 -0
  59. data/spec/unit/hodor/api/oozie_spec.rb +32 -0
  60. data/spec/unit/hodor/environment_spec.rb +52 -0
  61. data/topics/hdfs/corresponding_paths.txt +31 -0
  62. data/topics/hdfs/overview.txt +10 -0
  63. data/topics/master/clusters.yml.txt +36 -0
  64. data/topics/master/overview.txt +17 -0
  65. data/topics/oozie/blocking_coordinators.txt +46 -0
  66. data/topics/oozie/composing_job_properties.txt +68 -0
  67. data/topics/oozie/display_job.txt +52 -0
  68. data/topics/oozie/driver_scenarios.txt +42 -0
  69. data/topics/oozie/inspecting_jobs.txt +59 -0
  70. data/topics/oozie/jobs.yml.txt +185 -0
  71. data/topics/oozie/overview.txt +43 -0
  72. data/topics/oozie/workers_and_drivers.txt +40 -0
  73. metadata +455 -0
data/lib/hodor/api/hdfs.rb
@@ -0,0 +1,222 @@
+require 'hodor'
+require 'singleton'
+
+# curl -i "http://sample_domain.com:50070/webhdfs/v1/pipeline?op=LISTSTATUS"
+module Hodor
+
+  # HDFS Api wrapper
+  class Hdfs
+    include Singleton
+
+    def env
+      Hodor::Environment.instance
+    end
+
+    def logger
+      env.logger
+    end
+
+    def hdfs_root
+      env.settings[:hdfs_root]
+    end
+
+    def pwd
+      "#{hdfs_root}#{env.pwd}"
+    end
+
+    def path_on_hdfs(file)
+      git_path = env.path_on_github(file)
+      "#{hdfs_root}/#{git_path}".sub(/\/\/\//, '/').sub(/\/\//, '/').sub(/\/\.\//, '/').sub(/\/\.$/, '')
+    end
+
+    def user
+      env.settings[:hdfs_user]
+    end
+
+    def target
+      env.settings[:target]
+    end
+
+    class FailedToRemovePath < Hodor::NestedError; end
+
+    def rm(path)
+      dest_path = path_on_hdfs(path || ".")
+      rm_path_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -rm -skipTrash #{dest_path}]
+      env.ssh rm_path_script
+    rescue StandardError => ex
+      raise FailedToRemovePath.new ex,
+            msg: "Unable to remove HDFS path.",
+            ssh_user: env.ssh_user,
+            path_to_remove: dest_path
+    end
+
+    def rm_f(path)
+      dest_path = path_on_hdfs(path || ".")
+      rm_path_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -rm -f -skipTrash #{dest_path}]
+      env.ssh rm_path_script
+    rescue StandardError => ex
+      raise FailedToRemovePath.new ex,
+            msg: "Unable to remove HDFS path.",
+            ssh_user: env.ssh_user,
+            path_to_remove: dest_path
+    end
+
+    def rm_rf(path)
+      hdfs_path = path_on_hdfs(path || ".")
+      rm_path_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -rm -f -R -skipTrash #{hdfs_path}]
+      env.ssh rm_path_script
+    rescue StandardError => ex
+      raise FailedToRemovePath.new ex,
+            msg: "Unable to remove HDFS path.",
+            ssh_user: env.ssh_user,
+            path_to_remove: hdfs_path
+    end
+
+    def ls
+      dest_path = path_on_hdfs(".")
+      ls_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -ls #{dest_path}]
+      env.ssh ls_script, echo: true
+    rescue StandardError => ex
+      raise FailedToRemovePath.new ex,
+            msg: "Unable to list HDFS path.",
+            ssh_user: env.ssh_user,
+            path_to_list: dest_path
+    end
+
+    class FailedToPutFile < Hodor::NestedError; end
+
+    # put_file
+    #   Puts a local file on HDFS, preserving path and replacing if necessary. Files
+    #   with .erb extensions are ERB expanded before deployment.
+    def put_file(file, options = {})
+
+      disc_path = env.path_on_disc(file)
+      hdfs_path = path_on_hdfs(file)
+      git_path = env.path_on_github(file)
+
+      raise "File '#{disc_path}' not found." if !File.exists?(disc_path)
+
+      logger.info "\tdeploying '#{git_path}'"
+
+      src_file = file
+      if disc_path.end_with?('.erb')
+        erb_expanded = env.erb_load(disc_path)
+        src_file = "/tmp/#{File.basename(disc_path.sub(/\.erb$/,''))}"
+        hdfs_path.sub!(/\.erb$/, '')
+        puts "ends with erb srcfile = #{src_file}"
+        File.open(src_file, 'w') { |f| f.write(erb_expanded) }
+      end
+
+      raise "File '#{src_file}' not found." if !File.exists?(src_file)
+
+      put_script = "HADOOP_USER_NAME=#{user} hadoop fs -put - #{hdfs_path}"
+      unless options[:already_cleaned]
+        rm_script = "HADOOP_USER_NAME=#{user} hadoop fs -rm -f #{hdfs_path}; "
+        put_script = rm_script + put_script
+      end
+
+      env.run_local %Q[cat #{src_file} | ssh #{env.ssh_addr} "#{put_script}"],
+                    echo: true, echo_cmd: true
+    rescue StandardError => ex
+      raise FailedToPutFile.new ex,
+            msg: "Unable to write file to HDFS.",
+            ssh_user: env.ssh_user,
+            path_on_disc: disc_path,
+            path_on_github: git_path,
+            path_on_hdfs: hdfs_path,
+            src_file: src_file
+    end
+
+    class FailedToPutDir < Hodor::NestedError; end
+
+    def put_dir(path, options)
+      if env.dryrun? and env.verbose?
+        logger.info ""
+        logger.info " ********************* Dry Run *********************"
+        logger.info ""
+      end
+
+      disc_path = env.path_on_disc(path)
+      git_path = env.path_on_github(path)
+      hdfs_path = path_on_hdfs(path)
+
+      sync_file = "#{disc_path}/.hdfs-#{target}.sync"
+
+      logger.info "Deploying: #{git_path}" unless env.silent?
+
+      fail "Path '#{disc_path}' not found." unless File.exists?(disc_path)
+      fail "Path '#{disc_path}' exists but is not a directory." unless File.directory?(disc_path)
+
+      if env.clean?
+        logger.info "   cleaning: #{git_path}"
+        FileUtils.rm_f sync_file unless env.dryrun?
+        rm_rf(git_path)
+        clean_done = true
+      end
+
+      fargs = if sync_file && File.exists?(sync_file) && !env.clean?
+        "-newer '#{sync_file}'"
+      else
+        ""
+      end
+      fargs << " -maxdepth #{options[:maxdepth]}" unless options[:maxdepth].nil?
+      mod_files = env.run_local %Q[find #{disc_path} #{fargs} -type f]
+      mod_files.split("\n").each { |file|
+        basename = File.basename(file)
+        next if basename.start_with?('job.properties') ||
+                basename.eql?("run.properties") ||
+                basename.eql?(".DS_Store") ||
+                basename.eql?(".bak") ||
+                basename.eql?(".tmp") ||
+                basename.eql?(".hdfs") ||
+                basename.eql?("Rakefile") ||
+                basename.end_with?(".sync") ||
+                file.include?("migrations/") ||
+                file.include?(".bak/") ||
+                file.include?(".tmp/")
+        put_file(file, already_cleaned: clean_done)
+      }
+    rescue StandardError => ex
+      raise FailedToPutDir.new ex,
+            msg: "Unable to write directory to HDFS.",
+            ssh_user: env.ssh_user,
+            path_on_disc: disc_path,
+            path_on_github: git_path,
+            path_on_hdfs: hdfs_path,
+            sync_file: sync_file,
+            max_depth: options[:maxdepth],
+            clean: env.clean? ? "true" : "false"
+    else
+      env.run_local %Q[touch '#{sync_file}'] unless env.dryrun?
+    end
+
+    class FailedToGetFile < Hodor::NestedError; end
+
+    # get
+    #   Gets a file from HDFS and copies it to a local file
+    def get_file(file, options = {})
+      disc_path = env.path_on_disc(file)
+      hdfs_path = path_on_hdfs(file)
+      git_path = env.path_on_github(file)
+      dest_path = "#{file}.hdfs_copy"
+
+      logger.info "\tgetting '#{git_path}'. Writing to '#{dest_path}'."
+
+      get_script = %Q["rm -f #{dest_path}; HADOOP_USER_NAME=#{user} hadoop fs -get #{hdfs_path} #{dest_path}"]
+      env.ssh get_script, echo: true, echo_cmd: true
+      if options[:clobber]
+        FileUtils.rm_f dest_path
+      end
+      env.run_local %Q[scp #{env.ssh_user}@#{env[:ssh_host]}:#{dest_path} .],
+                    echo: true, echo_cmd: true
+    rescue StandardError => ex
+      raise FailedToGetFile.new ex,
+            msg: "Unable to get file from HDFS.",
+            ssh_user: env.ssh_user,
+            path_on_disc: disc_path,
+            path_on_github: git_path,
+            path_on_hdfs: hdfs_path,
+            dest_file: dest_path
+    end
+  end
+end
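
A minimal usage sketch for the Hdfs wrapper above, assuming a configured target in clusters.yml and SSH access to the cluster's master node (the repo-relative paths here are hypothetical):

    require 'hodor'

    hdfs = Hodor::Hdfs.instance                   # singleton; shells out to `hadoop fs` over SSH
    hdfs.put_file('workers/aggregate.pig.erb')    # .erb sources are ERB-expanded before upload
    hdfs.put_dir('drivers/nightly', maxdepth: 2)  # skips job.properties, *.sync, .bak/, .tmp/, etc.
    hdfs.ls                                       # list the HDFS path corresponding to the cwd
    hdfs.rm_rf('drivers/nightly')                 # recursive remove, bypassing the trash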
data/lib/hodor/api/oozie.rb
@@ -0,0 +1,215 @@
+require_relative "oozie/job"
+require_relative "oozie/query"
+require_relative "oozie/session"
+require_relative "oozie/bundle"
+require_relative "oozie/coordinator"
+require_relative "oozie/materialization"
+require_relative "oozie/workflow"
+require_relative "oozie/action"
+require_relative "oozie/hadoop_job"
+require_relative "hdfs"
+
+module Hodor::Oozie
+
+  class << self
+
+    def env
+      Hodor::Environment.instance
+    end
+
+    def session
+      Hodor::Oozie::Session.instance
+    end
+
+    def hdfs
+      Hodor::Hdfs.instance
+    end
+
+    def logger
+      env.logger
+    end
+
+    def build_rest_param(filter)
+      params = filter.map { |match|
+        case match
+        when :killed; "status%3DKILLED"
+        when :succeeded; "status%3DSUCCEEDED"
+        end
+      }.compact   # drop any filter symbols we don't recognize
+
+      if params.size > 0
+        "filter=#{params.join(';')}"
+      else
+        ""
+      end
+    end
+
+    def job_by_id(job_id, filter = nil)
+      if job_id.nil?
+        result = Hodor::Oozie::Query.new status: [:running_first]
+      else
+        if job_id =~ /job_\d+/
+          result = HadoopJob.new(session.current_id, job_id)
+        else
+          if filter
+            response = session.get_job_state(job_id, build_rest_param(filter))
+          else
+            response = session.get_job_state(job_id)
+          end
+          json = JSON.parse(response)
+          job_type = json["toString"]
+          case job_type.split(" ").first.downcase.to_sym
+          when :bundle
+            result = Bundle.new(json)
+          when :coordinator
+            result = Coordinator.new(json)
+          when :workflow
+            result = Workflow.new(json)
+          when :action
+            result = Action.new(json)
+          when :coordinatoraction
+            result = Materialization.new(json)
+          else
+          end
+        end
+      end
+    end
+
+    def job_by_path(job_path, make_current = false, filter = nil)
+      if job_path.nil? || job_path.eql?(".")
+        movement = :none
+      elsif job_path.eql?("/")
+        movement = :root
+      elsif job_path.eql?("b") || job_path.eql?("back") ||
+            job_path.eql?("u") || job_path.eql?("up") || job_path.eql?("..")
+        movement = :up
+      elsif job_path.eql?("d") || job_path.eql?("down") ||
+            job_path.eql?("f") || job_path.eql?("forward") || job_path.length < 5
+        movement = :down
+      else
+        movement = :jump
+      end
+
+      job_id = session.job_relative(movement, job_path)
+      job = job_by_id(job_id, filter)
+      session.make_current(job) if make_current
+      job
+    end
+
+    def change_job(job_path, filter = nil)
+      job_by_path(job_path, true, filter)
+    end
+
+    def select_job(job)
+      # load jobs.yml file
+      pwd = Dir.pwd
+      if File.exists?("jobs.yml")
+        jobs = env.yml_load(File.expand_path('jobs.yml', pwd))
+        marked_jobs = jobs.keys.select { |key| key.start_with?('^') }
+        marked_jobs.each { |mark|
+          jobs[mark[1..-1]] = jobs[mark]
+        }
+        if job.nil?
+          # No job explicitly specified, so look for a
+          # marked job (i.e. a job starting with ^)
+          jobs.each_pair { |key, val|
+            if key.to_s.strip.start_with?('^')
+              job = key.to_s
+            end
+          }
+          fail "You must specify which job from jobs.yml to run" if !job
+        end
+        jobs = jobs.symbolize_keys
+        if !jobs.has_key?(job.to_sym)
+          caret = "^#{job.to_s}"
+          fail "Job '#{job}' was not defined in jobs.yml" if !jobs.has_key?(caret.to_sym)
+        end
+        selected_job = jobs[job.to_sym]
+        env.select_job(selected_job)
+      else
+        fail "No jobs.yml file exists in the current directory. You must specify a jobs.yml file"
+      end
+    end
+
+    # Collect all job.properties.erb files up to the root of the repo
+    # and compose them together in top-down order (i.e. deeper
+    # directories override properties in higher directories).
+    # If a direct job properties file is provided, properties will
+    # be interpolated using values in that file.
+    def compose_job_file(direct_job = nil, prefix = '')
+      if direct_job.nil?
+        pwd = Dir.pwd
+        paths = env.paths_from_root(pwd)
+        composite_jobfile = paths.inject('') { |result, path|
+          jobfile = File.expand_path('job.properties.erb', path)
+          if File.exists?(jobfile)
+            result << "\nFrom Job File '#{jobfile}':\n"
+            result << File.read(jobfile)
+          end
+          result
+        }
+        FileUtils.mkdir './.tmp' unless Dir.exists?('./.tmp')
+        composite_properties_file = File.expand_path(".tmp/runjob.properties.erb", pwd)
+        File.open(composite_properties_file, "w") do |f|
+          f.puts composite_jobfile
+        end
+        out_file = composite_properties_file.sub(/\.erb$/,'')
+        dest_file = generate_and_write_job_file(out_file, composite_properties_file, prefix)
+      else
+        raise "Job file '#{direct_job}' not found" unless File.exists?(direct_job)
+        if direct_job.end_with?('.erb')
+          out_file = "/tmp/#{File.basename(direct_job.sub(/\.erb$/,''))}"
+          dest_file = generate_and_write_job_file(out_file, direct_job, prefix)
+        else
+          dest_file = direct_job   # default: hand the plain properties file back as-is
+          out_file = append_prefix_to_filename(direct_job, prefix)
+          unless prefix.blank?
+            FileUtils.cp(direct_job, out_file)
+            dest_file = out_file
+          end
+        end
+      end
+      dest_file
+    end
+
+    def generate_and_write_job_file(file_name, in_file, prefix = '')
+      out_file = append_prefix_to_filename(file_name, prefix)
+      File.open(out_file, 'w') { |f| f.write(env.erb_load(in_file)) }
+      out_file
+    end
+
+    def append_prefix_to_filename(file_name, prefix = '')
+      insert_index = file_name.rindex(File::SEPARATOR)
+      String.new(file_name).insert((insert_index.nil? ? 0 : insert_index + 1), prefix)
+    end
+
+    def deploy_job(job, clean_deploy)
+      select_job(job)
+      fail "No deploy section for job '#{job}'." if !env.job.has_key?(:deploy)
+      if env.job[:deploy].nil?
+        fail "Nothing to deploy. Check the deploy section of your jobs.yml file"
+      else
+        env.job[:deploy].split.each { |path|
+          hdfs.put_dir File.expand_path(path, env.root), { clean: clean_deploy }
+        }
+      end
+    end
+
+    # If job references a job.properties or job.properties.erb file, that file will be
+    # used directly to interpolate job property values.
+    def run_job(job = nil, dry_run = false, file_prefix = '')
+      if job && (job =~ /job\.properties\.erb$/ || job =~ /job\.properties/)
+        jobfile = compose_job_file(job, file_prefix)
+      else
+        select_job(job)
+        jobfile = compose_job_file(nil, file_prefix)
+      end
+      unless dry_run
+        runfile = env.deploy_tmp_file(jobfile)
+        env.ssh "oozie job -oozie :oozie_url -config #{runfile} -run", echo: true, echo_cmd: true
+      end
+      jobfile
+    end
+  end
+end
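
A sketch of how these module functions are typically chained, assuming a jobs.yml in the current directory that defines a hypothetical near_messages job (passing dry_run = true composes the properties file without submitting anything to Oozie):

    require 'hodor'

    oozie = Hodor::Oozie
    oozie.deploy_job(:near_messages, true)          # push the job's deploy paths to HDFS, cleaning first
    jobfile = oozie.run_job(:near_messages, true)   # dry run: returns the composed job.properties file
    puts File.read(jobfile)

    # Inspect a job and navigate the hierarchy, as the oozie.thor tasks do
    job = oozie.job_by_id('0004567-160101000000000-oozie-oozi-C')  # hypothetical coordinator id
    oozie.change_job('..')                          # move up one level from the current job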
data/lib/hodor/api/oozie/action.rb
@@ -0,0 +1,52 @@
+module Hodor::Oozie
+
+  class Action < Job
+
+    attr_reader :parent_id, :json, :status, :error_message, :data, :transition, :external_status, :cred,
+                :type, :end_time, :external_id, :start_time, :external_child_ids, :name, :error_code,
+                :tracker_url, :retries, :to_string, :console_url
+
+    class << self
+      def default_columns
+        [:index, :id, :name, :status, :created_at, :nominal_time]
+      end
+    end
+
+    def initialize(json)
+      super()
+      @json = json
+
+      @error_message = json["errorMessage"]
+      @status = json["status"]
+      @stats = json["stats"]
+      @data = json["data"]
+      @transition = json["transition"]
+      @external_status = json["externalStatus"]
+      @cred = json["cred"]
+      @conf = json["conf"]
+      @type = json["type"]
+      @end_time = parse_time json["endTime"]
+      @external_id = json["externalId"]
+      @id = json["id"]
+      @start_time = parse_time json["startTime"]
+      @external_child_ids = json["externalChildIDs"]
+      @name = json["name"]
+      @error_code = json["errorCode"]
+      @tracker_url = json["trackerUri"]
+      @retries = json["retries"]
+      @to_string = json["toString"]
+      @console_url = json["consoleUrl"]
+      @parent_id = @id[0..@id.index('@')-1]
+    end
+
+    def expand
+      if @external_id && !@external_id.eql?('-')
+        [ oozie.job_by_id(@external_id) ]
+      else
+        nil
+      end
+    end
+
+  end
+
+end
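
For reference, a sketch of the kind of Oozie REST payload an Action is built from, with hypothetical field values (and assuming Job#parse_time tolerates the absent time fields). Note how parent_id is the workflow portion of the id, everything before the '@':

    require 'json'

    payload = JSON.parse(<<-EOS)
      { "id":         "0000042-160101000000000-oozie-oozi-W@pig-node",
        "name":       "pig-node",
        "status":     "OK",
        "externalId": "job_201601010000_0001",
        "toString":   "Action name[pig-node] status[OK]" }
    EOS

    action = Hodor::Oozie::Action.new(payload)
    action.parent_id   # => "0000042-160101000000000-oozie-oozi-W"
    action.expand      # => wraps oozie.job_by_id(externalId) in an array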