hodor 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/Guardfile +11 -0
  10. data/README.md +105 -0
  11. data/Rakefile +105 -0
  12. data/bin/hodor +18 -0
  13. data/hodor.gemspec +47 -0
  14. data/lib/config/log4r_config.xml +35 -0
  15. data/lib/hodor.rb +83 -0
  16. data/lib/hodor/api/hdfs.rb +222 -0
  17. data/lib/hodor/api/oozie.rb +215 -0
  18. data/lib/hodor/api/oozie/action.rb +52 -0
  19. data/lib/hodor/api/oozie/bundle.rb +27 -0
  20. data/lib/hodor/api/oozie/coordinator.rb +53 -0
  21. data/lib/hodor/api/oozie/hadoop_job.rb +29 -0
  22. data/lib/hodor/api/oozie/job.rb +192 -0
  23. data/lib/hodor/api/oozie/materialization.rb +56 -0
  24. data/lib/hodor/api/oozie/query.rb +115 -0
  25. data/lib/hodor/api/oozie/session.rb +170 -0
  26. data/lib/hodor/api/oozie/workflow.rb +58 -0
  27. data/lib/hodor/cli.rb +146 -0
  28. data/lib/hodor/command.rb +164 -0
  29. data/lib/hodor/configuration.rb +80 -0
  30. data/lib/hodor/environment.rb +437 -0
  31. data/lib/hodor/ui/table.rb +130 -0
  32. data/lib/hodor/version.rb +3 -0
  33. data/lib/tasks/hdfs.thor +138 -0
  34. data/lib/tasks/master.thor +61 -0
  35. data/lib/tasks/oozie.thor +399 -0
  36. data/lib/tasks/sandbox.thor +87 -0
  37. data/spec/integration/api/oozie/action_spec.rb +69 -0
  38. data/spec/integration/api/oozie/bundle_spec.rb +33 -0
  39. data/spec/integration/api/oozie/coordinator_spec.rb +66 -0
  40. data/spec/integration/api/oozie/hadoop_job_spec.rb +29 -0
  41. data/spec/integration/api/oozie/job_spec.rb +15 -0
  42. data/spec/integration/api/oozie/materialization_spec.rb +66 -0
  43. data/spec/integration/api/oozie/query_spec.rb +43 -0
  44. data/spec/integration/api/oozie/session_spec.rb +18 -0
  45. data/spec/integration/api/oozie/workflow_spec.rb +65 -0
  46. data/spec/integration/api/oozie_spec.rb +198 -0
  47. data/spec/integration/fixtures/api/running_coordinators/req_resp_00.memo +6 -0
  48. data/spec/integration/fixtures/api/sample_action/req_resp_00.memo +5 -0
  49. data/spec/integration/fixtures/api/sample_action/req_resp_01.memo +7 -0
  50. data/spec/integration/fixtures/api/sample_bundle/req_resp_00.memo +6 -0
  51. data/spec/integration/fixtures/api/sample_coordinator/req_resp_00.memo +5 -0
  52. data/spec/integration/fixtures/api/sample_materialization/req_resp_00.memo +5 -0
  53. data/spec/integration/fixtures/api/sample_materialization/req_resp_01.memo +7 -0
  54. data/spec/integration/fixtures/api/sample_workflow/req_resp_00.memo +5 -0
  55. data/spec/spec_helper.rb +92 -0
  56. data/spec/support/d_v_r.rb +125 -0
  57. data/spec/support/hodor_api.rb +15 -0
  58. data/spec/unit/hodor/api/hdfs_spec.rb +63 -0
  59. data/spec/unit/hodor/api/oozie_spec.rb +32 -0
  60. data/spec/unit/hodor/environment_spec.rb +52 -0
  61. data/topics/hdfs/corresponding_paths.txt +31 -0
  62. data/topics/hdfs/overview.txt +10 -0
  63. data/topics/master/clusters.yml.txt +36 -0
  64. data/topics/master/overview.txt +17 -0
  65. data/topics/oozie/blocking_coordinators.txt +46 -0
  66. data/topics/oozie/composing_job_properties.txt +68 -0
  67. data/topics/oozie/display_job.txt +52 -0
  68. data/topics/oozie/driver_scenarios.txt +42 -0
  69. data/topics/oozie/inspecting_jobs.txt +59 -0
  70. data/topics/oozie/jobs.yml.txt +185 -0
  71. data/topics/oozie/overview.txt +43 -0
  72. data/topics/oozie/workers_and_drivers.txt +40 -0
  73. metadata +455 -0
data/lib/hodor/api/hdfs.rb
@@ -0,0 +1,222 @@
+ require 'hodor'
+ require 'singleton'
+
+ # curl -i "http://sample_domain.com:50070/webhdfs/v1/pipeline?op=LISTSTATUS"
+ module Hodor
+
+   # HDFS API wrapper
+   class Hdfs
+     include Singleton
+
+     def env
+       Hodor::Environment.instance
+     end
+
+     def logger
+       env.logger
+     end
+
+     def hdfs_root
+       env.settings[:hdfs_root]
+     end
+
+     def pwd
+       "#{hdfs_root}#{env.pwd}"
+     end
+
+     def path_on_hdfs(file)
+       git_path = env.path_on_github(file)
+       "#{hdfs_root}/#{git_path}".sub(/\/\/\//, '/').sub(/\/\//, '/').sub(/\/\.\//, '/').sub(/\/\.$/, '')
+     end
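+     # e.g. (illustrative): with hdfs_root set to "/pipeline",
+     #   path_on_hdfs("workflows/demo/workflow.xml")
+     #   => "/pipeline/workflows/demo/workflow.xml"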
+
+     def user
+       env.settings[:hdfs_user]
+     end
+
+     def target
+       env.settings[:target]
+     end
+
+     class FailedToRemovePath < Hodor::NestedError; end
+
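+     # The rm/rm_f/rm_rf variants below mirror the hadoop CLI flags they
+     # pass (-rm, -rm -f, -rm -f -R). e.g. (illustrative):
+     #   Hodor::Hdfs.instance.rm_rf("workflows/demo")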
+     def rm(path)
+       dest_path = path_on_hdfs(path || ".")
+       rm_path_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -rm -skipTrash #{dest_path}]
+       env.ssh rm_path_script
+     rescue StandardError => ex
+       raise FailedToRemovePath.new ex,
+         msg: "Unable to remove HDFS path.",
+         ssh_user: env.ssh_user,
+         path_to_remove: dest_path
+     end
+
+     def rm_f(path)
+       dest_path = path_on_hdfs(path || ".")
+       rm_path_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -rm -f -skipTrash #{dest_path}]
+       env.ssh rm_path_script
+     rescue StandardError => ex
+       raise FailedToRemovePath.new ex,
+         msg: "Unable to remove HDFS path.",
+         ssh_user: env.ssh_user,
+         path_to_remove: dest_path
+     end
+
+     def rm_rf(path)
+       hdfs_path = path_on_hdfs(path || ".")
+       rm_path_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -rm -f -R -skipTrash #{hdfs_path}]
+       env.ssh rm_path_script
+     rescue StandardError => ex
+       raise FailedToRemovePath.new ex,
+         msg: "Unable to remove HDFS path.",
+         ssh_user: env.ssh_user,
+         path_to_remove: hdfs_path
+     end
+
+     def ls
+       dest_path = path_on_hdfs(".")
+       ls_script = %Q[HADOOP_USER_NAME=#{user} hadoop fs -ls #{dest_path}]
+       env.ssh ls_script, echo: true
+     rescue StandardError => ex
+       raise FailedToRemovePath.new ex,
+         msg: "Unable to list HDFS path.",
+         ssh_user: env.ssh_user,
+         path_to_list: dest_path
+     end
+
+     class FailedToPutFile < Hodor::NestedError; end
+
+     # put_file
+     # Puts a local file on HDFS, preserving its repo-relative path and
+     # replacing any existing copy. Files with an .erb extension are
+     # ERB-expanded before deployment.
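+     # e.g. (illustrative):
+     #   Hodor::Hdfs.instance.put_file("workflows/demo/workflow.xml.erb")
+     #   # expands the ERB locally, then streams the result over ssh into HDFS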
+     def put_file(file, options = {})
+       disc_path = env.path_on_disc(file)
+       hdfs_path = path_on_hdfs(file)
+       git_path = env.path_on_github(file)
+
+       raise "File '#{disc_path}' not found." if !File.exists?(disc_path)
+
+       logger.info "\tdeploying '#{git_path}'"
+
+       src_file = file
+       if disc_path.end_with?('.erb')
+         erb_expanded = env.erb_load(disc_path)
+         src_file = "/tmp/#{File.basename(disc_path.sub(/\.erb$/, ''))}"
+         hdfs_path.sub!(/\.erb$/, '')
+         logger.debug "ERB source expanded to '#{src_file}'"
+         File.open(src_file, 'w') { |f| f.write(erb_expanded) }
+       end
+
+       raise "File '#{src_file}' not found." if !File.exists?(src_file)
+
+       put_script = "HADOOP_USER_NAME=#{user} hadoop fs -put - #{hdfs_path}"
+       unless options[:already_cleaned]
+         rm_script = "HADOOP_USER_NAME=#{user} hadoop fs -rm -f #{hdfs_path}; "
+         put_script = rm_script + put_script
+       end
+
+       env.run_local %Q[cat #{src_file} | ssh #{env.ssh_addr} "#{put_script}"],
+         echo: true, echo_cmd: true
+     rescue StandardError => ex
+       raise FailedToPutFile.new ex,
+         msg: "Unable to write file to HDFS.",
+         ssh_user: env.ssh_user,
+         path_on_disc: disc_path,
+         path_on_github: git_path,
+         path_on_hdfs: hdfs_path,
+         src_file: src_file
+     end
+
+     class FailedToPutDir < Hodor::NestedError; end
+
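+     # Deploys a directory tree incrementally: a ".hdfs-<target>.sync" marker
+     # file records the last deploy, and only files newer than it are re-put.
+     # e.g. (illustrative): Hodor::Hdfs.instance.put_dir("workflows", maxdepth: 2)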
+     def put_dir(path, options)
+       if env.dryrun? and env.verbose?
+         logger.info ""
+         logger.info " ********************* Dry Run *********************"
+         logger.info ""
+       end
+
+       disc_path = env.path_on_disc(path)
+       git_path = env.path_on_github(path)
+       hdfs_path = path_on_hdfs(path)
+
+       sync_file = "#{disc_path}/.hdfs-#{target}.sync"
+
+       logger.info "Deploying: #{git_path}" unless env.silent?
+
+       fail "Path '#{disc_path}' not found." unless File.exists?(disc_path)
+       fail "Path '#{disc_path}' exists but is not a directory." unless File.directory?(disc_path)
+
+       if env.clean?
+         logger.info " cleaning: #{git_path}"
+         FileUtils.rm_f sync_file unless env.dryrun?
+         rm_rf(git_path)
+         clean_done = true
+       end
+
+       fargs = if sync_file && File.exists?(sync_file) && !env.clean?
+         "-newer '#{sync_file}'"
+       else
+         ""
+       end
+       fargs << " -maxdepth #{options[:maxdepth]}" unless options[:maxdepth].nil?
+       mod_files = env.run_local %Q[find #{disc_path} #{fargs} -type f]
+       mod_files.split("\n").each { |file|
+         basename = File.basename(file)
+         next if basename.start_with?('job.properties') ||
+                 basename.eql?("run.properties") ||
+                 basename.eql?(".DS_Store") ||
+                 basename.eql?(".bak") ||
+                 basename.eql?(".tmp") ||
+                 basename.eql?(".hdfs") ||
+                 basename.eql?("Rakefile") ||
+                 basename.end_with?(".sync") ||
+                 file.include?("migrations/") ||
+                 file.include?(".bak/") ||
+                 file.include?(".tmp/")
+         put_file(file, already_cleaned: clean_done)
+       }
+     rescue StandardError => ex
+       raise FailedToPutDir.new ex,
+         msg: "Unable to write directory to HDFS.",
+         ssh_user: env.ssh_user,
+         path_on_disc: disc_path,
+         path_on_github: git_path,
+         path_on_hdfs: hdfs_path,
+         sync_file: sync_file,
+         max_depth: options[:maxdepth],
+         clean: env.clean? ? "true" : "false"
+     else
+       env.run_local %Q[touch '#{sync_file}'] unless env.dryrun?
+     end
+
+     class FailedToGetFile < Hodor::NestedError; end
+
+     # get_file
+     # Gets a file from HDFS and copies it to a local file.
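+     # e.g. (illustrative):
+     #   Hodor::Hdfs.instance.get_file("workflows/demo/workflow.xml")
+     #   # fetches the HDFS copy into ./workflow.xml.hdfs_copy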
+     def get_file(file, options = {})
+       disc_path = env.path_on_disc(file)
+       hdfs_path = path_on_hdfs(file)
+       git_path = env.path_on_github(file)
+       dest_path = "#{file}.hdfs_copy"
+
+       logger.info "\tgetting '#{git_path}'. Writing to '#{dest_path}'."
+
+       get_script = %Q["rm -f #{dest_path}; HADOOP_USER_NAME=#{user} hadoop fs -get #{hdfs_path} #{dest_path}"]
+       env.ssh get_script, echo: true, echo_cmd: true
+       if options[:clobber]
+         FileUtils.rm_f dest_path
+       end
+       env.run_local %Q[scp #{env.ssh_user}@#{env[:ssh_host]}:#{dest_path} .],
+         echo: true, echo_cmd: true
+     rescue StandardError => ex
+       raise FailedToGetFile.new ex,
+         msg: "Unable to get file from HDFS.",
+         ssh_user: env.ssh_user,
+         path_on_disc: disc_path,
+         path_on_github: git_path,
+         path_on_hdfs: hdfs_path,
+         dest_file: dest_path
+     end
+   end
+ end
data/lib/hodor/api/oozie.rb
@@ -0,0 +1,215 @@
+ require_relative "oozie/job"
+ require_relative "oozie/query"
+ require_relative "oozie/session"
+ require_relative "oozie/bundle"
+ require_relative "oozie/coordinator"
+ require_relative "oozie/materialization"
+ require_relative "oozie/workflow"
+ require_relative "oozie/action"
+ require_relative "oozie/hadoop_job"
+ require_relative "hdfs"
+
+ module Hodor::Oozie
+
+   class << self
+
+     def env
+       Hodor::Environment.instance
+     end
+
+     def session
+       Hodor::Oozie::Session.instance
+     end
+
+     def hdfs
+       Hodor::Hdfs.instance
+     end
+
+     def logger
+       env.logger
+     end
+
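+     # e.g. (illustrative): build_rest_param([:killed, :succeeded])
+     #   => "filter=status%3DKILLED;status%3DSUCCEEDED"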
+     def build_rest_param(filter)
+       params = filter.map { |match|
+         case match
+         when :killed; "status%3DKILLED"
+         when :succeeded; "status%3DSUCCEEDED"
+         end
+       }.compact
+
+       if params.size > 0
+         "filter=#{params.join(';')}"
+       else
+         ""
+       end
+     end
+
+     def job_by_id(job_id, filter = nil)
+       if job_id.nil?
+         result = Hodor::Oozie::Query.new status: [:running_first]
+       else
+         if job_id =~ /job_\d+/
+           result = HadoopJob.new(session.current_id, job_id)
+         else
+           if filter
+             response = session.get_job_state(job_id, build_rest_param(filter))
+           else
+             response = session.get_job_state(job_id)
+           end
+           json = JSON.parse(response)
+           job_type = json["toString"]
+           case job_type.split(" ").first.downcase.to_sym
+           when :bundle
+             result = Bundle.new(json)
+           when :coordinator
+             result = Coordinator.new(json)
+           when :workflow
+             result = Workflow.new(json)
+           when :action
+             result = Action.new(json)
+           when :coordinatoraction
+             result = Materialization.new(json)
+           else
+           end
+         end
+       end
+     end
+
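+     # Resolves a job "path" the way a shell resolves directories, treating
+     # the Oozie job hierarchy as a filesystem. e.g. (illustrative):
+     #   job_by_path("..")   # up to the parent job
+     #   job_by_path("3")    # down into child #3 of the current job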
+     def job_by_path(job_path, make_current = false, filter = nil)
+       if job_path.nil? || job_path.eql?(".")
+         movement = :none
+       elsif job_path.eql?("/")
+         movement = :root
+       elsif job_path.eql?("b") || job_path.eql?("back") ||
+             job_path.eql?("u") || job_path.eql?("up") || job_path.eql?("..")
+         movement = :up
+       elsif job_path.eql?("d") || job_path.eql?("down") ||
+             job_path.eql?("f") || job_path.eql?("forward") || job_path.length < 5
+         movement = :down
+       else
+         movement = :jump
+       end
+
+       job_id = session.job_relative(movement, job_path)
+       job = job_by_id(job_id, filter)
+       session.make_current(job) if make_current
+       job
+     end
+
+     def change_job(job_path, filter = nil)
+       job_by_path(job_path, true, filter)
+     end
+
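+     # Selects a job entry from the jobs.yml in the current directory. A key
+     # prefixed with '^' marks the default job, e.g. (illustrative):
+     #   ^nightly_agg:
+     #     deploy: workflows/nightly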
+     def select_job(job)
+       # load jobs.yml file
+       pwd = Dir.pwd
+       if File.exists?("jobs.yml")
+         jobs = env.yml_load(File.expand_path('jobs.yml', pwd))
+         marked_jobs = jobs.keys.select { |key| key.start_with?('^') }
+         marked_jobs.each { |mark|
+           jobs[mark[1..-1]] = jobs[mark]
+         }
+         if job.nil?
+           # No job explicitly specified, so look for a
+           # marked job (i.e. a job starting with ^)
+           jobs.each_pair { |key, val|
+             if key.to_s.strip.start_with?('^')
+               job = key.to_s
+             end
+           }
+           fail "You must specify which job from jobs.yml to run" if !job
+         end
+         jobs = jobs.symbolize_keys
+         if !jobs.has_key?(job.to_sym)
+           caret = "^#{job.to_s}"
+           fail "Job '#{job}' was not defined in jobs.yml" if !jobs.has_key?(caret.to_sym)
+         end
+         selected_job = jobs[job.to_sym]
+         env.select_job(selected_job)
+       else
+         fail "No jobs.yml file exists in the current directory. You must specify a jobs.yml file"
+       end
+     end
+
+     # Collect all job.properties.erb files from the repo root down to the
+     # current directory and compose them in top-down order (i.e. deeper
+     # directories override properties set in higher directories). If a
+     # direct job properties file is provided, properties are interpolated
+     # using values in that file instead.
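+     # e.g. (illustrative): running from repo_root/workflows/demo composes
+     #   repo_root/job.properties.erb
+     #   repo_root/workflows/job.properties.erb
+     #   repo_root/workflows/demo/job.properties.erb
+     # into .tmp/runjob.properties.erb, then ERB-expands the result.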
+     def compose_job_file(direct_job = nil, prefix = '')
+       if direct_job.nil?
+         pwd = Dir.pwd
+         paths = env.paths_from_root(pwd)
+         composite_jobfile = paths.inject('') { |result, path|
+           jobfile = File.expand_path('job.properties.erb', path)
+           if File.exists?(jobfile)
+             result << "\nFrom Job File '#{jobfile}':\n"
+             result << File.read(jobfile)
+           end
+           result
+         }
+         FileUtils.mkdir './.tmp' unless Dir.exists?('./.tmp')
+         composite_properties_file = File.expand_path(".tmp/runjob.properties.erb", pwd)
+         File.open(composite_properties_file, "w") do |f|
+           f.puts composite_jobfile
+         end
+         out_file = composite_properties_file.sub(/\.erb$/, '')
+         dest_file = generate_and_write_job_file(out_file, composite_properties_file, prefix)
+       else
+         raise "Job file '#{direct_job}' not found" unless File.exists?(direct_job)
+         if direct_job.end_with?('.erb')
+           out_file = "/tmp/#{File.basename(direct_job.sub(/\.erb$/, ''))}"
+           dest_file = generate_and_write_job_file(out_file, direct_job, prefix)
+         else
+           dest_file = direct_job
+           out_file = append_prefix_to_filename(direct_job, prefix)
+           unless prefix.blank?
+             FileUtils.cp(direct_job, out_file)
+             dest_file = out_file
+           end
+         end
+       end
+       dest_file
+     end
+
+     def generate_and_write_job_file(file_name, in_file, prefix = '')
+       out_file = append_prefix_to_filename(file_name, prefix)
+       File.open(out_file, 'w') { |f| f.write(env.erb_load(in_file)) }
+       out_file
+     end
+
+     def append_prefix_to_filename(file_name, prefix = '')
+       insert_index = file_name.rindex(File::SEPARATOR)
+       String.new(file_name).insert((insert_index.nil? ? 0 : insert_index + 1), prefix)
+     end
+
+     def deploy_job(job, clean_deploy)
+       select_job(job)
+       fail "No deploy section for job '#{job}'." if !env.job.has_key?("deploy")
+       if env.job[:deploy].nil?
+         fail "Nothing to deploy. Check the deploy section of your jobs.yml file"
+       else
+         env.job[:deploy].split.each { |path|
+           hdfs.put_dir File.expand_path(path, env.root), { clean: clean_deploy }
+         }
+       end
+     end
+
+     # If job references a job.properties or job.properties.erb file, that
+     # file will be used directly to interpolate job property values.
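+     # e.g. (illustrative):
+     #   Hodor::Oozie.run_job("nightly_agg")             # look up in jobs.yml
+     #   Hodor::Oozie.run_job("demo/job.properties.erb") # use this file directly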
+     def run_job(job = nil, dry_run = false, file_prefix = '')
+       if job && job =~ /job\.properties(\.erb)?$/
+         jobfile = compose_job_file(job, file_prefix)
+       else
+         select_job(job)
+         jobfile = compose_job_file(nil, file_prefix)
+       end
+       unless dry_run
+         runfile = env.deploy_tmp_file(jobfile)
+         env.ssh "oozie job -oozie :oozie_url -config #{runfile} -run", echo: true, echo_cmd: true
+       end
+       jobfile
+     end
+   end
+ end
data/lib/hodor/api/oozie/action.rb
@@ -0,0 +1,52 @@
+ module Hodor::Oozie
+
+   class Action < Job
+
+     attr_reader :parent_id, :json, :status, :error_message, :data, :transition, :external_status, :cred,
+                 :type, :end_time, :external_id, :start_time, :external_child_ids, :name, :error_code,
+                 :tracker_url, :retries, :to_string, :console_url
+
+     class << self
+       def default_columns
+         [:index, :id, :name, :status, :created_at, :nominal_time]
+       end
+     end
+
+     def initialize(json)
+       super()
+       @json = json
+
+       @error_message = json["errorMessage"]
+       @status = json["status"]
+       @stats = json["stats"]
+       @data = json["data"]
+       @transition = json["transition"]
+       @external_status = json["externalStatus"]
+       @cred = json["cred"]
+       @conf = json["conf"]
+       @type = json["type"]
+       @end_time = parse_time json["endTime"]
+       @external_id = json["externalId"]
+       @id = json["id"]
+       @start_time = parse_time json["startTime"]
+       @external_child_ids = json["externalChildIDs"]
+       @name = json["name"]
+       @error_code = json["errorCode"]
+       @tracker_url = json["trackerUri"]
+       @retries = json["retries"]
+       @to_string = json["toString"]
+       @console_url = json["consoleUrl"]
+       @parent_id = @id[0..@id.index('@')-1]
+     end
+
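+     # Action ids embed their parent workflow id (e.g. "0000123-456@step1",
+     # illustrative), so everything before the '@' is the parent's id. The
+     # expand method follows external_id down to the job this action launched.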
+     def expand
+       if @external_id && !@external_id.eql?('-')
+         [ oozie.job_by_id(@external_id) ]
+       else
+         nil
+       end
+     end
+
+   end
+
+ end