hodor 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/Guardfile +11 -0
- data/README.md +105 -0
- data/Rakefile +105 -0
- data/bin/hodor +18 -0
- data/hodor.gemspec +47 -0
- data/lib/config/log4r_config.xml +35 -0
- data/lib/hodor.rb +83 -0
- data/lib/hodor/api/hdfs.rb +222 -0
- data/lib/hodor/api/oozie.rb +215 -0
- data/lib/hodor/api/oozie/action.rb +52 -0
- data/lib/hodor/api/oozie/bundle.rb +27 -0
- data/lib/hodor/api/oozie/coordinator.rb +53 -0
- data/lib/hodor/api/oozie/hadoop_job.rb +29 -0
- data/lib/hodor/api/oozie/job.rb +192 -0
- data/lib/hodor/api/oozie/materialization.rb +56 -0
- data/lib/hodor/api/oozie/query.rb +115 -0
- data/lib/hodor/api/oozie/session.rb +170 -0
- data/lib/hodor/api/oozie/workflow.rb +58 -0
- data/lib/hodor/cli.rb +146 -0
- data/lib/hodor/command.rb +164 -0
- data/lib/hodor/configuration.rb +80 -0
- data/lib/hodor/environment.rb +437 -0
- data/lib/hodor/ui/table.rb +130 -0
- data/lib/hodor/version.rb +3 -0
- data/lib/tasks/hdfs.thor +138 -0
- data/lib/tasks/master.thor +61 -0
- data/lib/tasks/oozie.thor +399 -0
- data/lib/tasks/sandbox.thor +87 -0
- data/spec/integration/api/oozie/action_spec.rb +69 -0
- data/spec/integration/api/oozie/bundle_spec.rb +33 -0
- data/spec/integration/api/oozie/coordinator_spec.rb +66 -0
- data/spec/integration/api/oozie/hadoop_job_spec.rb +29 -0
- data/spec/integration/api/oozie/job_spec.rb +15 -0
- data/spec/integration/api/oozie/materialization_spec.rb +66 -0
- data/spec/integration/api/oozie/query_spec.rb +43 -0
- data/spec/integration/api/oozie/session_spec.rb +18 -0
- data/spec/integration/api/oozie/workflow_spec.rb +65 -0
- data/spec/integration/api/oozie_spec.rb +198 -0
- data/spec/integration/fixtures/api/running_coordinators/req_resp_00.memo +6 -0
- data/spec/integration/fixtures/api/sample_action/req_resp_00.memo +5 -0
- data/spec/integration/fixtures/api/sample_action/req_resp_01.memo +7 -0
- data/spec/integration/fixtures/api/sample_bundle/req_resp_00.memo +6 -0
- data/spec/integration/fixtures/api/sample_coordinator/req_resp_00.memo +5 -0
- data/spec/integration/fixtures/api/sample_materialization/req_resp_00.memo +5 -0
- data/spec/integration/fixtures/api/sample_materialization/req_resp_01.memo +7 -0
- data/spec/integration/fixtures/api/sample_workflow/req_resp_00.memo +5 -0
- data/spec/spec_helper.rb +92 -0
- data/spec/support/d_v_r.rb +125 -0
- data/spec/support/hodor_api.rb +15 -0
- data/spec/unit/hodor/api/hdfs_spec.rb +63 -0
- data/spec/unit/hodor/api/oozie_spec.rb +32 -0
- data/spec/unit/hodor/environment_spec.rb +52 -0
- data/topics/hdfs/corresponding_paths.txt +31 -0
- data/topics/hdfs/overview.txt +10 -0
- data/topics/master/clusters.yml.txt +36 -0
- data/topics/master/overview.txt +17 -0
- data/topics/oozie/blocking_coordinators.txt +46 -0
- data/topics/oozie/composing_job_properties.txt +68 -0
- data/topics/oozie/display_job.txt +52 -0
- data/topics/oozie/driver_scenarios.txt +42 -0
- data/topics/oozie/inspecting_jobs.txt +59 -0
- data/topics/oozie/jobs.yml.txt +185 -0
- data/topics/oozie/overview.txt +43 -0
- data/topics/oozie/workers_and_drivers.txt +40 -0
- metadata +455 -0
require 'hodor/api/oozie/session'

# Shared RSpec context for Hodor API specs.
#
# Exposes the three singletons/modules the integration specs talk to
# (env, session, oozie) and wires up a DVR record/playback memo for any
# example group that declares the :scenario / :playback / :record methods.
shared_context "hodor api" do

  # @memo holds the DVR instance (request/response recorder), when enabled.
  attr_reader :memo

  subject(:env) { ::Hodor::Environment.instance }
  subject(:session) { ::Hodor::Oozie::Session.instance }
  subject(:oozie) { ::Hodor::Oozie }

  before(:each) do
    # Only attach a DVR when the example group opted into record/playback.
    dvr_hooks = %i[scenario playback record]
    @memo = DVR.new(self) if (methods & dvr_hooks).any?
  end

end
require 'hodor/api/hdfs'

module Hodor

  # Unit specs for the Hdfs singleton: checks the public interface exists,
  # then exercises local-path -> HDFS-path mapping and the construction of
  # the ssh command line used to upload files.
  describe Hdfs do

    describe "Required Public Interface" do

      # .instance instead of .new necessitated by singleton:
      subject(:hdfs_methods) { Hodor::Hdfs.instance_methods }

      # Public methods
      it { should include :pwd }
      it { should include :path_on_hdfs }

    end

    context "test local to hdfs path operations" do

      before(:each) do
        use_settings hdfs_root: "/", hdfs_user: "hdfs"
        use_pwd "company/workers/noop", false
      end

      context "ensure pwd maps correctly between file systems" do

        subject(:hdfs) { Hodor::Hdfs.instance }

        it "should correctly map test repo path to HDFS path" do
          expect(hdfs.pwd).to match(/\/company\/workers\/noop/)
        end
      end

      context "test putting file to HDFS" do

        subject(:env) { Hodor::Environment.instance }
        subject(:hdfs) { Hodor::Hdfs.instance }

        it "should successfully construct ssh commandline to put file to HDFS" do
          # put_file apparently probes the local file before uploading, hence
          # the stubbed File.exists? — TODO confirm against hdfs.rb.
          expect(File).to receive(:exists?).twice { true }
          expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
          hdfs.put_file("workflow.xml")
        end
      end

      context "test putting directory to HDFS" do

        subject(:env) { Hodor::Environment.instance }
        subject(:hdfs) { Hodor::Hdfs.instance }

        # FIXME(review): this example is a byte-for-byte duplicate of the
        # "put file" spec above — it still calls hdfs.put_file on a single
        # file and asserts the identical command line, so no directory
        # upload is ever exercised. Confirm intent and either delete it or
        # rewrite it against the directory-upload API.
        it "should successfully construct ssh commandline to put directory to HDFS" do
          expect(File).to receive(:exists?).twice { true }
          expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
          hdfs.put_file("workflow.xml")
        end
      end

    end

  end

end
module Hodor
  # Unit specs for the Hodor::Oozie module facade.
  describe Oozie do
    describe 'Required Public Interface' do
      subject { Hodor::Oozie }

      # Public methods.
      # FIX: the original one-liners were `should respond_to? :name`, which
      # invokes Ruby's respond_to? predicate on the example itself (not the
      # subject) and hands its boolean to `should` — not a matcher, so these
      # examples could never pass. The RSpec matcher is `respond_to`.
      it { should respond_to(:job_by_id) }
      it { should respond_to(:job_by_path) }
      it { should respond_to(:change_job) }
      it { should respond_to(:compose_job_file) }
      it { should respond_to(:run_job) }
    end

    context 'Filename prefixes' do
      let(:prefix) { 'Test_prefix_' }
      let(:full_path) { 'foo/foo/foo' }
      let(:just_name_path) { 'foo' }
      let(:correctly_prefixed) { 'foo/foo/Test_prefix_foo' }

      it 'appends a supplied prefix to the file name' do
        expect(subject.append_prefix_to_filename(full_path, prefix)).to eq(correctly_prefixed)
      end

      it 'appends a supplied prefix to a simple file name' do
        expect(subject.append_prefix_to_filename(just_name_path, prefix)).to eq(prefix + just_name_path)
      end

      it 'keeps original filename if no prefix supplied' do
        expect(subject.append_prefix_to_filename(full_path)).to eq(full_path)
      end
    end
  end
end
require 'hodor/environment'

module Hodor

  # Unit specs for the Environment singleton: interface presence, test-repo
  # root resolution, and path decomposition from the repo root.
  describe Environment do

    describe "Required Public Interface" do

      # .instance instead of .new necessitated by singleton:
      subject(:hadoop_env) { Hodor::Environment.instance_methods }

      # Public fields
      it { should include :logger }

      # Public methods
      it { should include :erb_sub }
      it { should include :erb_load }
      it { should include :yml_load }
      it { should include :root }
    end

    describe "Ensure usable test repo" do

      # .instance instead of .new necessitated by singleton:
      subject(:env) { Hodor::Environment.instance }

      it "should have correct root" do
        expect(subject.root).to match(/spec\/test_repo/)
      end
    end

    context "Test basic environment methods" do

      subject(:env) { Hodor::Environment.instance }

      before(:each) do
        use_settings hdfs_root: "/", hdfs_user: "hdfs"
        use_pwd "drivers/testbench"
      end

      # NOTE(review): the description says "should fail if no jobs.yml file
      # exists" but the body only asserts the decomposed path list — no
      # failure path is exercised. Confirm intent and rename or extend.
      it "should fail if no jobs.yml file exists" do
        actual_paths = env.paths_from_root(Dir.pwd)
        expected_paths = [
          /spec\/test_repo/,
          /spec\/test_repo\/drivers/,
          /spec\/test_repo\/drivers\/testbench/
        ]
        expect(actual_paths).to match_array(expected_paths)
      end
    end
  end
end
@@ -0,0 +1,31 @@
|
|
1
|
+
HDFS Corresponding Paths
|
2
|
+
--------------------------------------------------------------------------------------------
|
3
|
+
There are two methods for running Hdfs commands using this namespace. First, you can run the
|
4
|
+
"hdfs:fs" pass through command, as follows:
|
5
|
+
|
6
|
+
$ hodor hdfs:fs -ls /big_data/pipeline/drivers/testbench
|
7
|
+
|
8
|
+
Because "fs" is a pass-through command, it passes all of its arguments and options through
|
9
|
+
as-is to the "hadoop fs" command line tool on the remote host. The "fs" command does no
|
10
|
+
argument processing locally, it simply passes them via ssh for handling by the remote tool.
|
11
|
+
Alternatively, you could accomplish the same directory listing as the above command, using
|
12
|
+
the 'hdfs:ls' command:
|
13
|
+
|
14
|
+
$ cd /big_data/pipeline/drivers/testbench
|
15
|
+
$ hodor hdfs:ls
|
16
|
+
|
17
|
+
The "ls" command uses your current local path to calculate the "corresponding path" on the
|
18
|
+
remote HDFS volume, and lists the contents of that directory. The "corresponding path"
|
19
|
+
on the remote HDFS volume is automatically determined by the Hdfs namespace and used by
|
20
|
+
each of its commands.
|
21
|
+
|
22
|
+
This HDFS path inference from your local path is done by all commands in the Hdfs namespace,
|
23
|
+
except for the "fs" pass-through. For example, to upload a file in your current local directory
|
24
|
+
to the corresponding path on the remote HDFS volume:
|
25
|
+
|
26
|
+
$ cd /big_data/pipeline/drivers/
|
27
|
+
$ hodor hdfs:put ingestion.xml
|
28
|
+
|
29
|
+
Note: corresponding path is determined by first calculating your local path relative to
|
30
|
+
the root of your git repo, known as the "repo relative path". Next, the repo relative
|
31
|
+
path is appended to the HDFS root that you configure in your 'clusters.yml' file.
|
@@ -0,0 +1,10 @@
|
|
1
|
+
Hodor's Hdfs namespace assembles a command set that operates on a remote HDFS volume. Instead
|
2
|
+
of ssh'ing to the remote Hadoop master, and running 'hadoop fs <command>' directly on the
|
3
|
+
remote host, you run commands from Hodor's Hdfs namespace locally.
|
4
|
+
|
5
|
+
HELP TOPICS:
|
6
|
+
------------
|
7
|
+
* Local vs HDFS paths - to view help about HDFS corresponding paths, type:
|
8
|
+
$ hodor hdfs:topic corresponding_paths
|
9
|
+
|
10
|
+
\x5
|
@@ -0,0 +1,36 @@
|
|
1
|
+
The Clusters.yml File
|
2
|
+
---------------------------------------------------------------------------------------------------
|
3
|
+
The "clusters.yml" file is a file Hodor reads at startup that allows you to define multiple hadoop
|
4
|
+
clusters, and the url and resource bindings they require, so that your Hodor session can be easily
|
5
|
+
switched between multiple available Hadoop clusters. Hodor's namespaces and commands use the url
|
6
|
+
and resource bindings from clusters.yml to select the destination for its various SSH and REST
|
7
|
+
calls. The clusters.yml file is expected in your hadoop project git repo at the path:
|
8
|
+
|
9
|
+
"<repo_root>/config/clusters.yml"
|
10
|
+
|
11
|
+
The clusters.yml file will typically look something like the following:
|
12
|
+
|
13
|
+
:production:
|
14
|
+
:nameNode: hdfs://hadoop-prod.mycompany.com:8020
|
15
|
+
:jobTracker: hadoop-prod.mycompany.com:8050
|
16
|
+
:oozie_url: http://hadoop-prod.mycompany.com:11000/oozie
|
17
|
+
|
18
|
+
:staging:
|
19
|
+
:nameNode: hdfs://hadoop-stage.mycompany.com:8020
|
20
|
+
:jobTracker: hadoop-stage.mycompany.com:8050
|
21
|
+
:oozie_url: http://hadoop-stage.mycompany.com:11000/oozie
|
22
|
+
|
23
|
+
The section (staging or production) that Hodor uses (e.g. the "target cluster") is configured using:
|
24
|
+
|
25
|
+
$ export HADOOP_ENV=production
|
26
|
+
|
27
|
+
To display a list of all key/value pairs that are defined for the current target cluster:
|
28
|
+
|
29
|
+
$ hodor master:config
|
30
|
+
|
31
|
+
To view more information about how the clusters.yml keys and values can be used to
|
32
|
+
parameterize your job properties, type:
|
33
|
+
|
34
|
+
$ hodor oozie:topic compose_job_properties
|
35
|
+
|
36
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
Hodor's Master namespace functions as a local proxy for the master (e.g. "Name node") of your target
|
3
|
+
Hadoop cluster. The commands in this namespace either operate directly on the master node or
|
4
|
+
provide information about the cluster the master is master of. For example, to run a Linux command
|
5
|
+
on the master node:
|
6
|
+
\xt $ hodor master:exec ps -ef | grep java
|
7
|
+
|
8
|
+
Part of the value of this namespace is that it allows you to operate on the master node, whatever that
|
9
|
+
may be. For example, regardless of whether you are targeting a staging or production hadoop cluster
|
10
|
+
(as configured in your "clusters.yml" file), the 'master:exec' command will route appropriately.
|
11
|
+
|
12
|
+
HELP TOPICS:
|
13
|
+
------------
|
14
|
+
* The clusters.yml file - to view help about the clusters.yml, type:
|
15
|
+
$ hodor master:topic clusters.yml
|
16
|
+
|
17
|
+
\x5
|
@@ -0,0 +1,46 @@
|
|
1
|
+
Blocking Coordinators
|
2
|
+
=====================
|
3
|
+
Oozie's coordinators can block on 2 things: the clock and data availability.
|
4
|
+
A coordinator that blocks on both the clock and data availability is called
|
5
|
+
a "blocking coordinator" because it will not run until both the time and the
|
6
|
+
input data it is waiting on arrive. The run.properties file allows you to
|
7
|
+
choose either a blocking coordinator or a non-blocking coordinator to pair
|
8
|
+
with your driver workflow. To specify a blocking coordinator, you will set
|
9
|
+
the following property values:
|
10
|
+
|
11
|
+
oozie.coord.application.path=${PWD}/coordinator-1-input.xml
|
12
|
+
|
13
|
+
This property value setting specifies that you want to use a blocking
|
14
|
+
coordinator that has 1 data input that it waits on. There is also a
|
15
|
+
similarly named 2-input coordinator if you require that. It is not sufficient
|
16
|
+
to specify that your coordinator should block on 1 input, you must also
|
17
|
+
specify which input that is. To do that, you'll use the "input_1" variable.
|
18
|
+
For example:
|
19
|
+
|
20
|
+
input_1=${service_data}
|
21
|
+
|
22
|
+
The "input_1" variable is expanded by the 1-input coordinator to specify
|
23
|
+
which input dataset, the coordinator monitors for data availability flags.
|
24
|
+
Input 1 is set to the variable ${service_data}, which is defined in the
|
25
|
+
shared (higher level) job.properties.erb file as follows:
|
26
|
+
|
27
|
+
service_data=${shared}/ready/service_data
|
28
|
+
|
29
|
+
By setting the input_1 and coordinator path variables above in the
|
30
|
+
run.properties file, the rake task will launch the corresponding coordinator
|
31
|
+
from the scenarios directory. The subdirectory with scenarios is chosen
|
32
|
+
according to your scenario variable assignment:
|
33
|
+
|
34
|
+
scenario: hourly/incremental
|
35
|
+
|
36
|
+
With this assignment, the 1-input blocking coordinator located in the
|
37
|
+
hourly/incremental subdirectory of scenarios will be launched. It will monitor
|
38
|
+
the service_data dataset, and when new data arrives, it will run the
|
39
|
+
workflow indicated by your assignment to the "driver" variable:
|
40
|
+
|
41
|
+
driver=periodic/analyze_campaign
|
42
|
+
|
43
|
+
The approach treats the frequency, data dependencies, run contexts and
|
44
|
+
workflow selections as distinct concerns and allows you to mix
|
45
|
+
and match them via run.properties settings as the situation warrants.
|
46
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
Job.property Files
|
2
|
+
-----------------------------------------------------------------------------------------------
|
3
|
+
Before you can run a job, you must first specify the properties it should receive. This is
|
4
|
+
done via a job.properties file. Hodor's Oozie namespace enables you to assemble a hierarchy
|
5
|
+
of job.property.erb files into a single job.property file suitable for execution of an
|
6
|
+
Oozie job. The following directory structure shows several Job.properties.erb files at
|
7
|
+
different locations within the hierarchy:
|
8
|
+
|
9
|
+
<git_repo>
|
10
|
+
job.properties.erb
|
11
|
+
<drivers/>
|
12
|
+
job.properties.erb
|
13
|
+
<billing>
|
14
|
+
job.properties.erb
|
15
|
+
<campaigns>
|
16
|
+
job.properties.erb
|
17
|
+
<diagnostic>
|
18
|
+
job.properties.erb
|
19
|
+
|
20
|
+
For example, running a workflow contained in the <campaigns> directory, will pull in three
|
21
|
+
job.properties.erb files. First, the one located at the root, followed by the one located in
|
22
|
+
<drivers>, followed by the one located in <campaigns>. Each successive job.properties.erb
|
23
|
+
file overrides the properties of its parents located higher in the directory structure.
|
24
|
+
|
25
|
+
A typical job.properties file might look like the following:
|
26
|
+
|
27
|
+
nameNode=<%= env[:nameNode] %>
|
28
|
+
jobTracker=<%= env[:jobTracker] %>
|
29
|
+
oozie_url=<%= env[:oozie_url] %>
|
30
|
+
|
31
|
+
Each line in the above job.property file sample represents the coming together of two
|
32
|
+
distinct property systems, Java's "Velocity" property system and Ruby's "ERB" property
|
33
|
+
system:
|
34
|
+
|
35
|
+
<Hadoop Property> = <Ruby ERB expansion>
|
36
|
+
|
37
|
+
The key on the left (i.e. nameNode) is a Hadoop Java property that is referenced and expanded
|
38
|
+
by workflows and coordinators at runtime on the Hadoop cluster. The value on the right
|
39
|
+
(i.e. env[:nameNode])is a Ruby ERB expansion, taken from the clusters.yml file that is
|
40
|
+
replaced _on your local machine_ prior to deploying the workflows and coordinators to HDFS.
|
41
|
+
In other words, Java variables are expanded post-deployment. ERB variables are expanded
|
42
|
+
pre-deployment. ERB expansions within a job.properties file provide a means of passing a
|
43
|
+
value from ERB to Velocity. The "<%= env[:nameNode] %>" part of the above
|
44
|
+
job.properties.erb sample is replaced by the ":nameNode" key value (from clusters.yml)
|
45
|
+
for the currently targeted Hadoop cluster. This variable substitution happens when you
|
46
|
+
run the "oozie:run_job" command, so the target cluster's configuration is referenced
|
47
|
+
at that moment.
|
48
|
+
|
49
|
+
This approach of parameterizing the urls and resource bindings that reach a target
|
50
|
+
Hadoop cluster allows a single job.properties file to work for all Hadoop clusters
|
51
|
+
(i.e. production, staging, ad-hoc) in your infrastructure. To get a listing of what ERB
|
52
|
+
variables are available for expansion within the hadoop environment currently being
|
53
|
+
targeted:
|
54
|
+
|
55
|
+
$ hodor master:config
|
56
|
+
INFO|nameNode : hdfs://sandbox.hortonworks.com:8020
|
57
|
+
INFO|jobTracker : sandbox.hortonworks.com:8050
|
58
|
+
INFO|oozie_url : http://127.0.0.1:11000/oozie
|
59
|
+
INFO|ssh_user : root
|
60
|
+
INFO|ssh_host : 127.0.0.1
|
61
|
+
INFO|ssh_port : 2222
|
62
|
+
|
63
|
+
Finally, the motivation for breaking a single job.properties file up into multiple segments
|
64
|
+
arranged hierarchically is adherence to the "Do not repeat yourself" (DRY) principle.
|
65
|
+
By breaking the job.properties.erb file into a hierarchy, we avoid lots of duplication
|
66
|
+
that would exist in a flat file system. For example, properties that are shared by all
|
67
|
+
jobs are pushed to higher job.properties.erb files. Properties that are specific to a
|
68
|
+
particular job, are pushed to lower ones.
|
@@ -0,0 +1,52 @@
|
|
1
|
+
The display_job command allows you to explore and print information about the current
|
2
|
+
job within the Oozie job hierarchy and (optionally) to move between jobs. Display_job
|
3
|
+
is the most feature-rich command in the command set, allowing the user to display
|
4
|
+
information about a running or completed bundle, coordinator, workflow, oozie action,
|
5
|
+
or hadoop job. In addition, for each type of job, you can display different aspects
|
6
|
+
of the job's state. For example, the job's definition, its log file, its config
|
7
|
+
settings, etc. Display_jobs info display also shows the children of the current job
|
8
|
+
organized in a table. If the current job is a coordinator, the children will be all
|
9
|
+
the materializations of that coordinator, for example. Some of the options apply to
|
10
|
+
the info display about the current job (i.e. -v option) and some of the options
|
11
|
+
apply to the table of children (ie. -l option). The following display_job commands
|
12
|
+
illustrate the display of different job state aspects and their options:
|
13
|
+
\x5 $ hodor oozie:display_job info # displays overall info about the job
|
14
|
+
$ hodor oozie:display_job -v info # displays all available information
|
15
|
+
$ hodor oozie:display_job info -m 'data_source' # only show matching child rows
|
16
|
+
$ hodor oozie:display_job info -l 100 # show 100 children of current job
|
17
|
+
$ hodor oozie:display_job info -l 100 -o 100 # show second 100 children
|
18
|
+
$ hodor oozie:display_job info -k # display only killed children
|
19
|
+
$ hodor oozie:display_job log # displays the job's log file
|
20
|
+
$ hodor oozie:display_job log -w fail.log # writes log output to file
|
21
|
+
$ hodor oozie:display_job definition # displays the job's definition
|
22
|
+
$ hodor oozie:display_job conf # displays the job's property settings
|
23
|
+
$ hodor oozie:display_job conf -m 'rdbms' # matching properties (keys or values)
|
24
|
+
$ hodor oozie:display_job rest # displays current REST request url
|
25
|
+
$ hodor oozie:display_job json # raw json output of REST call
|
26
|
+
$ hodor oozie:display_job json -w rest.json # the -w option works for all aspects
|
27
|
+
|
28
|
+
In addition to displaying information about the current job, display_job can also
|
29
|
+
describe non-current jobs. Just pass in a job id, as follows:
|
30
|
+
\x5 $ hodor oozie:display_job 3 json # rest output for child #3
|
31
|
+
$ hodor oozie:display_job 0035642-151002103648730-oozie-oozi-W log
|
32
|
+
# ^^ display log for job ID
|
33
|
+
|
34
|
+
Display_job can function in 1 of 2 modes: query mode or change mode. In query mode,
|
35
|
+
display_job just queries for and displays the jobs state, but does not change the
|
36
|
+
current_job to it. In change mode, display_job still shows the job's state, but
|
37
|
+
also changes the current_job, like change_job does. The default mode is change_mode,
|
38
|
+
but this behavior can be altered in two ways. First, you can add the -q option to the
|
39
|
+
command line:
|
40
|
+
\x5 $ hodor oozie:display_job -q 3 # Just queries child 3, does not change to it
|
41
|
+
|
42
|
+
Alternatively, you can change display_job's default behavior by modifying the Hodor
|
43
|
+
preference ":display_job_query_mode" in ~/.hodor.yml as follows:
|
44
|
+
\x5 FILE: ${HOME}/.hodor.yml ----
|
45
|
+
:display_job_query_mode: true
|
46
|
+
|
47
|
+
Hodor reads ~/.hodor.yml on startup, and if the above flag is true, query mode becomes
|
48
|
+
display_job's default behavior.
|
49
|
+
|
50
|
+
Suggested Alias:
|
51
|
+
\x5 $ alias dj='hodor oozie:display_job'
|
52
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
Driver Scenarios
|
2
|
+
---------------------------------------------------------------------------
|
3
|
+
Within the drivers directory, a subdirectory named "scenarios" is expected
|
4
|
+
by Hodor. The term "scenario" has aspecial meaning. A scenario is
|
5
|
+
the combination of the "fill_frequency" plus the "fill_type" that a workflow
|
6
|
+
should execute for. For example, some workflows run hourly, some daily. And
|
7
|
+
some workflows run over historic ranges of time (i.e. a "backfill"), and some
|
8
|
+
workflow run for future ranges of time (i.e. "incremental"). So, a driver
|
9
|
+
can run with a "scenario", which is the combination of the two:
|
10
|
+
|
11
|
+
(fill frequency + fill type).
|
12
|
+
|
13
|
+
The scenarios that are typically defined are:
|
14
|
+
|
15
|
+
* hourly/incremental
|
16
|
+
* hourly/backfill
|
17
|
+
* daily/incremental
|
18
|
+
* daily/backfill
|
19
|
+
|
20
|
+
The scenarios directory, under drivers, defines Oozie coordinators and context
|
21
|
+
code artifacts that implement the scheduling and variable assignment concerns necessary
|
22
|
+
to implement the scenario. A typical scenario is organized as follows
|
23
|
+
|
24
|
+
|
25
|
+
hourly/
|
26
|
+
incremental/
|
27
|
+
context.xml
|
28
|
+
coordinator.xml # blocks on time only
|
29
|
+
coordinator-1.xml # blocks on time + 1 data input
|
30
|
+
|
31
|
+
For example, the above "hourly/incremental" driver run scenario defines a
|
32
|
+
coordinator that has an hourly frequency, along with a context.xml workflow that
|
33
|
+
defines the table partitioning scheme to partition down to hourly granularity
|
34
|
+
("i.e. an hourly fill_frequency"). The "fill_type" part of the scenario indicates whether
|
35
|
+
the workflow is running from current time forward, or for historic date ranges.
|
36
|
+
If running over historic date ranges (i.e. backfill fill_type), certain optimizations
|
37
|
+
can be made that run many hours at once.
|
38
|
+
|
39
|
+
Each driver cites the run scenario it expects to run within, via the "jobs.yml"
|
40
|
+
file. For more information about the jobs.yml file, type:
|
41
|
+
|
42
|
+
hodor oozie:topic jobs.yml
|