hodor 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/Guardfile +11 -0
  10. data/README.md +105 -0
  11. data/Rakefile +105 -0
  12. data/bin/hodor +18 -0
  13. data/hodor.gemspec +47 -0
  14. data/lib/config/log4r_config.xml +35 -0
  15. data/lib/hodor.rb +83 -0
  16. data/lib/hodor/api/hdfs.rb +222 -0
  17. data/lib/hodor/api/oozie.rb +215 -0
  18. data/lib/hodor/api/oozie/action.rb +52 -0
  19. data/lib/hodor/api/oozie/bundle.rb +27 -0
  20. data/lib/hodor/api/oozie/coordinator.rb +53 -0
  21. data/lib/hodor/api/oozie/hadoop_job.rb +29 -0
  22. data/lib/hodor/api/oozie/job.rb +192 -0
  23. data/lib/hodor/api/oozie/materialization.rb +56 -0
  24. data/lib/hodor/api/oozie/query.rb +115 -0
  25. data/lib/hodor/api/oozie/session.rb +170 -0
  26. data/lib/hodor/api/oozie/workflow.rb +58 -0
  27. data/lib/hodor/cli.rb +146 -0
  28. data/lib/hodor/command.rb +164 -0
  29. data/lib/hodor/configuration.rb +80 -0
  30. data/lib/hodor/environment.rb +437 -0
  31. data/lib/hodor/ui/table.rb +130 -0
  32. data/lib/hodor/version.rb +3 -0
  33. data/lib/tasks/hdfs.thor +138 -0
  34. data/lib/tasks/master.thor +61 -0
  35. data/lib/tasks/oozie.thor +399 -0
  36. data/lib/tasks/sandbox.thor +87 -0
  37. data/spec/integration/api/oozie/action_spec.rb +69 -0
  38. data/spec/integration/api/oozie/bundle_spec.rb +33 -0
  39. data/spec/integration/api/oozie/coordinator_spec.rb +66 -0
  40. data/spec/integration/api/oozie/hadoop_job_spec.rb +29 -0
  41. data/spec/integration/api/oozie/job_spec.rb +15 -0
  42. data/spec/integration/api/oozie/materialization_spec.rb +66 -0
  43. data/spec/integration/api/oozie/query_spec.rb +43 -0
  44. data/spec/integration/api/oozie/session_spec.rb +18 -0
  45. data/spec/integration/api/oozie/workflow_spec.rb +65 -0
  46. data/spec/integration/api/oozie_spec.rb +198 -0
  47. data/spec/integration/fixtures/api/running_coordinators/req_resp_00.memo +6 -0
  48. data/spec/integration/fixtures/api/sample_action/req_resp_00.memo +5 -0
  49. data/spec/integration/fixtures/api/sample_action/req_resp_01.memo +7 -0
  50. data/spec/integration/fixtures/api/sample_bundle/req_resp_00.memo +6 -0
  51. data/spec/integration/fixtures/api/sample_coordinator/req_resp_00.memo +5 -0
  52. data/spec/integration/fixtures/api/sample_materialization/req_resp_00.memo +5 -0
  53. data/spec/integration/fixtures/api/sample_materialization/req_resp_01.memo +7 -0
  54. data/spec/integration/fixtures/api/sample_workflow/req_resp_00.memo +5 -0
  55. data/spec/spec_helper.rb +92 -0
  56. data/spec/support/d_v_r.rb +125 -0
  57. data/spec/support/hodor_api.rb +15 -0
  58. data/spec/unit/hodor/api/hdfs_spec.rb +63 -0
  59. data/spec/unit/hodor/api/oozie_spec.rb +32 -0
  60. data/spec/unit/hodor/environment_spec.rb +52 -0
  61. data/topics/hdfs/corresponding_paths.txt +31 -0
  62. data/topics/hdfs/overview.txt +10 -0
  63. data/topics/master/clusters.yml.txt +36 -0
  64. data/topics/master/overview.txt +17 -0
  65. data/topics/oozie/blocking_coordinators.txt +46 -0
  66. data/topics/oozie/composing_job_properties.txt +68 -0
  67. data/topics/oozie/display_job.txt +52 -0
  68. data/topics/oozie/driver_scenarios.txt +42 -0
  69. data/topics/oozie/inspecting_jobs.txt +59 -0
  70. data/topics/oozie/jobs.yml.txt +185 -0
  71. data/topics/oozie/overview.txt +43 -0
  72. data/topics/oozie/workers_and_drivers.txt +40 -0
  73. metadata +455 -0
data/spec/support/hodor_api.rb
@@ -0,0 +1,15 @@
+ require 'hodor/api/oozie/session'
+
+ shared_context "hodor api" do
+
+   attr_reader :memo
+
+   subject(:env) { ::Hodor::Environment.instance }
+   subject(:session) { ::Hodor::Oozie::Session.instance }
+   subject(:oozie) { ::Hodor::Oozie }
+
+   before(:each) do
+     @memo = DVR.new(self) unless (self.methods & [:scenario, :playback, :record]).empty?
+   end
+
+ end
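The shared context above attaches a record/playback "memo" (the DVR defined in spec/support/d_v_r.rb) to any example group that defines a :scenario, :playback, or :record helper. A minimal usage sketch, assuming spec_helper loads the support files (the group and helper value here are hypothetical):

      require 'spec_helper'

      describe "a recorded oozie exchange" do
        include_context "hodor api"

        # Defining any of :scenario, :playback, or :record opts this group
        # into DVR record/playback (the helper value is hypothetical):
        def scenario; 'sample_coordinator'; end

        it "has a DVR memo wired up" do
          expect(memo).to be_a(DVR)
        end
      end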
data/spec/unit/hodor/api/hdfs_spec.rb
@@ -0,0 +1,63 @@
+ require 'hodor/api/hdfs'
+
+ module Hodor
+
+   describe Hdfs do
+
+     describe "Required Public Interface" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:hdfs_methods) { Hodor::Hdfs.instance_methods }
+
+       # Public methods
+       it { should include :pwd }
+       it { should include :path_on_hdfs }
+
+     end
+
+     context "test local to hdfs path operations" do
+
+       before(:each) do
+         use_settings hdfs_root: "/", hdfs_user: "hdfs"
+         use_pwd "company/workers/noop", false
+       end
+
+       context "ensure pwd maps correctly between file systems" do
+
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should correctly map test repo path to HDFS path" do
+           expect(hdfs.pwd).to match(/\/company\/workers\/noop/)
+         end
+       end
+
+       context "test putting file to HDFS" do
+
+         subject(:env) { Hodor::Environment.instance }
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should successfully construct ssh commandline to put file to HDFS" do
+           expect(File).to receive(:exists?).twice { true }
+           expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
+           hdfs.put_file("workflow.xml")
+         end
+       end
+
+       context "test putting directory to HDFS" do
+
+         subject(:env) { Hodor::Environment.instance }
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should successfully construct ssh commandline to put directory to HDFS" do
+           expect(File).to receive(:exists?).twice { true }
+           expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
+           hdfs.put_file("workflow.xml")
+         end
+       end
+
+
+     end
+
+   end
+
+ end
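For reference, the regex in the put expectations above corresponds to an ssh pipeline of roughly the following shape. This is a sketch reconstructed from the expectation itself, not the gem's actual command builder; the host name and quoting are assumptions:

      # Hypothetical reconstruction of the command run_local receives
      def put_file_command(file, hdfs_user, hdfs_dir)
        "cat #{file} | ssh hadoop-master \"HADOOP_USER_NAME=#{hdfs_user} hadoop fs -put - #{hdfs_dir}/#{file}\""
      end

      put_file_command('workflow.xml', 'hdfs', '/company/workers/noop')
      # matches /cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/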
data/spec/unit/hodor/api/oozie_spec.rb
@@ -0,0 +1,32 @@
+ module Hodor
+   describe Oozie do
+     describe 'Required Public Interface' do
+       subject { Hodor::Oozie }
+
+       # Public methods
+       it { should respond_to :job_by_id }
+       it { should respond_to :job_by_path }
+       it { should respond_to :change_job }
+       it { should respond_to :compose_job_file }
+       it { should respond_to :run_job }
+     end
+     context 'Filename prefixes' do
+       let(:prefix) { 'Test_prefix_' }
+       let(:full_path) { 'foo/foo/foo' }
+       let(:just_name_path) { 'foo' }
+       let(:correctly_prefixed) { 'foo/foo/Test_prefix_foo' }
+
+       it 'appends a supplied prefix to the file name' do
+         expect(subject.append_prefix_to_filename(full_path, prefix)).to eq(correctly_prefixed)
+       end
+
+       it 'appends a supplied prefix to a simple file name' do
+         expect(subject.append_prefix_to_filename(just_name_path, prefix)).to eq(prefix + just_name_path)
+       end
+
+       it 'keeps original filename if no prefix supplied' do
+         expect(subject.append_prefix_to_filename(full_path)).to eq(full_path)
+       end
+     end
+   end
+ end
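The three prefix examples pin down append_prefix_to_filename well enough to sketch a conforming implementation (illustrative only; the gem's actual code may differ):

      def append_prefix_to_filename(path, prefix = nil)
        return path unless prefix
        dir, base = File.split(path)
        # File.split('foo') yields ['.', 'foo'], so bare names get no directory part
        dir == '.' ? "#{prefix}#{base}" : File.join(dir, "#{prefix}#{base}")
      end

      append_prefix_to_filename('foo/foo/foo', 'Test_prefix_')  # => "foo/foo/Test_prefix_foo"
      append_prefix_to_filename('foo', 'Test_prefix_')          # => "Test_prefix_foo"
      append_prefix_to_filename('foo/foo/foo')                  # => "foo/foo/foo"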
data/spec/unit/hodor/environment_spec.rb
@@ -0,0 +1,52 @@
+ require 'hodor/environment'
+
+ module Hodor
+
+   describe Environment do
+
+     describe "Required Public Interface" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:hadoop_env) { Hodor::Environment.instance_methods }
+
+       # Public fields
+       it { should include :logger }
+
+       # Public methods
+       it { should include :erb_sub }
+       it { should include :erb_load }
+       it { should include :yml_load }
+       it { should include :root }
+     end
+
+     describe "Ensure usable test repo" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:env) { Hodor::Environment.instance }
+
+       it "should have correct root" do
+         expect(subject.root).to match(/spec\/test_repo/)
+       end
+     end
+
+     context "Test basic environment methods" do
+
+       subject(:env) { Hodor::Environment.instance }
+
+       before(:each) do
+         use_settings hdfs_root: "/", hdfs_user: "hdfs"
+         use_pwd "drivers/testbench"
+       end
+
+       it "should enumerate the paths from the repo root down to the pwd" do
+         expect(
+           env.paths_from_root(Dir.pwd)
+         ).to match_array(
+           [/spec\/test_repo/,
+            /spec\/test_repo\/drivers/,
+            /spec\/test_repo\/drivers\/testbench/]
+         )
+       end
+     end
+   end
+ end
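The match_array expectation implies that paths_from_root returns every directory from the repo root down to the working directory, one entry per level. A conforming sketch (behavior inferred from the spec, not taken from the gem's source):

      def paths_from_root(pwd, root)
        rel = pwd.sub(/^#{Regexp.escape(root)}\/?/, '')
        parts = rel.empty? ? [] : rel.split('/')
        # one path per level, starting at the root itself
        (0..parts.size).map { |i| File.join(root, *parts[0, i]) }
      end

      paths_from_root('/work/spec/test_repo/drivers/testbench', '/work/spec/test_repo')
      # => ["/work/spec/test_repo",
      #     "/work/spec/test_repo/drivers",
      #     "/work/spec/test_repo/drivers/testbench"]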
data/topics/hdfs/corresponding_paths.txt
@@ -0,0 +1,31 @@
+ HDFS Corresponding Paths
+ --------------------------------------------------------------------------------------------
+ There are two methods for running Hdfs commands using this namespace. First, you can run the
+ "hdfs:fs" pass-through command, as follows:
+
+      $ hodor hdfs:fs -ls /big_data/pipeline/drivers/testbench
+
+ Because "fs" is a pass-through command, it passes all of its arguments and options through
+ as-is to the "hadoop fs" command line tool on the remote host. The "fs" command does no
+ argument processing locally; it simply passes them via ssh for handling by the remote tool.
+ Alternatively, you could accomplish the same directory listing as the above command using
+ the 'hdfs:ls' command:
+
+      $ cd /big_data/pipeline/drivers/testbench
+      $ hodor hdfs:ls
+
+ The "ls" command uses your current local path to calculate the "corresponding path" on the
+ remote HDFS volume, and lists the contents of that directory. The "corresponding path"
+ on the remote HDFS volume is automatically determined by the Hdfs namespace and used by
+ each of its commands.
+
+ This HDFS path inference from your local path is done by all commands in the Hdfs namespace,
+ except for the "fs" pass-through. For example, to upload a file in your current local directory
+ to the corresponding path on the remote HDFS volume:
+
+      $ cd /big_data/pipeline/drivers/
+      $ hodor hdfs:put ingestion.xml
+
+ Note: the corresponding path is determined by first calculating your local path relative to
+ the root of your git repo, known as the "repo relative path". Next, the repo relative
+ path is appended to the HDFS root that you configure in your 'clusters.yml' file.
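In code terms, the note above amounts to the following mapping (a minimal sketch of the rule as stated; not Hodor's actual implementation):

      # repo_root comes from your git checkout; hdfs_root from clusters.yml
      def corresponding_path(local_pwd, repo_root, hdfs_root)
        repo_relative = local_pwd.sub(/^#{Regexp.escape(repo_root)}/, '')
        File.join(hdfs_root, repo_relative)
      end

      corresponding_path('/home/me/repo/drivers/testbench', '/home/me/repo', '/big_data/pipeline')
      # => "/big_data/pipeline/drivers/testbench"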
data/topics/hdfs/overview.txt
@@ -0,0 +1,10 @@
+ Hodor's Hdfs namespace assembles a command set that operates on a remote HDFS volume. Instead
+ of ssh'ing to the remote Hadoop master and running 'hadoop fs <command>' directly on the
+ remote host, you run commands from Hodor's Hdfs namespace locally.
+
+ HELP TOPICS:
+ ------------
+ * Local vs HDFS paths - to view help about HDFS corresponding paths, type:
+      $ hodor hdfs:topic corresponding_paths
+
+ \x5
data/topics/master/clusters.yml.txt
@@ -0,0 +1,36 @@
+ The Clusters.yml File
+ ---------------------------------------------------------------------------------------------------
+ The "clusters.yml" file is a file Hodor reads at startup that allows you to define multiple Hadoop
+ clusters, and the URL and resource bindings they require, so that your Hodor session can be easily
+ switched between multiple available Hadoop clusters. Hodor's namespaces and commands use the URL
+ and resource bindings from clusters.yml to select the destination for their various SSH and REST
+ calls. The clusters.yml file is expected in your hadoop project git repo at the path:
+
+      "<repo_root>/config/clusters.yml"
+
+ The clusters.yml file will typically look something like the following:
+
+      :production:
+        :nameNode: hdfs://hadoop-prod.mycompany.com:8020
+        :jobTracker: hadoop-prod.mycompany.com:8050
+        :oozie_url: http://hadoop-prod.mycompany.com:11000/oozie
+
+      :staging:
+        :nameNode: hdfs://hadoop-stage.mycompany.com:8020
+        :jobTracker: hadoop-stage.mycompany.com:8050
+        :oozie_url: http://hadoop-stage.mycompany.com:11000/oozie
+
+ The section (staging or production) that Hodor uses (i.e. the "target cluster") is configured using:
+
+      $ export HADOOP_ENV=production
+
+ To display a list of all key/value pairs that are defined for the current target cluster:
+
+      $ hodor master:config
+
+ To view more information about how the clusters.yml keys and values can be used to
+ parameterize your job properties, type:
+
+      $ hodor oozie:topic compose_job_properties
+
+
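Because the keys in clusters.yml are written with leading colons, Ruby's YAML loader surfaces them as symbols. A sketch of how the target section could be resolved from HADOOP_ENV (illustrative; not the gem's actual loading code, and the 'staging' fallback is an assumption):

      require 'yaml'

      path     = 'config/clusters.yml'
      # Ruby >= 3.1 needs unsafe_load_file (or permitted_classes: [Symbol]) for symbol keys
      clusters = YAML.respond_to?(:unsafe_load_file) ? YAML.unsafe_load_file(path) : YAML.load_file(path)
      target   = (ENV['HADOOP_ENV'] || 'staging').to_sym
      cluster  = clusters.fetch(target)
      cluster[:oozie_url]  # => "http://hadoop-prod.mycompany.com:11000/oozie" when HADOOP_ENV=production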
data/topics/master/overview.txt
@@ -0,0 +1,17 @@
+
+ Hodor's Master namespace functions as a local proxy for the master (i.e. "name node") of your target
+ Hadoop cluster. The commands in this namespace either operate directly on the master node or
+ provide information about the cluster that the master node manages. For example, to run a Linux command
+ on the master node:
+ \x5   $ hodor master:exec ps -ef | grep java
+
+ Part of the value of this namespace is that it allows you to operate on the master node, whatever that
+ may be. For example, regardless of whether you are targeting a staging or production Hadoop cluster
+ (as configured in your "clusters.yml" file), the 'master:exec' command will route appropriately.
+
+ HELP TOPICS:
+ ------------
+ * The clusters.yml file - to view help about clusters.yml, type:
+      $ hodor master:topic clusters.yml
+
+ \x5
data/topics/oozie/blocking_coordinators.txt
@@ -0,0 +1,46 @@
+ Blocking Coordinators
+ =====================
+ Oozie's coordinators can block on two things: the clock and data availability.
+ A coordinator that blocks on both the clock and data availability is called
+ a "blocking coordinator" because it will not run until both the time and the
+ input data it is waiting on arrive. The run.properties file allows you to
+ choose either a blocking coordinator or a non-blocking coordinator to pair
+ with your driver workflow. To specify a blocking coordinator, you will set
+ the following property values:
+
+      oozie.coord.application.path=${PWD}/coordinator-1-input.xml
+
+ This property value setting specifies that you want to use a blocking
+ coordinator that has 1 data input that it waits on. There is also a
+ similarly named 2-input coordinator if you require that. It is not sufficient
+ to specify that your coordinator should block on 1 input; you must also
+ specify which input that is. To do that, you'll use the "input_1" variable.
+ For example:
+
+      input_1=${service_data}
+
+ The "input_1" variable is expanded by the 1-input coordinator to specify
+ which input dataset the coordinator monitors for data availability flags.
+ Input 1 is set to the variable ${service_data}, which is defined in the
+ shared (higher level) job.properties.erb file as follows:
+
+      service_data=${shared}/ready/service_data
+
+ By setting the input_1 and coordinator path variables above in the
+ run.properties file, the rake task will launch the corresponding coordinator
+ from the scenarios directory. The subdirectory within scenarios is chosen
+ according to your scenario variable assignment:
+
+      scenario: hourly/incremental
+
+ With this assignment, the 1-input blocking coordinator located in the
+ hourly/incremental subdirectory of scenarios will be launched. It will monitor
+ the service_data dataset, and when new data arrives, it will run the
+ workflow indicated by your assignment to the "driver" variable:
+
+      driver=periodic/analyze_campaign
+
+ This approach treats the frequency, data dependencies, run contexts and
+ workflow selections as distinct concerns and allows you to mix
+ and match them via run.properties settings as the situation warrants.
+
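Taken together, the assignments discussed in this topic consolidate into a run.properties along the following lines (an illustrative sketch; whether the scenario assignment uses key=value syntax here, rather than the yml-style shown above, is an assumption):

      # choose the 1-input blocking coordinator from the scenario directory
      oozie.coord.application.path=${PWD}/coordinator-1-input.xml
      # the dataset whose availability it blocks on
      input_1=${service_data}
      # which scenario subdirectory to use, and which driver workflow to run
      scenario=hourly/incremental
      driver=periodic/analyze_campaign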
data/topics/oozie/composing_job_properties.txt
@@ -0,0 +1,68 @@
+ Job.properties Files
+ -----------------------------------------------------------------------------------------------
+ Before you can run a job, you must first specify the properties it should receive. This is
+ done via a job.properties file. Hodor's Oozie namespace enables you to assemble a hierarchy
+ of job.properties.erb files into a single job.properties file suitable for execution of an
+ Oozie job. The following directory structure shows several job.properties.erb files at
+ different locations within the hierarchy:
+
+      <git_repo>
+         job.properties.erb
+         <drivers/>
+            job.properties.erb
+            <billing>
+               job.properties.erb
+            <campaigns>
+               job.properties.erb
+            <diagnostic>
+               job.properties.erb
+
+ For example, running a workflow contained in the <campaigns> directory will pull in three
+ job.properties.erb files: first the one located at the root, followed by the one located in
+ <drivers>, followed by the one located in <campaigns>. Each successive job.properties.erb
+ file overrides the properties of its parents located higher in the directory structure.
+
+ A typical job.properties file might look like the following:
+
+      nameNode=<%= env[:nameNode] %>
+      jobTracker=<%= env[:jobTracker] %>
+      oozie_url=<%= env[:oozie_url] %>
+
+ Each line in the above job.properties file sample represents the coming together of two
+ distinct property systems, Java's "Velocity" property system and Ruby's "ERB" property
+ system:
+
+      <Hadoop Property> = <Ruby ERB expansion>
+
+ The key on the left (i.e. nameNode) is a Hadoop Java property that is referenced and expanded
+ by workflows and coordinators at runtime on the Hadoop cluster. The value on the right
+ (i.e. env[:nameNode]) is a Ruby ERB expansion, taken from the clusters.yml file, that is
+ replaced _on your local machine_ prior to deploying the workflows and coordinators to HDFS.
+ In other words, Java variables are expanded post-deployment. ERB variables are expanded
+ pre-deployment. ERB expansions within a job.properties file provide a means of passing a
+ value from ERB to Velocity. The "<%= env[:nameNode] %>" part of the above
+ job.properties.erb sample is replaced by the ":nameNode" key value (from clusters.yml)
+ for the currently targeted Hadoop cluster. This variable substitution happens when you
+ run the "oozie:run_job" command, so the target cluster's configuration is referenced
+ at that moment.
+
+ This approach of parameterizing the URLs and resource bindings that reach a target
+ Hadoop cluster allows a single job.properties file to work for all Hadoop clusters
+ (i.e. production, staging, ad-hoc) in your infrastructure. To get a listing of what ERB
+ variables are available for expansion within the hadoop environment currently being
+ targeted:
+
+      $ hodor master:config
+        INFO|nameNode : hdfs://sandbox.hortonworks.com:8020
+        INFO|jobTracker : sandbox.hortonworks.com:8050
+        INFO|oozie_url : http://127.0.0.1:11000/oozie
+        INFO|ssh_user : root
+        INFO|ssh_host : 127.0.0.1
+        INFO|ssh_port : 2222
+
+ Finally, the motivation for breaking a single job.properties file up into multiple segments
+ arranged hierarchically is adherence to the "Do not repeat yourself" (DRY) principle.
+ By breaking the job.properties.erb file into a hierarchy, we avoid lots of duplication
+ that would exist in a flat file system. For example, properties that are shared by all
+ jobs are pushed to higher job.properties.erb files. Properties that are specific to a
+ particular job are pushed to lower ones.
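Two mechanisms described above lend themselves to short sketches. First, the hierarchical override behaves like a last-writer-wins merge down the directory chain (property names here are hypothetical):

      layers = [
        { 'queue' => 'default', 'retention_days' => '30' },  # <git_repo>/job.properties.erb
        { 'queue' => 'drivers' },                            # <drivers>/job.properties.erb
        { 'retention_days' => '7' }                          # <campaigns>/job.properties.erb
      ]
      effective = layers.reduce({}) { |merged, layer| merged.merge(layer) }
      # => {"queue"=>"drivers", "retention_days"=>"7"}

Second, the pre-deployment pass is ordinary ERB evaluation against the clusters.yml bindings (a sketch; Hodor's own expansion presumably runs through the erb_sub/erb_load methods listed in the Environment spec above):

      require 'erb'

      env = { nameNode: 'hdfs://sandbox.hortonworks.com:8020',
              jobTracker: 'sandbox.hortonworks.com:8050' }
      template = "nameNode=<%= env[:nameNode] %>\njobTracker=<%= env[:jobTracker] %>\n"
      puts ERB.new(template).result(binding)
      # nameNode=hdfs://sandbox.hortonworks.com:8020
      # jobTracker=sandbox.hortonworks.com:8050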
data/topics/oozie/display_job.txt
@@ -0,0 +1,52 @@
+ The display_job command allows you to explore and print information about the current
+ job within the Oozie job hierarchy and (optionally) to move between jobs. Display_job
+ is the most feature-rich command in the command set, allowing the user to display
+ information about a running or completed bundle, coordinator, workflow, oozie action,
+ or hadoop job. In addition, for each type of job, you can display different aspects
+ of the job's state. For example, the job's definition, its log file, its config
+ settings, etc. Display_job's info display also shows the children of the current job
+ organized in a table. If the current job is a coordinator, the children will be all
+ the materializations of that coordinator, for example. Some of the options apply to
+ the info display about the current job (i.e. -v option) and some of the options
+ apply to the table of children (i.e. -l option). The following display_job commands
+ illustrate the display of different job state aspects and their options:
+ \x5   $ hodor oozie:display_job info                   # displays overall info about the job
+   $ hodor oozie:display_job -v info                # displays all available information
+   $ hodor oozie:display_job info -m 'data_source'  # only show matching child rows
+   $ hodor oozie:display_job info -l 100            # show 100 children of current job
+   $ hodor oozie:display_job info -l 100 -o 100     # show second 100 children
+   $ hodor oozie:display_job info -k                # display only killed children
+   $ hodor oozie:display_job log                    # displays the job's log file
+   $ hodor oozie:display_job log -w fail.log        # writes log output to file
+   $ hodor oozie:display_job definition             # displays the job's definition
+   $ hodor oozie:display_job conf                   # displays the job's property settings
+   $ hodor oozie:display_job conf -m 'rdbms'        # matching properties (keys or values)
+   $ hodor oozie:display_job rest                   # displays current REST request url
+   $ hodor oozie:display_job json                   # raw json output of REST call
+   $ hodor oozie:display_job json -w rest.json      # the -w option works for all aspects
+
+ In addition to displaying information about the current job, display_job can also
+ describe non-current jobs. Just pass in a job id, as follows:
+ \x5   $ hodor oozie:display_job 3 json                 # rest output for child #3
+   $ hodor oozie:display_job 0035642-151002103648730-oozie-oozi-W log
+                                                    # ^^ display log for job ID
+
+ Display_job can function in one of two modes: query mode or change mode. In query mode,
+ display_job just queries for and displays the job's state, but does not change the
+ current_job to it. In change mode, display_job still shows the job's state, but
+ also changes the current_job, like change_job does. The default mode is change mode,
+ but this behavior can be altered in two ways. First, you can add the -q option to the
+ command line:
+ \x5   $ hodor oozie:display_job -q 3                   # Just queries child 3, does not change to it
+
+ Alternatively, you can change display_job's default behavior by modifying the Hodor
+ preference ":display_job_query_mode" in ~/.hodor.yml as follows:
+ \x5   FILE: ${HOME}/.hodor.yml ----
+   :display_job_query_mode: true
+
+ Hodor reads ~/.hodor.yml on startup, and if the above flag is true, query mode becomes
+ display_job's default behavior.
+
+ Suggested Alias:
+ \x5   $ alias dj='hodor oozie:display_job'
+
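A sketch of how the startup preference read could work (illustrative; not the gem's actual code, though the key matches the one documented above):

      require 'yaml'

      prefs_path = File.expand_path('~/.hodor.yml')
      prefs = File.exist?(prefs_path) ? YAML.load_file(prefs_path) : {}
      # when true, display_job behaves as if -q were always passed
      query_mode = prefs.fetch(:display_job_query_mode, false)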
data/topics/oozie/driver_scenarios.txt
@@ -0,0 +1,42 @@
+ Driver Scenarios
+ ---------------------------------------------------------------------------
+ Within the drivers directory, a subdirectory named "scenarios" is expected
+ by Hodor. The term "scenario" has a special meaning. A scenario is
+ the combination of the "fill_frequency" plus the "fill_type" that a workflow
+ should execute for. For example, some workflows run hourly, some daily. And
+ some workflows run over historic ranges of time (i.e. a "backfill"), and some
+ workflows run for future ranges of time (i.e. "incremental"). So, a driver
+ can run with a "scenario", which is the combination of the two:
+
+      fill frequency + fill type
+
+ The scenarios that are typically defined are:
+
+   * hourly/incremental
+   * hourly/backfill
+   * daily/incremental
+   * daily/backfill
+
+ The scenarios directory, under drivers, defines Oozie coordinators and context
+ code artifacts that implement the scheduling and variable assignment concerns necessary
+ to implement the scenario. A typical scenario is organized as follows:
+
+
+      hourly/
+        incremental/
+          context.xml
+          coordinator.xml        # blocks on time only
+          coordinator-1.xml      # blocks on time + 1 data input
+
+ For example, the above "hourly/incremental" driver run scenario defines a
+ coordinator that has an hourly frequency, along with a context.xml workflow that
+ defines the table partitioning scheme to partition down to hourly granularity
+ (i.e. an hourly "fill_frequency"). The "fill_type" part of the scenario indicates whether
+ the workflow is running from current time forward, or for historic date ranges.
+ If running over historic date ranges (i.e. backfill fill_type), certain optimizations
+ can be made that run many hours at once.
+
+ Each driver cites the run scenario it expects to run within, via the "jobs.yml"
+ file. For more information about the jobs.yml file, type:
+
+      hodor oozie:topic jobs.yml
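Combining the "scenario" and "driver" assignments from run.properties, the coordinator a run launches resolves to a path along the following lines (a sketch; the exact layout of the scenarios directory is assumed from this topic):

      scenario    = 'hourly/incremental'
      coordinator = 'coordinator-1.xml'   # or coordinator.xml when nothing blocks on data
      File.join('drivers', 'scenarios', scenario, coordinator)
      # => "drivers/scenarios/hourly/incremental/coordinator-1.xml"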