hodor 1.0.2

Files changed (73)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/Guardfile +11 -0
  10. data/README.md +105 -0
  11. data/Rakefile +105 -0
  12. data/bin/hodor +18 -0
  13. data/hodor.gemspec +47 -0
  14. data/lib/config/log4r_config.xml +35 -0
  15. data/lib/hodor.rb +83 -0
  16. data/lib/hodor/api/hdfs.rb +222 -0
  17. data/lib/hodor/api/oozie.rb +215 -0
  18. data/lib/hodor/api/oozie/action.rb +52 -0
  19. data/lib/hodor/api/oozie/bundle.rb +27 -0
  20. data/lib/hodor/api/oozie/coordinator.rb +53 -0
  21. data/lib/hodor/api/oozie/hadoop_job.rb +29 -0
  22. data/lib/hodor/api/oozie/job.rb +192 -0
  23. data/lib/hodor/api/oozie/materialization.rb +56 -0
  24. data/lib/hodor/api/oozie/query.rb +115 -0
  25. data/lib/hodor/api/oozie/session.rb +170 -0
  26. data/lib/hodor/api/oozie/workflow.rb +58 -0
  27. data/lib/hodor/cli.rb +146 -0
  28. data/lib/hodor/command.rb +164 -0
  29. data/lib/hodor/configuration.rb +80 -0
  30. data/lib/hodor/environment.rb +437 -0
  31. data/lib/hodor/ui/table.rb +130 -0
  32. data/lib/hodor/version.rb +3 -0
  33. data/lib/tasks/hdfs.thor +138 -0
  34. data/lib/tasks/master.thor +61 -0
  35. data/lib/tasks/oozie.thor +399 -0
  36. data/lib/tasks/sandbox.thor +87 -0
  37. data/spec/integration/api/oozie/action_spec.rb +69 -0
  38. data/spec/integration/api/oozie/bundle_spec.rb +33 -0
  39. data/spec/integration/api/oozie/coordinator_spec.rb +66 -0
  40. data/spec/integration/api/oozie/hadoop_job_spec.rb +29 -0
  41. data/spec/integration/api/oozie/job_spec.rb +15 -0
  42. data/spec/integration/api/oozie/materialization_spec.rb +66 -0
  43. data/spec/integration/api/oozie/query_spec.rb +43 -0
  44. data/spec/integration/api/oozie/session_spec.rb +18 -0
  45. data/spec/integration/api/oozie/workflow_spec.rb +65 -0
  46. data/spec/integration/api/oozie_spec.rb +198 -0
  47. data/spec/integration/fixtures/api/running_coordinators/req_resp_00.memo +6 -0
  48. data/spec/integration/fixtures/api/sample_action/req_resp_00.memo +5 -0
  49. data/spec/integration/fixtures/api/sample_action/req_resp_01.memo +7 -0
  50. data/spec/integration/fixtures/api/sample_bundle/req_resp_00.memo +6 -0
  51. data/spec/integration/fixtures/api/sample_coordinator/req_resp_00.memo +5 -0
  52. data/spec/integration/fixtures/api/sample_materialization/req_resp_00.memo +5 -0
  53. data/spec/integration/fixtures/api/sample_materialization/req_resp_01.memo +7 -0
  54. data/spec/integration/fixtures/api/sample_workflow/req_resp_00.memo +5 -0
  55. data/spec/spec_helper.rb +92 -0
  56. data/spec/support/d_v_r.rb +125 -0
  57. data/spec/support/hodor_api.rb +15 -0
  58. data/spec/unit/hodor/api/hdfs_spec.rb +63 -0
  59. data/spec/unit/hodor/api/oozie_spec.rb +32 -0
  60. data/spec/unit/hodor/environment_spec.rb +52 -0
  61. data/topics/hdfs/corresponding_paths.txt +31 -0
  62. data/topics/hdfs/overview.txt +10 -0
  63. data/topics/master/clusters.yml.txt +36 -0
  64. data/topics/master/overview.txt +17 -0
  65. data/topics/oozie/blocking_coordinators.txt +46 -0
  66. data/topics/oozie/composing_job_properties.txt +68 -0
  67. data/topics/oozie/display_job.txt +52 -0
  68. data/topics/oozie/driver_scenarios.txt +42 -0
  69. data/topics/oozie/inspecting_jobs.txt +59 -0
  70. data/topics/oozie/jobs.yml.txt +185 -0
  71. data/topics/oozie/overview.txt +43 -0
  72. data/topics/oozie/workers_and_drivers.txt +40 -0
  73. metadata +455 -0
data/spec/support/hodor_api.rb
@@ -0,0 +1,15 @@
+ require 'hodor/api/oozie/session'
+
+ shared_context "hodor api" do
+
+   attr_reader :memo
+
+   subject(:env) { ::Hodor::Environment.instance }
+   subject(:session) { ::Hodor::Oozie::Session.instance }
+   subject(:oozie) { ::Hodor::Oozie }
+
+   before(:each) do
+     @memo = DVR.new(self) unless (self.methods & [:scenario, :playback, :record]).empty?
+   end
+
+ end
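A spec opts into this shared context with RSpec's include_context; a minimal sketch for illustration (the example group and expectation below are hypothetical, not part of the gem):

    require 'spec_helper'

    describe "an Oozie integration" do    # hypothetical example group
      include_context "hodor api"         # provides env, session, oozie and @memo

      it "exposes the shared Oozie session singleton" do
        expect(session).to be(::Hodor::Oozie::Session.instance)
      end
    end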
data/spec/unit/hodor/api/hdfs_spec.rb
@@ -0,0 +1,63 @@
+ require 'hodor/api/hdfs'
+
+ module Hodor
+
+   describe Hdfs do
+
+     describe "Required Public Interface" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:hdfs_methods) { Hodor::Hdfs.instance_methods }
+
+       # Public methods
+       it { should include :pwd }
+       it { should include :path_on_hdfs }
+
+     end
+
+     context "test local to hdfs path operations" do
+
+       before(:each) do
+         use_settings hdfs_root: "/", hdfs_user: "hdfs"
+         use_pwd "company/workers/noop", false
+       end
+
+       context "ensure pwd maps correctly between file systems" do
+
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should correctly map test repo path to HDFS path" do
+           expect(hdfs.pwd).to match(/\/company\/workers\/noop/)
+         end
+       end
+
+       context "test putting file to HDFS" do
+
+         subject(:env) { Hodor::Environment.instance }
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should successfully construct ssh commandline to put file to HDFS" do
+           expect(File).to receive(:exists?).twice { true }
+           expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
+           hdfs.put_file("workflow.xml")
+         end
+       end
+
+       context "test putting directory to HDFS" do
+
+         subject(:env) { Hodor::Environment.instance }
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should successfully construct ssh commandline to put directory to HDFS" do
+           expect(File).to receive(:exists?).twice { true }
+           expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
+           hdfs.put_file("workflow.xml")
+         end
+       end
+
+     end
+
+   end
+
+ end
data/spec/unit/hodor/api/oozie_spec.rb
@@ -0,0 +1,32 @@
+ module Hodor
+   describe Oozie do
+     describe 'Required Public Interface' do
+       subject { Hodor::Oozie }
+
+       # Public methods (respond_to matcher, not Ruby's respond_to?)
+       it { should respond_to :job_by_id }
+       it { should respond_to :job_by_path }
+       it { should respond_to :change_job }
+       it { should respond_to :compose_job_file }
+       it { should respond_to :run_job }
+     end
+     context 'Filename prefixes' do
+       let(:prefix) { 'Test_prefix_' }
+       let(:full_path) { 'foo/foo/foo' }
+       let(:just_name_path) { 'foo' }
+       let(:correctly_prefixed) { 'foo/foo/Test_prefix_foo' }
+
+       it 'appends a supplied prefix to the file name' do
+         expect(subject.append_prefix_to_filename(full_path, prefix)).to eq(correctly_prefixed)
+       end
+
+       it 'appends a supplied prefix to a simple file name' do
+         expect(subject.append_prefix_to_filename(just_name_path, prefix)).to eq(prefix + just_name_path)
+       end
+
+       it 'keeps original filename if no prefix supplied' do
+         expect(subject.append_prefix_to_filename(full_path)).to eq(full_path)
+       end
+     end
+   end
+ end
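These examples pin down the prefixing behavior; for illustration, a minimal sketch of logic that would satisfy them (not the gem's actual implementation):

    def append_prefix_to_filename(path, prefix = nil)
      return path unless prefix
      dir, name = File.split(path)           # 'foo/foo/foo' -> ['foo/foo', 'foo']
      dir == '.' ? "#{prefix}#{name}" : File.join(dir, "#{prefix}#{name}")
    end

    append_prefix_to_filename('foo/foo/foo', 'Test_prefix_')  # => "foo/foo/Test_prefix_foo"
    append_prefix_to_filename('foo', 'Test_prefix_')          # => "Test_prefix_foo"
    append_prefix_to_filename('foo/foo/foo')                  # => "foo/foo/foo"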
data/spec/unit/hodor/environment_spec.rb
@@ -0,0 +1,52 @@
+ require 'hodor/environment'
+
+ module Hodor
+
+   describe Environment do
+
+     describe "Required Public Interface" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:hadoop_env) { Hodor::Environment.instance_methods }
+
+       # Public fields
+       it { should include :logger }
+
+       # Public methods
+       it { should include :erb_sub }
+       it { should include :erb_load }
+       it { should include :yml_load }
+       it { should include :root }
+     end
+
+     describe "Ensure usable test repo" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:env) { Hodor::Environment.instance }
+
+       it "should have correct root" do
+         expect(subject.root).to match(/spec\/test_repo/)
+       end
+     end
+
+     context "Test basic environment methods" do
+
+       subject(:env) { Hodor::Environment.instance }
+
+       before(:each) do
+         use_settings hdfs_root: "/", hdfs_user: "hdfs"
+         use_pwd "drivers/testbench"
+       end
+
+       it "should list every path from the repo root down to the pwd" do
+         expect(
+           env.paths_from_root(Dir.pwd)
+         ).to match_array(
+           [/spec\/test_repo/,
+            /spec\/test_repo\/drivers/,
+            /spec\/test_repo\/drivers\/testbench/]
+         )
+       end
+     end
+   end
+ end
data/topics/hdfs/corresponding_paths.txt
@@ -0,0 +1,31 @@
+ HDFS Corresponding Paths
+ --------------------------------------------------------------------------------------------
+ There are two methods for running HDFS commands using this namespace. First, you can run the
+ "hdfs:fs" pass-through command, as follows:
+
+    $ hodor hdfs:fs -ls /big_data/pipeline/drivers/testbench
+
+ Because "fs" is a pass-through command, it passes all of its arguments and options through
+ as-is to the "hadoop fs" command line tool on the remote host. The "fs" command does no
+ argument processing locally; it simply passes the arguments via ssh for handling by the
+ remote tool. Alternatively, you could accomplish the same directory listing as the above
+ command using the "hdfs:ls" command:
+
+    $ cd /big_data/pipeline/drivers/testbench
+    $ hodor hdfs:ls
+
+ The "ls" command uses your current local path to calculate the "corresponding path" on the
+ remote HDFS volume, and lists the contents of that directory. The "corresponding path"
+ on the remote HDFS volume is automatically determined by the Hdfs namespace and used by
+ each of its commands.
+
+ This HDFS path inference from your local path is done by all commands in the Hdfs namespace,
+ except for the "fs" pass-through. For example, to upload a file in your current local directory
+ to the corresponding path on the remote HDFS volume:
+
+    $ cd /big_data/pipeline/drivers/
+    $ hodor hdfs:put ingestion.xml
+
+ Note: the corresponding path is determined by first calculating your local path relative to
+ the root of your git repo, known as the "repo relative path". Next, the repo relative
+ path is appended to the HDFS root that you configure in your 'clusters.yml' file.
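To make that concrete, a worked example (the local repo location and hdfs_root value are hypothetical):

    local pwd:            ~/src/pipeline/drivers/testbench
    repo relative path:   drivers/testbench
    hdfs_root:            /big_data/pipeline
    corresponding path:   /big_data/pipeline/drivers/testbench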
data/topics/hdfs/overview.txt
@@ -0,0 +1,10 @@
+ Hodor's Hdfs namespace assembles a command set that operates on a remote HDFS volume. Instead
+ of ssh'ing to the remote Hadoop master and running 'hadoop fs <command>' directly on the
+ remote host, you run commands from Hodor's Hdfs namespace locally.
+
+ HELP TOPICS:
+ ------------
+ * Local vs HDFS paths - to view help about HDFS corresponding paths, type:
+      $ hodor hdfs:topic corresponding_paths
+
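For instance, the following two commands are equivalent in effect (illustrative; the first runs locally, the second is roughly what gets executed on the remote master over ssh):

    $ hodor hdfs:fs -ls /big_data
    $ ssh <master> hadoop fs -ls /big_data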
data/topics/master/clusters.yml.txt
@@ -0,0 +1,36 @@
+ The Clusters.yml File
+ ---------------------------------------------------------------------------------------------------
+ The "clusters.yml" file is read by Hodor at startup. It lets you define multiple Hadoop
+ clusters, along with the URL and resource bindings they require, so that your Hodor session can
+ be easily switched between the available Hadoop clusters. Hodor's namespaces and commands use the
+ URL and resource bindings from clusters.yml to select the destination for their various SSH and
+ REST calls. The clusters.yml file is expected in your Hadoop project git repo at the path:
+
+    "<repo_root>/config/clusters.yml"
+
+ The clusters.yml file will typically look something like the following:
+
+    :production:
+      :nameNode: hdfs://hadoop-prod.mycompany.com:8020
+      :jobTracker: hadoop-prod.mycompany.com:8050
+      :oozie_url: http://hadoop-prod.mycompany.com:11000/oozie
+
+    :staging:
+      :nameNode: hdfs://hadoop-stage.mycompany.com:8020
+      :jobTracker: hadoop-stage.mycompany.com:8050
+      :oozie_url: http://hadoop-stage.mycompany.com:11000/oozie
+
+ The section (staging or production) that Hodor uses (i.e. the "target cluster") is selected
+ via the HADOOP_ENV environment variable:
+
+    $ export HADOOP_ENV=production
+
+ To display a list of all key/value pairs that are defined for the current target cluster:
+
+    $ hodor master:config
+
+ To view more information about how the clusters.yml keys and values can be used to
+ parameterize your job properties, type:
+
+    $ hodor oozie:topic compose_job_properties
+
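A sketch of how a file with this shape can be read in Ruby (illustrative only, not Hodor's actual loading code; note the symbol keys):

    require 'yaml'

    # On Ruby 3.1+, YAML.load_file is safe by default, so symbols must be permitted:
    clusters = YAML.load_file('config/clusters.yml', permitted_classes: [Symbol])
    target   = (ENV['HADOOP_ENV'] || 'production').to_sym
    env      = clusters[target]
    puts env[:oozie_url]   # => http://hadoop-prod.mycompany.com:11000/oozie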
data/topics/master/overview.txt
@@ -0,0 +1,17 @@
+ Hodor's Master namespace functions as a local proxy for the master (i.e. "name node") of your
+ target Hadoop cluster. The commands in this namespace either operate directly on the master
+ node or provide information about the cluster it manages. For example, to run a Linux command
+ on the master node:
+
+    $ hodor master:exec ps -ef | grep java
+
+ Part of the value of this namespace is that it lets you operate on the master node, whichever
+ node that may be. For example, regardless of whether you are targeting a staging or production
+ Hadoop cluster (as configured in your "clusters.yml" file), the 'master:exec' command will
+ route appropriately.
+
+ HELP TOPICS:
+ ------------
+ * The clusters.yml file - to view help about the clusters.yml, type:
+      $ hodor master:topic clusters.yml
data/topics/oozie/blocking_coordinators.txt
@@ -0,0 +1,46 @@
+ Blocking Coordinators
+ =====================
+ Oozie's coordinators can block on two things: the clock and data availability.
+ A coordinator that blocks on both the clock and data availability is called
+ a "blocking coordinator" because it will not run until both the time and the
+ input data it is waiting on arrive. The run.properties file allows you to
+ choose either a blocking coordinator or a non-blocking coordinator to pair
+ with your driver workflow. To specify a blocking coordinator, you set
+ the following property value:
+
+    oozie.coord.application.path=${PWD}/coordinator-1-input.xml
+
+ This property value specifies that you want to use a blocking coordinator
+ that waits on 1 data input. There is also a similarly named 2-input
+ coordinator if you require that. It is not sufficient to specify that your
+ coordinator should block on 1 input; you must also specify which input that
+ is. To do that, you use the "input_1" variable. For example:
+
+    input_1=${service_data}
+
+ The "input_1" variable is expanded by the 1-input coordinator to specify
+ which input dataset the coordinator monitors for data availability flags.
+ Input 1 is set to the variable ${service_data}, which is defined in the
+ shared (higher level) job.properties.erb file as follows:
+
+    service_data=${shared}/ready/service_data
+
+ By setting the input_1 and coordinator path variables above in the
+ run.properties file, the rake task will launch the corresponding coordinator
+ from the scenarios directory. The subdirectory within scenarios is chosen
+ according to your scenario variable assignment:
+
+    scenario: hourly/incremental
+
+ With this assignment, the 1-input blocking coordinator located in the
+ hourly/incremental subdirectory of scenarios will be launched. It will monitor
+ the service_data dataset, and when new data arrives, it will run the
+ workflow indicated by your assignment to the "driver" variable:
+
+    driver=periodic/analyze_campaign
+
+ This approach treats the frequency, data dependencies, run contexts and
+ workflow selections as distinct concerns and allows you to mix
+ and match them via run.properties settings as the situation warrants.
+
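Putting the pieces above together, the relevant run.properties lines for this example would read as follows (a sketch; the names and paths are the illustrative values used above, and the scenario assignment shown earlier selects the hourly/incremental subdirectory):

    oozie.coord.application.path=${PWD}/coordinator-1-input.xml
    input_1=${service_data}
    driver=periodic/analyze_campaign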
data/topics/oozie/composing_job_properties.txt
@@ -0,0 +1,68 @@
+ Job.properties Files
+ -----------------------------------------------------------------------------------------------
+ Before you can run a job, you must first specify the properties it should receive. This is
+ done via a job.properties file. Hodor's Oozie namespace enables you to assemble a hierarchy
+ of job.properties.erb files into a single job.properties file suitable for execution of an
+ Oozie job. The following directory structure shows several job.properties.erb files at
+ different locations within the hierarchy:
+
+    <git_repo>
+        job.properties.erb
+        <drivers>
+            job.properties.erb
+            <billing>
+                job.properties.erb
+            <campaigns>
+                job.properties.erb
+            <diagnostic>
+                job.properties.erb
+
+ For example, running a workflow contained in the <campaigns> directory will pull in three
+ job.properties.erb files: first the one located at the root, followed by the one located in
+ <drivers>, followed by the one located in <campaigns>. Each successive job.properties.erb
+ file overrides the properties of its parents located higher in the directory structure.
+
+ A typical job.properties file might look like the following:
+
+    nameNode=<%= env[:nameNode] %>
+    jobTracker=<%= env[:jobTracker] %>
+    oozie_url=<%= env[:oozie_url] %>
+
+ Each line in the above job.properties sample represents the coming together of two
+ distinct property systems, Java's "Velocity" property system and Ruby's "ERB" property
+ system:
+
+    <Hadoop Property> = <Ruby ERB expansion>
+
+ The key on the left (i.e. nameNode) is a Hadoop Java property that is referenced and expanded
+ by workflows and coordinators at runtime on the Hadoop cluster. The value on the right
+ (i.e. env[:nameNode]) is a Ruby ERB expansion, taken from the clusters.yml file, that is
+ replaced _on your local machine_ prior to deploying the workflows and coordinators to HDFS.
+ In other words, Java variables are expanded post-deployment; ERB variables are expanded
+ pre-deployment. ERB expansions within a job.properties file provide a means of passing a
+ value from ERB to Velocity. The "<%= env[:nameNode] %>" part of the above
+ job.properties.erb sample is replaced by the ":nameNode" key value (from clusters.yml)
+ for the currently targeted Hadoop cluster. This variable substitution happens when you
+ run the "oozie:run_job" command, so the target cluster's configuration is referenced
+ at that moment.
+
+ This approach of parameterization the urls and resource bindings that reach a target
50
+ Hadoop cluster allows one a single job.properties file to work for all Hadoop clusters
51
+ (i.e. production, staging, ad-hoc) in your infrastructure. To get a listing of what ERB
52
+ variables are available for expansion within the hadoop environment currently being
53
+ targeted:
54
+
55
+ $ hodor master:config
56
+ INFO|nameNode : hdfs://sandbox.hortonworks.com:8020
57
+ INFO|jobTracker : sandbox.hortonworks.com:8050
58
+ INFO|oozie_url : http://127.0.0.1:11000/oozie
59
+ INFO|ssh_user : root
60
+ INFO|ssh_host : 127.0.0.1
61
+ INFO|ssh_port : 2222
62
+
63
+ Finally, the motivation for breaking a single job.properties file up into muliple segments
64
+ arranged hierarchically is adherance to the "Do not repeat yourself" (DRY) principle.
65
+ By breaking the job.properties.erb file into a hierarchy, we avoid lots of duplication
66
+ that would exist in a flat file system. For example, properties that are shared by all
67
+ jobs are pushed to higher job.properties.erb files. Properties that are specific to a
68
+ particular job, are pushed to lower ones.
data/topics/oozie/display_job.txt
@@ -0,0 +1,52 @@
+ The display_job command allows you to explore and print information about the current
+ job within the Oozie job hierarchy and (optionally) to move between jobs. Display_job
+ is the most feature-rich command in the command set, allowing the user to display
+ information about a running or completed bundle, coordinator, workflow, oozie action,
+ or hadoop job. In addition, for each type of job, you can display different aspects
+ of the job's state: for example, the job's definition, its log file, its config
+ settings, etc. Display_job's info display also shows the children of the current job
+ organized in a table. If the current job is a coordinator, for example, the children
+ will be all the materializations of that coordinator. Some of the options apply to
+ the info display about the current job (e.g. the -v option) and some of the options
+ apply to the table of children (e.g. the -l option). The following display_job commands
+ illustrate the display of different job state aspects and their options:
+
+    $ hodor oozie:display_job info                  # displays overall info about the job
+    $ hodor oozie:display_job -v info               # displays all available information
+    $ hodor oozie:display_job info -m 'data_source' # only show matching child rows
+    $ hodor oozie:display_job info -l 100           # show 100 children of current job
+    $ hodor oozie:display_job info -l 100 -o 100    # show second 100 children
+    $ hodor oozie:display_job info -k               # display only killed children
+    $ hodor oozie:display_job log                   # displays the job's log file
+    $ hodor oozie:display_job log -w fail.log       # writes log output to file
+    $ hodor oozie:display_job definition            # displays the job's definition
+    $ hodor oozie:display_job conf                  # displays the job's property settings
+    $ hodor oozie:display_job conf -m 'rdbms'       # matching properties (keys or values)
+    $ hodor oozie:display_job rest                  # displays current REST request url
+    $ hodor oozie:display_job json                  # raw json output of REST call
+    $ hodor oozie:display_job json -w rest.json     # the -w option works for all aspects
+
+ In addition to displaying information about the current job, display_job can also
+ describe non-current jobs. Just pass in a job id, as follows:
+
+    $ hodor oozie:display_job 3 json                # rest output for child #3
+    $ hodor oozie:display_job 0035642-151002103648730-oozie-oozi-W log
+                                                    # ^^ display log for job ID
+
+ Display_job can function in one of two modes: query mode or change mode. In query mode,
+ display_job just queries for and displays the job's state, but does not change the
+ current job to it. In change mode, display_job still shows the job's state, but
+ also changes the current job, like change_job does. The default mode is change mode,
+ but this behavior can be altered in two ways. First, you can add the -q option to the
+ command line:
+
+    $ hodor oozie:display_job -q 3    # just queries child 3, does not change to it
+
+ Alternatively, you can change display_job's default behavior by modifying the Hodor
+ preference ":display_job_query_mode" in ~/.hodor.yml as follows:
+
+    FILE: ${HOME}/.hodor.yml ----
+    :display_job_query_mode: true
+
+ Hodor reads ~/.hodor.yml on startup, and if the above flag is true, query mode becomes
+ display_job's default behavior.
+
+ Suggested Alias:
+
+    $ alias dj='hodor oozie:display_job'
data/topics/oozie/driver_scenarios.txt
@@ -0,0 +1,42 @@
+ Driver Scenarios
+ ---------------------------------------------------------------------------
+ Within the drivers directory, a subdirectory named "scenarios" is expected
+ by Hodor. The term "scenario" has a special meaning. A scenario is
+ the combination of the "fill_frequency" plus the "fill_type" that a workflow
+ should execute for. For example, some workflows run hourly, some daily. And
+ some workflows run over historic ranges of time (i.e. a "backfill"), and some
+ workflows run for future ranges of time (i.e. "incremental"). So, a driver
+ runs within a "scenario", which is the combination of the two
+ (fill frequency + fill type).
+
+ The scenarios that are typically defined are:
+
+    * hourly/incremental
+    * hourly/backfill
+    * daily/incremental
+    * daily/backfill
+
+ The scenarios directory, under drivers, defines the Oozie coordinators and context
+ code artifacts that implement the scheduling and variable assignment concerns
+ of each scenario. A typical scenario is organized as follows:
+
+    hourly/
+        incremental/
+            context.xml
+            coordinator.xml      # blocks on time only
+            coordinator-1.xml    # blocks on time + 1 data input
+
+ For example, the above "hourly/incremental" driver run scenario defines a
+ coordinator that has an hourly frequency, along with a context.xml workflow that
+ defines the table partitioning scheme to partition down to hourly granularity
+ (i.e. an hourly "fill_frequency"). The "fill_type" part of the scenario indicates whether
+ the workflow is running from current time forward, or for historic date ranges.
+ If running over historic date ranges (i.e. a backfill fill_type), certain optimizations
+ can be made that run many hours at once.
+
+ Each driver cites the run scenario it expects to run within via the "jobs.yml"
+ file. For more information about the jobs.yml file, type:
+
+    $ hodor oozie:topic jobs.yml