hodor 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/Guardfile +11 -0
  10. data/README.md +105 -0
  11. data/Rakefile +105 -0
  12. data/bin/hodor +18 -0
  13. data/hodor.gemspec +47 -0
  14. data/lib/config/log4r_config.xml +35 -0
  15. data/lib/hodor.rb +83 -0
  16. data/lib/hodor/api/hdfs.rb +222 -0
  17. data/lib/hodor/api/oozie.rb +215 -0
  18. data/lib/hodor/api/oozie/action.rb +52 -0
  19. data/lib/hodor/api/oozie/bundle.rb +27 -0
  20. data/lib/hodor/api/oozie/coordinator.rb +53 -0
  21. data/lib/hodor/api/oozie/hadoop_job.rb +29 -0
  22. data/lib/hodor/api/oozie/job.rb +192 -0
  23. data/lib/hodor/api/oozie/materialization.rb +56 -0
  24. data/lib/hodor/api/oozie/query.rb +115 -0
  25. data/lib/hodor/api/oozie/session.rb +170 -0
  26. data/lib/hodor/api/oozie/workflow.rb +58 -0
  27. data/lib/hodor/cli.rb +146 -0
  28. data/lib/hodor/command.rb +164 -0
  29. data/lib/hodor/configuration.rb +80 -0
  30. data/lib/hodor/environment.rb +437 -0
  31. data/lib/hodor/ui/table.rb +130 -0
  32. data/lib/hodor/version.rb +3 -0
  33. data/lib/tasks/hdfs.thor +138 -0
  34. data/lib/tasks/master.thor +61 -0
  35. data/lib/tasks/oozie.thor +399 -0
  36. data/lib/tasks/sandbox.thor +87 -0
  37. data/spec/integration/api/oozie/action_spec.rb +69 -0
  38. data/spec/integration/api/oozie/bundle_spec.rb +33 -0
  39. data/spec/integration/api/oozie/coordinator_spec.rb +66 -0
  40. data/spec/integration/api/oozie/hadoop_job_spec.rb +29 -0
  41. data/spec/integration/api/oozie/job_spec.rb +15 -0
  42. data/spec/integration/api/oozie/materialization_spec.rb +66 -0
  43. data/spec/integration/api/oozie/query_spec.rb +43 -0
  44. data/spec/integration/api/oozie/session_spec.rb +18 -0
  45. data/spec/integration/api/oozie/workflow_spec.rb +65 -0
  46. data/spec/integration/api/oozie_spec.rb +198 -0
  47. data/spec/integration/fixtures/api/running_coordinators/req_resp_00.memo +6 -0
  48. data/spec/integration/fixtures/api/sample_action/req_resp_00.memo +5 -0
  49. data/spec/integration/fixtures/api/sample_action/req_resp_01.memo +7 -0
  50. data/spec/integration/fixtures/api/sample_bundle/req_resp_00.memo +6 -0
  51. data/spec/integration/fixtures/api/sample_coordinator/req_resp_00.memo +5 -0
  52. data/spec/integration/fixtures/api/sample_materialization/req_resp_00.memo +5 -0
  53. data/spec/integration/fixtures/api/sample_materialization/req_resp_01.memo +7 -0
  54. data/spec/integration/fixtures/api/sample_workflow/req_resp_00.memo +5 -0
  55. data/spec/spec_helper.rb +92 -0
  56. data/spec/support/d_v_r.rb +125 -0
  57. data/spec/support/hodor_api.rb +15 -0
  58. data/spec/unit/hodor/api/hdfs_spec.rb +63 -0
  59. data/spec/unit/hodor/api/oozie_spec.rb +32 -0
  60. data/spec/unit/hodor/environment_spec.rb +52 -0
  61. data/topics/hdfs/corresponding_paths.txt +31 -0
  62. data/topics/hdfs/overview.txt +10 -0
  63. data/topics/master/clusters.yml.txt +36 -0
  64. data/topics/master/overview.txt +17 -0
  65. data/topics/oozie/blocking_coordinators.txt +46 -0
  66. data/topics/oozie/composing_job_properties.txt +68 -0
  67. data/topics/oozie/display_job.txt +52 -0
  68. data/topics/oozie/driver_scenarios.txt +42 -0
  69. data/topics/oozie/inspecting_jobs.txt +59 -0
  70. data/topics/oozie/jobs.yml.txt +185 -0
  71. data/topics/oozie/overview.txt +43 -0
  72. data/topics/oozie/workers_and_drivers.txt +40 -0
  73. metadata +455 -0
data/spec/support/hodor_api.rb
@@ -0,0 +1,15 @@
+ require 'hodor/api/oozie/session'
+
+ shared_context "hodor api" do
+
+   attr_reader :memo
+
+   subject(:env) { ::Hodor::Environment.instance }
+   subject(:session) { ::Hodor::Oozie::Session.instance }
+   subject(:oozie) { ::Hodor::Oozie }
+
+   before(:each) do
+     @memo = DVR.new(self) unless (self.methods & [:scenario, :playback, :record]).empty?
+   end
+
+ end
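The shared context above attaches a record/playback "memo" (the DVR defined in spec/support/d_v_r.rb) to any example group that defines a :scenario, :playback, or :record helper. A minimal usage sketch, assuming spec_helper loads the support files (the group and helper value here are hypothetical):

      require 'spec_helper'

      describe "a recorded oozie exchange" do
        include_context "hodor api"

        # Defining any of :scenario, :playback, or :record opts this group
        # into DVR record/playback (the helper value is hypothetical):
        def scenario; 'sample_coordinator'; end

        it "has a DVR memo wired up" do
          expect(memo).to be_a(DVR)
        end
      end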
data/spec/unit/hodor/api/hdfs_spec.rb
@@ -0,0 +1,63 @@
+ require 'hodor/api/hdfs'
+
+ module Hodor
+
+   describe Hdfs do
+
+     describe "Required Public Interface" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:hdfs_methods) { Hodor::Hdfs.instance_methods }
+
+       # Public methods
+       it { should include :pwd }
+       it { should include :path_on_hdfs }
+
+     end
+
+     context "test local to hdfs path operations" do
+
+       before(:each) do
+         use_settings hdfs_root: "/", hdfs_user: "hdfs"
+         use_pwd "company/workers/noop", false
+       end
+
+       context "ensure pwd maps correctly between file systems" do
+
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should correctly map test repo path to HDFS path" do
+           expect(hdfs.pwd).to match(/\/company\/workers\/noop/)
+         end
+       end
+
+       context "test putting file to HDFS" do
+
+         subject(:env) { Hodor::Environment.instance }
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should successfully construct ssh commandline to put file to HDFS" do
+           expect(File).to receive(:exists?).twice { true }
+           expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
+           hdfs.put_file("workflow.xml")
+         end
+       end
+
+       context "test putting directory to HDFS" do
+
+         subject(:env) { Hodor::Environment.instance }
+         subject(:hdfs) { Hodor::Hdfs.instance }
+
+         it "should successfully construct ssh commandline to put directory to HDFS" do
+           expect(File).to receive(:exists?).twice { true }
+           expect(env).to receive(:run_local).with(/cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/, anything)
+           hdfs.put_file("workflow.xml")
+         end
+       end
+
+
+     end
+
+   end
+
+ end
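For reference, the regex in the put expectations above corresponds to an ssh pipeline of roughly the following shape. This is a sketch reconstructed from the expectation itself, not the gem's actual command builder; the host name and quoting are assumptions:

      # Hypothetical reconstruction of the command run_local receives
      def put_file_command(file, hdfs_user, hdfs_dir)
        "cat #{file} | ssh hadoop-master \"HADOOP_USER_NAME=#{hdfs_user} hadoop fs -put - #{hdfs_dir}/#{file}\""
      end

      put_file_command('workflow.xml', 'hdfs', '/company/workers/noop')
      # matches /cat workflow.xml.*=hdfs\s.*-put - \/company\/workers\/noop\/workflow.xml/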
data/spec/unit/hodor/api/oozie_spec.rb
@@ -0,0 +1,32 @@
+ module Hodor
+   describe Oozie do
+     describe 'Required Public Interface' do
+       subject { Hodor::Oozie }
+
+       # Public methods
+       it { should respond_to :job_by_id }
+       it { should respond_to :job_by_path }
+       it { should respond_to :change_job }
+       it { should respond_to :compose_job_file }
+       it { should respond_to :run_job }
+     end
+     context 'Filename prefixes' do
+       let(:prefix) { 'Test_prefix_' }
+       let(:full_path) { 'foo/foo/foo' }
+       let(:just_name_path) { 'foo' }
+       let(:correctly_prefixed) { 'foo/foo/Test_prefix_foo' }
+
+       it 'appends a supplied prefix to the file name' do
+         expect(subject.append_prefix_to_filename(full_path, prefix)).to eq(correctly_prefixed)
+       end
+
+       it 'appends a supplied prefix to a simple file name' do
+         expect(subject.append_prefix_to_filename(just_name_path, prefix)).to eq(prefix + just_name_path)
+       end
+
+       it 'keeps original filename if no prefix supplied' do
+         expect(subject.append_prefix_to_filename(full_path)).to eq(full_path)
+       end
+     end
+   end
+ end
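The three prefix examples pin down append_prefix_to_filename well enough to sketch a conforming implementation (illustrative only; the gem's actual code may differ):

      def append_prefix_to_filename(path, prefix = nil)
        return path unless prefix
        dir, base = File.split(path)
        # File.split('foo') yields ['.', 'foo'], so bare names get no directory part
        dir == '.' ? "#{prefix}#{base}" : File.join(dir, "#{prefix}#{base}")
      end

      append_prefix_to_filename('foo/foo/foo', 'Test_prefix_')  # => "foo/foo/Test_prefix_foo"
      append_prefix_to_filename('foo', 'Test_prefix_')          # => "Test_prefix_foo"
      append_prefix_to_filename('foo/foo/foo')                  # => "foo/foo/foo"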
data/spec/unit/hodor/environment_spec.rb
@@ -0,0 +1,52 @@
+ require 'hodor/environment'
+
+ module Hodor
+
+   describe Environment do
+
+     describe "Required Public Interface" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:hadoop_env) { Hodor::Environment.instance_methods }
+
+       # Public fields
+       it { should include :logger }
+
+       # Public methods
+       it { should include :erb_sub }
+       it { should include :erb_load }
+       it { should include :yml_load }
+       it { should include :root }
+     end
+
+     describe "Ensure usable test repo" do
+
+       # .instance instead of .new necessitated by singleton:
+       subject(:env) { Hodor::Environment.instance }
+
+       it "should have correct root" do
+         expect(subject.root).to match(/spec\/test_repo/)
+       end
+     end
+
+     context "Test basic environment methods" do
+
+       subject(:env) { Hodor::Environment.instance }
+
+       before(:each) do
+         use_settings hdfs_root: "/", hdfs_user: "hdfs"
+         use_pwd "drivers/testbench"
+       end
+
+       it "should enumerate the paths from the repo root down to the pwd" do
+         expect(
+           env.paths_from_root(Dir.pwd)
+         ).to match_array(
+           [/spec\/test_repo/,
+            /spec\/test_repo\/drivers/,
+            /spec\/test_repo\/drivers\/testbench/]
+         )
+       end
+     end
+   end
+ end
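The match_array expectation implies that paths_from_root returns every directory from the repo root down to the working directory, one entry per level. A conforming sketch (behavior inferred from the spec, not taken from the gem's source):

      def paths_from_root(pwd, root)
        rel = pwd.sub(/^#{Regexp.escape(root)}\/?/, '')
        parts = rel.empty? ? [] : rel.split('/')
        # one path per level, starting at the root itself
        (0..parts.size).map { |i| File.join(root, *parts[0, i]) }
      end

      paths_from_root('/work/spec/test_repo/drivers/testbench', '/work/spec/test_repo')
      # => ["/work/spec/test_repo",
      #     "/work/spec/test_repo/drivers",
      #     "/work/spec/test_repo/drivers/testbench"]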
data/topics/hdfs/corresponding_paths.txt
@@ -0,0 +1,31 @@
+ HDFS Corresponding Paths
+ --------------------------------------------------------------------------------------------
+ There are two methods for running Hdfs commands using this namespace. First, you can run the
+ "hdfs:fs" pass-through command, as follows:
+
+      $ hodor hdfs:fs -ls /big_data/pipeline/drivers/testbench
+
+ Because "fs" is a pass-through command, it passes all of its arguments and options through
+ as-is to the "hadoop fs" command line tool on the remote host. The "fs" command does no
+ argument processing locally; it simply passes them via ssh for handling by the remote tool.
+ Alternatively, you could accomplish the same directory listing as the above command using
+ the 'hdfs:ls' command:
+
+      $ cd /big_data/pipeline/drivers/testbench
+      $ hodor hdfs:ls
+
+ The "ls" command uses your current local path to calculate the "corresponding path" on the
+ remote HDFS volume, and lists the contents of that directory. The "corresponding path"
+ on the remote HDFS volume is automatically determined by the Hdfs namespace and used by
+ each of its commands.
+
+ This HDFS path inference from your local path is done by all commands in the Hdfs namespace,
+ except for the "fs" pass-through. For example, to upload a file in your current local directory
+ to the corresponding path on the remote HDFS volume:
+
+      $ cd /big_data/pipeline/drivers/
+      $ hodor hdfs:put ingestion.xml
+
+ Note: the corresponding path is determined by first calculating your local path relative to
+ the root of your git repo, known as the "repo relative path". Next, the repo relative
+ path is appended to the HDFS root that you configure in your 'clusters.yml' file.
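In code terms, the note above amounts to the following mapping (a minimal sketch of the rule as stated; not Hodor's actual implementation):

      # repo_root comes from your git checkout; hdfs_root from clusters.yml
      def corresponding_path(local_pwd, repo_root, hdfs_root)
        repo_relative = local_pwd.sub(/^#{Regexp.escape(repo_root)}/, '')
        File.join(hdfs_root, repo_relative)
      end

      corresponding_path('/home/me/repo/drivers/testbench', '/home/me/repo', '/big_data/pipeline')
      # => "/big_data/pipeline/drivers/testbench"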
data/topics/hdfs/overview.txt
@@ -0,0 +1,10 @@
+ Hodor's Hdfs namespace assembles a command set that operates on a remote HDFS volume. Instead
+ of ssh'ing to the remote Hadoop master and running 'hadoop fs <command>' directly on the
+ remote host, you run commands from Hodor's Hdfs namespace locally.
+
+ HELP TOPICS:
+ ------------
+ * Local vs HDFS paths - to view help about HDFS corresponding paths, type:
+      $ hodor hdfs:topic corresponding_paths
+
+ \x5
data/topics/master/clusters.yml.txt
@@ -0,0 +1,36 @@
+ The Clusters.yml File
+ ---------------------------------------------------------------------------------------------------
+ The "clusters.yml" file is a file Hodor reads at startup that allows you to define multiple Hadoop
+ clusters, and the URL and resource bindings they require, so that your Hodor session can be easily
+ switched between multiple available Hadoop clusters. Hodor's namespaces and commands use the URL
+ and resource bindings from clusters.yml to select the destination for their various SSH and REST
+ calls. The clusters.yml file is expected in your hadoop project git repo at the path:
+
+      "<repo_root>/config/clusters.yml"
+
+ The clusters.yml file will typically look something like the following:
+
+      :production:
+        :nameNode: hdfs://hadoop-prod.mycompany.com:8020
+        :jobTracker: hadoop-prod.mycompany.com:8050
+        :oozie_url: http://hadoop-prod.mycompany.com:11000/oozie
+
+      :staging:
+        :nameNode: hdfs://hadoop-stage.mycompany.com:8020
+        :jobTracker: hadoop-stage.mycompany.com:8050
+        :oozie_url: http://hadoop-stage.mycompany.com:11000/oozie
+
+ The section (staging or production) that Hodor uses (i.e. the "target cluster") is configured using:
+
+      $ export HADOOP_ENV=production
+
+ To display a list of all key/value pairs that are defined for the current target cluster:
+
+      $ hodor master:config
+
+ To view more information about how the clusters.yml keys and values can be used to
+ parameterize your job properties, type:
+
+      $ hodor oozie:topic compose_job_properties
+
+
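Because the keys in clusters.yml are written with leading colons, Ruby's YAML loader surfaces them as symbols. A sketch of how the target section could be resolved from HADOOP_ENV (illustrative; not the gem's actual loading code, and the 'staging' fallback is an assumption):

      require 'yaml'

      path     = 'config/clusters.yml'
      # Ruby >= 3.1 needs unsafe_load_file (or permitted_classes: [Symbol]) for symbol keys
      clusters = YAML.respond_to?(:unsafe_load_file) ? YAML.unsafe_load_file(path) : YAML.load_file(path)
      target   = (ENV['HADOOP_ENV'] || 'staging').to_sym
      cluster  = clusters.fetch(target)
      cluster[:oozie_url]  # => "http://hadoop-prod.mycompany.com:11000/oozie" when HADOOP_ENV=production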
data/topics/master/overview.txt
@@ -0,0 +1,17 @@
+
+ Hodor's Master namespace functions as a local proxy for the master (i.e. "name node") of your target
+ Hadoop cluster. The commands in this namespace either operate directly on the master node or
+ provide information about the cluster that the master node manages. For example, to run a Linux command
+ on the master node:
+ \x5   $ hodor master:exec ps -ef | grep java
+
+ Part of the value of this namespace is that it allows you to operate on the master node, whatever that
+ may be. For example, regardless of whether you are targeting a staging or production Hadoop cluster
+ (as configured in your "clusters.yml" file), the 'master:exec' command will route appropriately.
+
+ HELP TOPICS:
+ ------------
+ * The clusters.yml file - to view help about clusters.yml, type:
+      $ hodor master:topic clusters.yml
+
+ \x5
data/topics/oozie/blocking_coordinators.txt
@@ -0,0 +1,46 @@
+ Blocking Coordinators
+ =====================
+ Oozie's coordinators can block on two things: the clock and data availability.
+ A coordinator that blocks on both the clock and data availability is called
+ a "blocking coordinator" because it will not run until both the time and the
+ input data it is waiting on arrive. The run.properties file allows you to
+ choose either a blocking coordinator or a non-blocking coordinator to pair
+ with your driver workflow. To specify a blocking coordinator, you will set
+ the following property values:
+
+      oozie.coord.application.path=${PWD}/coordinator-1-input.xml
+
+ This property value setting specifies that you want to use a blocking
+ coordinator that has 1 data input that it waits on. There is also a
+ similarly named 2-input coordinator if you require that. It is not sufficient
+ to specify that your coordinator should block on 1 input; you must also
+ specify which input that is. To do that, you'll use the "input_1" variable.
+ For example:
+
+      input_1=${service_data}
+
+ The "input_1" variable is expanded by the 1-input coordinator to specify
+ which input dataset the coordinator monitors for data availability flags.
+ Input 1 is set to the variable ${service_data}, which is defined in the
+ shared (higher level) job.properties.erb file as follows:
+
+      service_data=${shared}/ready/service_data
+
+ By setting the input_1 and coordinator path variables above in the
+ run.properties file, the rake task will launch the corresponding coordinator
+ from the scenarios directory. The subdirectory within scenarios is chosen
+ according to your scenario variable assignment:
+
+      scenario: hourly/incremental
+
+ With this assignment, the 1-input blocking coordinator located in the
+ hourly/incremental subdirectory of scenarios will be launched. It will monitor
+ the service_data dataset, and when new data arrives, it will run the
+ workflow indicated by your assignment to the "driver" variable:
+
+      driver=periodic/analyze_campaign
+
+ This approach treats the frequency, data dependencies, run contexts and
+ workflow selections as distinct concerns and allows you to mix
+ and match them via run.properties settings as the situation warrants.
+
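Taken together, the assignments discussed in this topic consolidate into a run.properties along the following lines (an illustrative sketch; whether the scenario assignment uses key=value syntax here, rather than the yml-style shown above, is an assumption):

      # choose the 1-input blocking coordinator from the scenario directory
      oozie.coord.application.path=${PWD}/coordinator-1-input.xml
      # the dataset whose availability it blocks on
      input_1=${service_data}
      # which scenario subdirectory to use, and which driver workflow to run
      scenario=hourly/incremental
      driver=periodic/analyze_campaign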
data/topics/oozie/composing_job_properties.txt
@@ -0,0 +1,68 @@
+ Job.properties Files
+ -----------------------------------------------------------------------------------------------
+ Before you can run a job, you must first specify the properties it should receive. This is
+ done via a job.properties file. Hodor's Oozie namespace enables you to assemble a hierarchy
+ of job.properties.erb files into a single job.properties file suitable for execution of an
+ Oozie job. The following directory structure shows several job.properties.erb files at
+ different locations within the hierarchy:
+
+      <git_repo>
+         job.properties.erb
+         <drivers/>
+            job.properties.erb
+            <billing>
+               job.properties.erb
+            <campaigns>
+               job.properties.erb
+            <diagnostic>
+               job.properties.erb
+
+ For example, running a workflow contained in the <campaigns> directory will pull in three
+ job.properties.erb files: first the one located at the root, followed by the one located in
+ <drivers>, followed by the one located in <campaigns>. Each successive job.properties.erb
+ file overrides the properties of its parents located higher in the directory structure.
+
+ A typical job.properties file might look like the following:
+
+      nameNode=<%= env[:nameNode] %>
+      jobTracker=<%= env[:jobTracker] %>
+      oozie_url=<%= env[:oozie_url] %>
+
+ Each line in the above job.properties file sample represents the coming together of two
+ distinct property systems, Java's "Velocity" property system and Ruby's "ERB" property
+ system:
+
+      <Hadoop Property> = <Ruby ERB expansion>
+
+ The key on the left (i.e. nameNode) is a Hadoop Java property that is referenced and expanded
+ by workflows and coordinators at runtime on the Hadoop cluster. The value on the right
+ (i.e. env[:nameNode]) is a Ruby ERB expansion, taken from the clusters.yml file, that is
+ replaced _on your local machine_ prior to deploying the workflows and coordinators to HDFS.
+ In other words, Java variables are expanded post-deployment. ERB variables are expanded
+ pre-deployment. ERB expansions within a job.properties file provide a means of passing a
+ value from ERB to Velocity. The "<%= env[:nameNode] %>" part of the above
+ job.properties.erb sample is replaced by the ":nameNode" key value (from clusters.yml)
+ for the currently targeted Hadoop cluster. This variable substitution happens when you
+ run the "oozie:run_job" command, so the target cluster's configuration is referenced
+ at that moment.
+
+ This approach of parameterizing the URLs and resource bindings that reach a target
+ Hadoop cluster allows a single job.properties file to work for all Hadoop clusters
+ (i.e. production, staging, ad-hoc) in your infrastructure. To get a listing of what ERB
+ variables are available for expansion within the hadoop environment currently being
+ targeted:
+
+      $ hodor master:config
+        INFO|nameNode : hdfs://sandbox.hortonworks.com:8020
+        INFO|jobTracker : sandbox.hortonworks.com:8050
+        INFO|oozie_url : http://127.0.0.1:11000/oozie
+        INFO|ssh_user : root
+        INFO|ssh_host : 127.0.0.1
+        INFO|ssh_port : 2222
+
+ Finally, the motivation for breaking a single job.properties file up into multiple segments
+ arranged hierarchically is adherence to the "Do not repeat yourself" (DRY) principle.
+ By breaking the job.properties.erb file into a hierarchy, we avoid lots of duplication
+ that would exist in a flat file system. For example, properties that are shared by all
+ jobs are pushed to higher job.properties.erb files. Properties that are specific to a
+ particular job are pushed to lower ones.
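Two mechanisms described above lend themselves to short sketches. First, the hierarchical override behaves like a last-writer-wins merge down the directory chain (property names here are hypothetical):

      layers = [
        { 'queue' => 'default', 'retention_days' => '30' },  # <git_repo>/job.properties.erb
        { 'queue' => 'drivers' },                            # <drivers>/job.properties.erb
        { 'retention_days' => '7' }                          # <campaigns>/job.properties.erb
      ]
      effective = layers.reduce({}) { |merged, layer| merged.merge(layer) }
      # => {"queue"=>"drivers", "retention_days"=>"7"}

Second, the pre-deployment pass is ordinary ERB evaluation against the clusters.yml bindings (a sketch; Hodor's own expansion presumably runs through the erb_sub/erb_load methods listed in the Environment spec above):

      require 'erb'

      env = { nameNode: 'hdfs://sandbox.hortonworks.com:8020',
              jobTracker: 'sandbox.hortonworks.com:8050' }
      template = "nameNode=<%= env[:nameNode] %>\njobTracker=<%= env[:jobTracker] %>\n"
      puts ERB.new(template).result(binding)
      # nameNode=hdfs://sandbox.hortonworks.com:8020
      # jobTracker=sandbox.hortonworks.com:8050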
data/topics/oozie/display_job.txt
@@ -0,0 +1,52 @@
+ The display_job command allows you to explore and print information about the current
+ job within the Oozie job hierarchy and (optionally) to move between jobs. Display_job
+ is the most feature-rich command in the command set, allowing the user to display
+ information about a running or completed bundle, coordinator, workflow, oozie action,
+ or hadoop job. In addition, for each type of job, you can display different aspects
+ of the job's state. For example, the job's definition, its log file, its config
+ settings, etc. Display_job's info display also shows the children of the current job
+ organized in a table. If the current job is a coordinator, the children will be all
+ the materializations of that coordinator, for example. Some of the options apply to
+ the info display about the current job (i.e. -v option) and some of the options
+ apply to the table of children (i.e. -l option). The following display_job commands
+ illustrate the display of different job state aspects and their options:
+ \x5   $ hodor oozie:display_job info                   # displays overall info about the job
+   $ hodor oozie:display_job -v info                # displays all available information
+   $ hodor oozie:display_job info -m 'data_source'  # only show matching child rows
+   $ hodor oozie:display_job info -l 100            # show 100 children of current job
+   $ hodor oozie:display_job info -l 100 -o 100     # show second 100 children
+   $ hodor oozie:display_job info -k                # display only killed children
+   $ hodor oozie:display_job log                    # displays the job's log file
+   $ hodor oozie:display_job log -w fail.log        # writes log output to file
+   $ hodor oozie:display_job definition             # displays the job's definition
+   $ hodor oozie:display_job conf                   # displays the job's property settings
+   $ hodor oozie:display_job conf -m 'rdbms'        # matching properties (keys or values)
+   $ hodor oozie:display_job rest                   # displays current REST request url
+   $ hodor oozie:display_job json                   # raw json output of REST call
+   $ hodor oozie:display_job json -w rest.json      # the -w option works for all aspects
+
+ In addition to displaying information about the current job, display_job can also
+ describe non-current jobs. Just pass in a job id, as follows:
+ \x5   $ hodor oozie:display_job 3 json                 # rest output for child #3
+   $ hodor oozie:display_job 0035642-151002103648730-oozie-oozi-W log
+                                                    # ^^ display log for job ID
+
+ Display_job can function in one of two modes: query mode or change mode. In query mode,
+ display_job just queries for and displays the job's state, but does not change the
+ current_job to it. In change mode, display_job still shows the job's state, but
+ also changes the current_job, like change_job does. The default mode is change mode,
+ but this behavior can be altered in two ways. First, you can add the -q option to the
+ command line:
+ \x5   $ hodor oozie:display_job -q 3                   # Just queries child 3, does not change to it
+
+ Alternatively, you can change display_job's default behavior by modifying the Hodor
+ preference ":display_job_query_mode" in ~/.hodor.yml as follows:
+ \x5   FILE: ${HOME}/.hodor.yml ----
+   :display_job_query_mode: true
+
+ Hodor reads ~/.hodor.yml on startup, and if the above flag is true, query mode becomes
+ display_job's default behavior.
+
+ Suggested Alias:
+ \x5   $ alias dj='hodor oozie:display_job'
+
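A sketch of how the startup preference read could work (illustrative; not the gem's actual code, though the key matches the one documented above):

      require 'yaml'

      prefs_path = File.expand_path('~/.hodor.yml')
      prefs = File.exist?(prefs_path) ? YAML.load_file(prefs_path) : {}
      # when true, display_job behaves as if -q were always passed
      query_mode = prefs.fetch(:display_job_query_mode, false)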
data/topics/oozie/driver_scenarios.txt
@@ -0,0 +1,42 @@
+ Driver Scenarios
+ ---------------------------------------------------------------------------
+ Within the drivers directory, a subdirectory named "scenarios" is expected
+ by Hodor. The term "scenario" has a special meaning. A scenario is
+ the combination of the "fill_frequency" plus the "fill_type" that a workflow
+ should execute for. For example, some workflows run hourly, some daily. And
+ some workflows run over historic ranges of time (i.e. a "backfill"), and some
+ workflows run for future ranges of time (i.e. "incremental"). So, a driver
+ can run with a "scenario", which is the combination of the two:
+
+      fill frequency + fill type
+
+ The scenarios that are typically defined are:
+
+   * hourly/incremental
+   * hourly/backfill
+   * daily/incremental
+   * daily/backfill
+
+ The scenarios directory, under drivers, defines Oozie coordinators and context
+ code artifacts that implement the scheduling and variable assignment concerns necessary
+ to implement the scenario. A typical scenario is organized as follows:
+
+
+      hourly/
+        incremental/
+          context.xml
+          coordinator.xml        # blocks on time only
+          coordinator-1.xml      # blocks on time + 1 data input
+
+ For example, the above "hourly/incremental" driver run scenario defines a
+ coordinator that has an hourly frequency, along with a context.xml workflow that
+ defines the table partitioning scheme to partition down to hourly granularity
+ (i.e. an hourly "fill_frequency"). The "fill_type" part of the scenario indicates whether
+ the workflow is running from current time forward, or for historic date ranges.
+ If running over historic date ranges (i.e. backfill fill_type), certain optimizations
+ can be made that run many hours at once.
+
+ Each driver cites the run scenario it expects to run within, via the "jobs.yml"
+ file. For more information about the jobs.yml file, type:
+
+      hodor oozie:topic jobs.yml
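Combining the "scenario" and "driver" assignments from run.properties, the coordinator a run launches resolves to a path along the following lines (a sketch; the exact layout of the scenarios directory is assumed from this topic):

      scenario    = 'hourly/incremental'
      coordinator = 'coordinator-1.xml'   # or coordinator.xml when nothing blocks on data
      File.join('drivers', 'scenarios', scenario, coordinator)
      # => "drivers/scenarios/hourly/incremental/coordinator-1.xml"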