RubyGems - hackboxen - Versions diffs - 0.1.0 - Mend

hackboxen 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

data/CHANGELOG.textile +44 -0
data/Gemfile +12 -0
data/Gemfile.lock +34 -0
data/LICENSE.txt +20 -0
data/README.textile +203 -0
data/Rakefile +49 -0
data/VERSION +1 -0
data/bin/describe.rb +101 -0
data/bin/hb-install +5 -0
data/bin/hb-runner +93 -0
data/bin/hb-scaffold +6 -0
data/hackboxen.gemspec +97 -0
data/lib/gemfiles/Gemfile.jruby-1.6.2.default +19 -0
data/lib/gemfiles/Gemfile.ruby-1.8.7.default +20 -0
data/lib/gemfiles/Gemfile.ruby-1.9.2.default +18 -0
data/lib/hackboxen.rb +17 -0
data/lib/hackboxen/tasks.rb +6 -0
data/lib/hackboxen/tasks/endpoint.rb +16 -0
data/lib/hackboxen/tasks/icss.rb +15 -0
data/lib/hackboxen/tasks/init.rb +36 -0
data/lib/hackboxen/tasks/install.rb +46 -0
data/lib/hackboxen/tasks/mini.rb +47 -0
data/lib/hackboxen/tasks/scaffold.rb +71 -0
data/lib/hackboxen/template.rb +36 -0
data/lib/hackboxen/template/Rakefile.erb +31 -0
data/lib/hackboxen/template/config.yaml.erb +10 -0
data/lib/hackboxen/template/endpoint.rb.erb +39 -0
data/lib/hackboxen/template/icss.yaml.erb +125 -0
data/lib/hackboxen/template/main.erb +31 -0
data/lib/hackboxen/utils.rb +49 -0
data/lib/hackboxen/utils/README_ConfigValidator.textile +63 -0
data/lib/hackboxen/utils/config_validator.rb +41 -0
data/lib/hackboxen/utils/logging.rb +39 -0
data/lib/hackboxen/utils/paths.rb +66 -0
data/spec/install_spec.rb +36 -0
metadata +213 -0

data/CHANGELOG.textile ADDED

@@ -0,0 +1,44 @@
+h3. Deprecations/Changes
+* HackBoxen::Paths methods have changed.
+* You no longer need to include the line @HACKBOX_DIR = File.basename(__FILE__)@ at the top of the Rakefile.
+* @Settings@ as used by the Rakefile is now @WorkingConfig@.
+* @working_environment@ is now only available in the JSON flavor and the @env/@ directory has moved to the same level as @fixd/@.
+* Output data and Icss need to end up in @fixd/data/@.
+* @rake scaffold@ is no longer the command to build a hackbox.
+* Config files are no longer read as a directory, and also no longer read from the dataroot as the @config/@ output directory is no longer being created.
+* Much old code was refactored or removed.
+h3. New Functionality
+* Default Hackboxen paths can be accessed by using @path_to(:fixd_dir)@. See HackBoxen::Paths for using/adding others.
+* You may now @require 'hackboxen'@ anywhere and it will recognize if you are in a hackbox directory (or not) and allow you appropriate access to HackBoxen methods.
+* Tasks for moving icss and endpoint code have been added. Include @'hb:icss'@ and @'hb:endpoint'@ in the default Rakefile task if you want to use them.
+* @filesystem_scheme@ now defaults to the local filesystem if not specified in the @config.yaml@
+* A logging helper has been added. Use @include HackBoxen::Logging@ and then @logs_to STDOUT, 'file'@ inside of a class to access an instance variable @@log@ that contains a formatted log4r Logger.
+* A binary executable has been added, @hb-scaffold@ that can be run from anywhere and is designed to replace the rake task.
+h3. Still Needing
+* Make Hackboxen a gem. This will require the separation of the actual hackbox code from the hackboxen library (not done yet) and the creation of a coderoot to connect the hackbox library with the other code (implemented).
+* When Hackboxen is a gem, its version can be added to the requires hash in a hackbox @config.yaml@ so we can keep track of potentially breaking changes to legacy code.
+* Full spec coverage for the hackboxen library.
+* Implementation of @'hb:mini'@ and @ConfigValidator@. Some of this code is written, but it needs to be fleshed out and decided upon.
+* The separation (completely) of a @config.yaml@ from an @icss.yaml@. Would not affect hackbox running much.

data/Gemfile ADDED

@@ -0,0 +1,12 @@
+source :rubygems
+gem 'swineherd', '>=0.0.4'
+gem 'configliere', '0.4.6'
+gem 'rake', '0.8.7'
+group :development do
+  gem "shoulda", ">= 0"
+  gem "bundler", "~> 1.0.0"
+  gem "jeweler", "~> 1.5.2"
+  gem "rcov", ">= 0"
+end

data/Gemfile.lock ADDED

@@ -0,0 +1,34 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    configliere (0.4.6)
+    erubis (2.7.0)
+    git (1.2.5)
+    gorillib (0.1.1)
+    jeweler (1.5.2)
+      bundler (~> 1.0.0)
+      git (>= 1.2.5)
+      rake
+    rake (0.8.7)
+    rcov (0.9.9)
+    right_aws (2.1.0)
+      right_http_connection (>= 1.2.5)
+    right_http_connection (1.3.0)
+    shoulda (2.11.3)
+    swineherd (0.0.4)
+      configliere
+      erubis
+      gorillib
+      right_aws
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.0.0)
+  configliere (= 0.4.6)
+  jeweler (~> 1.5.2)
+  rake (= 0.8.7)
+  rcov
+  shoulda
+  swineherd (>= 0.0.4)

data/LICENSE.txt ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2011 Infochimps
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.textile ADDED

@@ -0,0 +1,203 @@
+h1. Hackboxen
+The Hackboxen library is designed to encapsulate data collecting and processing tasks into simple and easy to implement packages.
+Any singular hackbox has the following two parts:
+* An engine, which contains configuration information and data processing code.
+* An output directory, which will contain the fully processed data along with a descriptive schema. This directory may be either local or remote (e.g. S3/HDFS)
+A hackbox **dataset** is defined by a @namespace@ and a @protocol@. The @namespace@ must be dot(.) separated and both the @namespace@ and @protocol@ may contain only lowercase letters, numbers and underscores.
+h2. Hackbox Engine
+A hackbox engine contains:
+* @Rakefile@: **(required)** Used to read and combine all the sources of config metadata and execute @main@.
+* @Gemfile@: **(optional)** A list of gems necessary for this hackbox to run. Processed automatically by "Bundler":https://github.com/carlhuda/bundler.
+* @config/@: **(required)** A subdirectory containing:
+** @config.yaml@ **(required)** A dataset specific default configuration YAML file.
+** @protocol.icss.yaml@ **(optional)** An "Icss":http://github.com/infochimps/icss schema file describing the output data and publishing targets.
+* @engine/@: **(required)** A subdirectory containing:
+** @main@: **(required)** An executable data processing file. This may be written in any language.
+** **(optional)** Any other executable and support files. There is no restriction on language and complexity.
+The hackbox engine lives in the @coderoot@ directory specified by your configuration settings. An example hackbox engine directory structure:
+<pre><code>coderoot
+└── language
+    └── corpora
+        └── word_freq
+            └── bnc
+                ├── config
+                │   ├── config.yaml
+                │   └── bnc.icss.yaml
+                ├── engine
+                │   ├── main
+                │   └── bnc_endpoint.rb
+                └── Rakefile
+</code></pre>
+h2. Hackbox Output Directory
+The hackbox output directory is where all of the data that a hackbox acquires, reads, or creates lives. The location of the data directory is determined by the @dataroot@ variable specified in your configuration settings. An example hackbox output directory structure:
+<pre><code>dataroot
+└── language
+    └── corpora
+        └── word_freq
+            └── bnc
+                ├── fixd
+                │   ├── code
+                │   │   └── bnc_endpoint.rb
+                │   ├── data
+                │   │   └── bnc_fixd_data.tsv
+                │   └── env
+                │       └── working_environment.json
+                ├── log
+                │   └── bnc_run_0.log
+                ├── rawd
+                │   └── bnc_data_in_process
+                ├── ripd
+                │   └── bnc_download.zip
+                └── tmp
+</code></pre>
+* @log/@:  **(optional)** All logging from a hackbox run goes here.
+* @tmp/@:  **(optional)** If needed, any truly ephemeral output of the workflow should go here.
+* @ripd/@: **(required)** This will contain virginal downloaded source data adhering to the directory structure from which it was pulled.
+* @rawd/@: **(optional)** This will contain all intermediate data processing outputs.
+* @fixd/@: **(required)** See the output interface described below.
+Engine and output directories are generally created dynamically and are not meant to be archival.
+h3. Output Interface (fixd/)
+@fixd/@ is the final output directory and contains the following:
+* @env/@: **(required)** This directory contains a file describing the environment in which the hackbox was run.
+** @working_environment.json@: **(required)** All runtime config metadata used to generate the schema and output data.
+* @code/@: **(optional)** A directory containing the code assets described in the icss.
+* @data/@: **(required)** A directory containing a single dataset or subdirectories named for each dataset. Each contains:
+** @protocol.icss.json@: **(required)** An "Icss":http://github.com/infochimps/icss schema file describing its respective dataset.
+** **(required)** One or more data files that collectively adhere to the schema of this dataset.
+h2. Hackbox Configuration
+Hackbox configuration may be one or more files in YAML format and, optionally, the command line. Configuration will be read in using "Configliere":https://github.com/mrflip/configliere in the following order:
+* @/etc/hackbox/hackbox.yaml@: Machine-wide config.
+* @~/.hackbox/hackbox.yaml@: Install specific config.
+* @config/config.yaml@: Hackbox specific config.
+* @rake task -- --args=@: Command line arguments.
+Later sources on this list overwrite earlier sources. The combined configuration metadata is serialized out as JSON in the @fixd/env@ directory as @working_config.json@. This is done before any other code executes in order for a hackbox to be able to read in this file if necessary.
+h1. Getting Started
+Here are the general guidelines for creating your own hackbox.
+h3. Hackboxen Dependencies
+Clone the Hackboxen repo:
+<pre><code>git clone git@github.com:infochimps/hackboxen.git
+</code></pre>
+Add Hackboxen to your $RUBYLIB:
+<pre><code>export RUBYLIB=$RUBYLIB:/path/to/hackboxen/lib
+</code></pre>
+Install Hackboxen dependencies:
+<pre><code>cd hackboxen
+sudo bundle install
+rake install # optionally: rake install -- --dataroot=/data/hb --coderoot=/code/hb
+</code></pre>
+This will install the following gems: "configliere":http://github.com/mrflip/configliere, "icss":http://github.com/infochimps/icss, "swineherd":http://github.com/ganglion/swineherd, and "rake":http://github.com/jimweirich/rake. This will also create a @.hackbox@ directory with a @hackbox.yaml@ file that contains default values for @coderoot@, @dataroot@, @s3_filesystem@, @os@, and @machine@. The @rake install@ command has optional arguments @--dataroot=@, @--coderoot=@.
+A default @hackbox.yaml@ file:
+<pre><code>---
+coderoot: /code/hb/
+dataroot: /data/hb/
+s3_filesystem:
+  access_key:
+  secret_key:
+  mini_bucket:
+requires:
+  machine: x86_64
+  os: darwin
+</code></pre>
+h3. Creating a Hackbox
+Hackboxen comes with a scaffold task that creates a template hackbox for you. Required arguments are @--namespace=@ and @--protocol=@. Optional arguments are @--targets=@, @--s3access=@, and @--s3secret=@.
+<pre><code>hb-scaffold --namespace=foo.bar --protocol=baz --targets=catalog,mysql
+</code></pre>
+This will create the following directories and files:
+<pre><code>coderoot
+└── foo
+    └── bar
+        └── baz
+            ├── config
+            │   ├── config.yaml
+            │   └── baz.icss.yaml
+            ├── engine
+            │   ├── main
+            │   └── baz_endpoint.rb
+            └── Rakefile
+</code></pre>
+h3. Running a hackbox
+Externally, the execution of a hackbox appears as:
+* A @Rakefile@ is run with @rake@ from the shell with one of the following targets:
+** @get_data@: Performs only the ingest step. The input data (in @ripd@/@rawd@) and any required metadata should exist after this step.
+** @default@: Performs the processing step, @:get_data@, and executes the @main@ file.
+Execution Results:
+* If there is no failure, @rake@ can be silent.
+* If there is a failure, @rake@ ends with a thrown exception
+* After a successful execution, the complete output interface (@fixd@) must exist, with no additional interaction outside of @rake@.
+The rough steps of hackbox internal execution are:
+* The configuration sources (command line and files) are read and combined.
+* The output directory structure (@fixd@) is created.
+* The hackbox engine is run and the "troop ready" output datasets are created in @fixd@.
+* Note: Hackbox execution should be idempotent (when it is sensible and efficient), leveraging this behavior from @rake@.*
+h3. Hackboxen Best Practices
+One should try to avoid redundant computation.  In particular, idempotency of output creation should be observed.  Sometimes incrementally updated information makes this hard, but should be done if not too painful.
+Files read and written by the hackbox should use the @Swineherd::FileSystem@ abstraction. See "swineherd":http://github.com/infochimps/swineherd.
+Implementation of the @Gorillib::Receiver@ pattern is recommended. See "gorillib":http://github.com/infochimps/gorillib.
+Any and all output datasets must include an appropriately descriptive schema. See "icss":http://github.com/infochimps/icss.
+== Contributing to hackboxen
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
+* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
+* Fork the project
+* Start a feature/bugfix branch
+* Commit and push until you are happy with your contribution
+* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
+== Copyright
+Copyright (c) 2011 Infochimps. See LICENSE.txt for
+further details.

data/Rakefile ADDED

@@ -0,0 +1,49 @@
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  gem.name = "hackboxen"
+  gem.homepage = "http://github.com/infochimps/hackboxen"
+  gem.executables = ["hb-install", "hb-scaffold", "hb-runner"]
+  gem.license = "MIT"
+  gem.summary = "A simple framework to assist in standardizing the data-munging input/output process."
+  gem.description = "A simple framework to assist in standardizing the data-munging input/output process."
+  gem.email = "travis@infochimps.com"
+  gem.authors = ["kornypoet", "Ganglion", "bollacker"]
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+require 'rcov/rcovtask'
+Rcov::RcovTask.new do |test|
+  test.libs << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+task :default => :test
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "hackboxen #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

@@ -0,0 +1 @@
+0.1.0

data/bin/describe.rb ADDED

@@ -0,0 +1,101 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'json'
+require 'yaml'
# Interactive helper for inspecting and bulk-editing hackbox +config.yaml+
# files.
#
# Configs are discovered by globbing "../**/config.yaml" relative to the
# current working directory. Each usable config is identified by the name
# "<namespace>.<protocol>" built from its +namespace+ and +protocol+ keys.
# A config that cannot be read, does not parse to a Hash, or lacks either key
# is remembered in @unreadable and otherwise ignored.
class MetaBox
  # Hash of config name => parsed config for the configs currently loaded.
  attr_accessor :cache

  def initialize
    @cache = {}
    @unreadable = []
  end

  # Scans the filesystem and returns a Hash of config name => config path.
  # Every call performs a fresh scan, so callers that need several names
  # should call this once and reuse the result.
  def lookup
    readable = {}
    each_named_config { |name, path, _config| readable[name] = path }
    readable
  end

  # Returns the paths of all candidate config files below the parent of the
  # current working directory.
  def paths_to_cfg
    Dir["../**/config.yaml"]
  end

  # Parses the YAML file at +cfg_path+ and returns the result, or nil when the
  # file cannot be read or parsed (deliberately best-effort: callers treat nil
  # as "unreadable").
  #
  # NOTE(review): YAML.load is used on files found on the local disk; switch
  # to YAML.safe_load if these configs can ever come from an untrusted source.
  def read_config(cfg_path)
    YAML.load(File.read(cfg_path))
  rescue StandardError
    nil
  end

  # Loads the named configs into the cache. Accepts names or (nested) arrays
  # of names; names without a usable config are ignored. Returns the list of
  # cached names.
  def add_to_cache(*args)
    paths = lookup # one scan for all requested names
    args.flatten.each do |name|
      path = paths[name]
      @cache[name] = read_config(path) if path
    end
    list_cache
  end

  # Empties the cache and returns the (now empty) list of cached names.
  def clear_cache
    @cache = {}
    list_cache
  end

  # Returns the names of all configs that can currently be found on disk.
  def list_readable
    lookup.keys
  end

  # Returns the names of all cached configs.
  def list_cache
    @cache.keys
  end

  # Pretty-prints the on-disk config called +name+ as JSON. When no usable
  # config has that name a warning is written to stderr instead of handing
  # nil to the JSON generator. Returns +name+.
  def describe(name)
    path = lookup[name]
    if path
      puts JSON.pretty_generate(read_config(path))
    else
      warn "MetaBox: no readable config named #{name.inspect}"
    end
    name
  end

  # Pretty-prints cached configs as JSON. Without arguments every cached
  # config is printed and the list of cached names is returned; with
  # arguments only the named entries are printed (names that are not cached
  # produce a warning on stderr) and the argument list is returned.
  def describe_cache(*args)
    if args.empty?
      @cache.each_value { |cfg| puts JSON.pretty_generate(cfg) }
      list_cache
    else
      args.each do |name|
        if @cache.key?(name)
          puts JSON.pretty_generate(@cache[name])
        else
          warn "MetaBox: #{name.inspect} is not in the cache"
        end
      end
    end
  end

  # Sets +key+ to +val+ in every cached config (in memory only; use
  # write_cache to persist), then prints the cache. Returns the list of
  # cached names.
  def each_insert(key, val)
    @cache.each_value { |cfg| cfg[key] = val }
    describe_cache
  end

  # Returns the names of all on-disk configs that have a truthy value stored
  # under the top-level key +query+.
  def search(query)
    configs = {}
    # When several files share a name the last one scanned wins, matching the
    # name => path mapping produced by lookup.
    each_named_config { |name, _path, config| configs[name] = config }
    results = []
    configs.each { |name, config| results << name if config[query] }
    results
  end

  # Writes every cached config back to the file it was discovered at, as
  # YAML. Cached entries whose name can no longer be found on disk are
  # skipped with a warning on stderr. Returns the list of cached names.
  def write_cache
    paths = lookup # one scan for the whole cache
    @cache.each do |name, cfg|
      path = paths[name]
      unless path
        warn "MetaBox: no config file found for #{name.inspect}; not written"
        next
      end
      File.open(path, 'w') { |file| file.puts cfg.to_yaml }
    end
    list_cache
  end

  private

  # Yields (name, path, config) for every config file that parses to a Hash
  # carrying both identifying keys; records every other path in @unreadable
  # without adding duplicates across repeated scans.
  def each_named_config
    paths_to_cfg.each do |path|
      config = read_config(path)
      name = config_name(config)
      if name
        yield name, path, config
      else
        @unreadable |= [path]
      end
    end
  end

  # Builds "<namespace>.<protocol>" for a parsed config, or returns nil when
  # +config+ is not a Hash or is missing either key.
  def config_name(config)
    return nil unless config.is_a?(Hash)
    namespace = config['namespace']
    protocol = config['protocol']
    return nil unless namespace && protocol
    "#{namespace}.#{protocol}"
  end
end