RubyGems - imw - Versions diffs - 0.1.0 → 0.1.1 - Mend

imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

data/README.rdoc +194 -31
data/VERSION +1 -1
data/bin/imw +5 -0
data/lib/imw/boot.rb +0 -15
data/lib/imw/dataset/paths.rb +38 -0
data/lib/imw/dataset/task.rb +21 -18
data/lib/imw/dataset/workflow.rb +126 -65
data/lib/imw/dataset.rb +56 -82
data/lib/imw/files/basicfile.rb +3 -3
data/lib/imw/files/compressed_files_and_archives.rb +23 -37
data/lib/imw/files/csv.rb +2 -1
data/lib/imw/files/directory.rb +62 -0
data/lib/imw/files/excel.rb +84 -0
data/lib/imw/files/sgml.rb +4 -23
data/lib/imw/files.rb +62 -47
data/lib/imw/packagers/archiver.rb +19 -1
data/lib/imw/packagers/s3_mover.rb +8 -0
data/lib/imw/parsers/html_parser/matchers.rb +251 -268
data/lib/imw/parsers/html_parser.rb +181 -176
data/lib/imw/parsers.rb +1 -1
data/lib/imw/repository.rb +35 -0
data/lib/imw/runner.rb +114 -0
data/lib/imw/utils/extensions/core.rb +0 -16
data/lib/imw/utils/paths.rb +0 -28
data/lib/imw.rb +21 -32
metadata +11 -19
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
data/lib/imw/dataset/datamapper.rb +0 -66
data/lib/imw/dataset/loaddump.rb +0 -50
data/lib/imw/dataset/old/file_collection.rb +0 -88
data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
data/lib/imw/dataset/scaffold.rb +0 -132
data/lib/imw/dataset/scraped_uri.rb +0 -305
data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
data/lib/imw/dataset/scrub/scrub.rb +0 -147
data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
data/lib/imw/dataset/scrub/slug.rb +0 -101
data/lib/imw/dataset/stats/counter.rb +0 -23
data/lib/imw/dataset/stats.rb +0 -73

data/README.rdoc CHANGED Viewed

@@ -38,7 +38,7 @@ right one to use.  IMW is **not** designed for
 IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
-You'll have to set up Gemcutter
+You'll have to set up Gemcutter if you haven't already
   $ sudo gem install gemcutter
   $ gem tumble
@@ -47,55 +47,218 @@ and then install IMW
   $ sudo gem install imw
-= Using IMW
+= IMW Basics
 The central goal of IMW is to make workflow involved in processing a
 dataset from a raw source to a finished product as simple as possible.
-So consider that there exist two datasets that I want to combine.  The
-first details the historical price of bananas over the past century
-and the second
+To help achieve this goal, IMW creates lots of convenient structures
+and methods.  The following sections provide a tour of these.
-== Working with paths and files
+It is assumed that you've installed IMW and required it in a script
+via
   require 'rubygems'
   require 'imw'
+== Paths
 IMW holds a registry of paths that you can define on the fly or store
 in a configuration file.
-  IMW.add_path :dropbox, "/var/www/public/dropbox"
-  IMW.add_path :raw,     "/mnt/data/raw"
-  IMW.add_path :
+  IMW.add_path(:dropbox, "/var/www/public/dropbox")
+  IMW.path_to(:dropbox)  #=> "/var/www/public/dropbox"
+You can combine paths together dynamically.
-This makes it easeir
+  IMW.add_path(:raw, "/data/raw")
+  IMW.path_to(:raw, "my/dataset") #=> "/data/raw/my/dataset"
+  IMW.add_path(:rejects, :raw, "rejects")
+  IMW.path_to(:rejects) #=> "/data/raw/rejects"
-  IMW.path_to :raw, "one/particular/dataset"
-  #=> "/mnt/data/raw/one/particular/dataset"
+Altering one path will update others
-IMW makes it easy to manipulate compressed files and archives.
+  IMW.add_path(:raw, "/data2/raw")
+  IMW.path_to(:rejects) #=> "/data2/raw/rejects", not "/data/raw/rejects"
+== Files & Directories
-  # Move a collection of files from a public dropbox to a processing directory
+Use IMW.open to open files.  The object returned by IMW.open obeys the
+usual semantics of a File object but it has new methods to manipulate
+and parse the file.
-  raw
+  f1 = IMW.open("/path/to/file")
+  f1.read() # does what you think
-  Dir["/public/*"].each do |path|
-    file = IMW.open(path)
-    case
-    when file.compressed?
-      file.decompress.mv_to_dir "/raw"
-    when file.archive?
-      FileUtils.cd("/raw") do
-        file.extract
-      end
-    else
-      file.mv_to_dir("/raw")
-    end
+  # class methods from File are available
+  f1.size
+  f1.writeable?
+  # use a bang or a 'w' to write
+  writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
+  # as well as methods to manipulate the file on the filesystem
+  f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
+  f1.exist? # true
+  f3 = f1.mv("/yet/another/path") # also try mv_to_dir
+  f1.exist? # false
+IMW also knows about directories
+  d = IMW.open('/tmp')
+  d.directory? # true
+  d['*'] # Dir['/tmp/*']
+  d.mv('/parent/dir')
+== Remote Files
+Many operations defined for files are also defined for arbitrary URIs
+through the <tt>open-uri</tt> library.
+Files can readily be opened, read, and downloaded from the Internet
+  site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
+  site.read() # does what you think
+  site.cp('/some/local/path')
+  site.exist? # will work in many cases
+(writing to remote sources isn't enabled yet).
+== Archives & Compressed Files
+IMW works with a variety of archiving and compression programs (see
+IMW::EXTERNAL_PROGRAMS) to make packaging/unpackaging data easy.
+  bz2   = IMW.open('/path/to/big_file.bz2')
+  zip   = IMW.open('/path/to/archive.zip')
+  targz = IMW.open('/path/to/archive.tar.gz')
+  # IMW recognizes files by extension
+  bz2.archive?      # false
+  bz2.compressed?   # true
+  zip.archive?      # true
+  zip.compressed?   # false
+  targz.archive?    # true
+  targz.compressed? # true
+  # decompress or compress files
+  big_file = bz2.decompress! # skip the ! to preserve the original
+  new_bz2  = big_file.compress!
+  # extract and package archives
+  zip.extract    # files show up in working directory
+  targz.extract  # no need to decompress first
+  new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
+== Data Formats
+IMW encourages you to work with data as Ruby objects as much as
+possible by providing methods to parse common data formats directly
+into Ruby.
+The actual parsing is always handled by a separate library appropriate
+for the data format so it will be fast and, if you're familiar with
+the library, you can use many functions of the library directly on the
+object returned by IMW.open.
+IMW uses classes (defined in IMW::Files) to interface with each data
+type.  The choice of class is determined by the extension of the path
+supplied to IMW.open.
+  IMW.open('file.csv')  #=> IMW::Files::Csv
+  IMW.open('file.xml')  #=> IMW::Files::Xml
+  IMW.open('file.html') #=> IMW::Files::Html
+  # default choice will be a text file
+  IMW.open('strange_filename.wuzz') #=> IMW::Files::Text
+  # but you can force a particular choice
+  IMW.open('strange_filename.wuzz', :as => :csv)  #=> IMW::Files::Csv
+Some formats are extremely regular (CSV's, JSON, YAML, &c.) and can
+immediately be converted to simple Ruby objects.  Other formats (flat
+files, HTML, XML, &c.) require parsing before they can be
+unambiguously converted to Ruby objects.
+As an example, consider flat, delimited files.  They are extremely
+regular and IMW uses FasterCSV to automatically parse them into nested
+arrays, the only sensible and unambiguous Ruby representation of their
+data:
+  delimit1 = IMW.open('/path/to/csv') # IMW::Files::Csv
+  delimit1.entries #=> array of arrays of entries
+  delimit1.each do |row|
+    # passes in parsed rows
+    ...
   end
+  # if there's a funny delimiter, it can be passed as an option (in
+  # this case identical to what would be passed to FasterCSV under the
+  # hood)
+  delimit2 = IMW.open('/path/to/file.csv', :col_sep => " ")
+HTML files, on the other hand, are more complex and typically have to
+be parsed before being converted to plain Ruby objects:
+  # Grab a tiny link from the bottom of Google's homepage
+  doc = IMW.open('http://www.google.com') # IMW::Files::Html
+  doc.parse('p a') # 'Privacy'
+More complex parsers can also be built
+  # Grab each row from an HTML table
+  doc = IMW.open('/path/to/data.html')
+  doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
+  #=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
+see IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
+files.  Examine the other parsers in IMW::Parsers for details on
+parsing other data formats.
+= The IMW Workflow
+The workflow of IMW can be roughly summarized as follows:
+rip::
+  Data is obtained from a source.  IMW allows you to download data
+  from the web, obtain it by querying databases, or use other services
+  like rsync, ftp, &c. to pull it in from another computer.
+extract::
+  Ripped data is often compressed or otherwise archived and needs to
+  be extracted.  It may also be sliced in many ways (excluding certain
+  years, say) to reduce the volume to only what is required.
+parse::
+  Data is parsed into Ruby objects and stored.
+munge::
+  All the parsed data is combined, reconciled, and further processed
+  into a final form.
+package::
+  The data is archived and compressed as necessary and moved to an
+  outbox, staging server, S3 bucket, &c.
+Not all datasets
+== Datasets
+== Tasks & Dependencies
+== Directory Structure
+== Records
+= IMW on the Command Line
+== Repositories
+== Running Tasks

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.0
1	+ 0.1.1

data/bin/imw ADDED Viewed

@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+$:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+require 'imw/runner'
+exit IMW::Runner.new(*ARGV).run!

data/lib/imw/boot.rb CHANGED Viewed

@@ -1,18 +1,3 @@
-#
-# h2. lib/imw/boot.rb -- startup functions
-#
-# == About
-#
-# This file contains code necessary to boot the Infinite Monkeywrench
-# at a particular site.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
 module IMW
   module Config

data/lib/imw/dataset/paths.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module IMW
+  class Dataset
+    include IMW::Paths
+    # A dataset keeps track of its own collection of paths just like
+    # IMW itself.  When an IMW::Dataset is instantiated in a script,
+    # that script's directory becomes the dataset's +self+ path and
+    # the default workflow directories (see IMW::Workflow) are created
+    # within this directory.
+    #
+    # You can change a dataset's paths the same way you can change
+    # IMW's paths; calling +add_path+ and +remove_path+ on the
+    # dataset.
+    #
+    # To customize this behavior for all future datasets, create a
+    # subclass of IMW::Dataset and override the +set_paths+ method.
+    def paths
+      @paths
+    end
+    protected
+    # Sets the roots of various paths relative to this dataset.
+    def set_root_paths
+      @paths = {}
+      add_path :script, File.expand_path(eval('__FILE__'))
+      add_path :self,   File.dirname(path_to(:script))
+      IMW::Workflow::DIRS.each do |dir|
+        add_path dir, :self, dir.to_s
+      end
+    end
+    # Overwrite this method to set additional paths for the dataset.
+    def set_paths
+    end
+  end
+end

data/lib/imw/dataset/task.rb CHANGED Viewed

@@ -1,25 +1,10 @@
-#
-# h2. lib/imw/workflow/task.rb --
-#
-# == About
-#
-# This file defines a class <tt>IMW::Task</tt> which subclasses
-# <tt>Rake::Task</tt>.  Tasks defined in IMW should be instances of
-# <tt>IMW::Task</tt>.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
 require 'rake'
 module IMW
-  class Task < Rake::Task
-  end
+  Task             = Class.new(Rake::Task)
+  FileTask         = Class.new(Rake::FileTask)
+  FileCreationTask = Class.new(Rake::FileCreationTask)
   class Dataset
     include Rake::TaskManager
@@ -31,6 +16,24 @@ module IMW
       self.define_task IMW::Task, name, &block
     end
+    # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
+    # +name+.  Dependencies can be declared and a block passed in just
+    # as in Rake.
+    def file name, &block
+      self.define_task IMW::FileTask, name, &block
+    end
+    # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
+    # +name+.  Dependencies can be declared and a block passed in just
+    # as in Rake.
+    def file_create name, &block
+      self.define_task IMW::FileCreationTask, name, &block
+    end
+    # Override this method to define default tasks for a subclass of
+    # IMW::Dataset.
+    def set_tasks
+    end
   end
 end

data/lib/imw/dataset/workflow.rb CHANGED Viewed

@@ -1,81 +1,142 @@
-#
-# lib/imw/workflow.rb -- implements the workflow class
-#
-# == About
-#
-# This file implements the <tt>IMW::Workflow</tt> class which tailors
-# the functionality of Rake for IMW objects.
-#
-# Author::    Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-require 'imw/dataset/scaffold'
 require 'imw/dataset/task'
+require 'ostruct'
 module IMW
-  # The <tt>IMW::Workflow</tt> module is a collection of methods which
-  # define Rake[http://rake.rubyforge.org/] tasks specialized for each
-  # dataset.
+  # IMW encourages you to view a data transformation as a network of
+  # dependencies.  By default, IMW defines five main steps:
+  #
+  # rip::
+  #   Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
+  #
+  # extract::
+  #   Extract data from its ripped form to a form which can be
+  #   parsed.
+  #
+  # parse::
+  #   Parse data into a structured form.
+  #
+  # munge::
+  #   Combine, filter, reconcile, and transform already structured
+  #   data into a desired form.
+  #
+  # package::
+  #   Archive, compress, and deliver data in its final form to some
+  #   location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
+  #
+  # Each step depends upon the one before it.  The steps are blank by
+  # default so there's no need to write code for steps you don't need
+  # to use.
+  #
+  # Each step corresponds to a named directory in IMW::Workflow::DIRS.
   module Workflow
-    # The functions called here define the default tasks associated
-    # with each dataset.
-    def create_default_tasks
-      create_directories_task
-      create_symlinks_task
-      create_initialize_task
-      create_delete_data_task
-      create_destroy_task
-      create_workflow_tasks
-    end
+    # The <tt>Rake::TaskManager</tt> module allows the
+    # <tt>IMW::Dataset</tt> class to leverage the functionality of the
+    # Rake[http://rake.rubyforge.org/] library to manage tasks
+    # associated with the processing of this dataset.
+    include Rake::TaskManager
-    # Sets the default tasks in this workflow.
-    #
-    # The default tasks constitute a set of consecutive actions that
-    # must be taken in order: <tt>:rip</tt>, <tt>parse</tt>,
-    # <tt>munge</tt>, <tt>fix</tt>, and <tt>package</tt>.  Each task
-    # is a <tt>Rake::Task</tt> which depends on the one before it.
+    # Default options passed to <tt>Rake</tt>.  Any class including
+    # the <tt>Rake::TaskManager</tt> module must define a constant by
+    # this name.
+    DEFAULT_OPTIONS = {
+      :dry_run => false,
+      :trace   => false,
+      :verbose => false
+    }
+    # The standard IMW workflow steps.
+    STEPS = [:rip,  :extract, :parse, :munge, :package]
+    # The steps of the IMW workflow each correspond to a directory in
+    # which it is customary that they deposit their files <em>once
+    # they are finished processing</em> (so ripped files wind up in
+    # the +ripd+ directory, packaged files in the +pkgd+ directory,
+    # and so on).
+    DIRS  = [:ripd, :xtrd,    :prsd,  :mungd, :pkgd   ]
+    # Each workflow step can be configured to take default actions,
+    # each action being a proc in the array for the step in this hash.
     #
-    # Each task does nothing by default other than create directories
-    # to hold files for this dataset as it undergoes the workflow.
-    def set_default_tasks
-      define_task(Rake::Task, {:rip => []})
-      define_task(Rake::Task, {:parse => :rip})
-      define_task(Rake::Task, {:munge => :parse})
-      define_task(Rake::Task, {:fix => :munge})
-      define_task(Rake::Task, {:package => :fix})
-      comment_default_tasks
+    # This allows classes which include IMW::Workflow to use class
+    # methods named after each step (+rip+, +parse+, &c.) to directly
+    # define tasks.
+    STEPS_TASKS = returning({}) do |steps_procs|
+      STEPS.each do |step|
+        steps_procs[step] = []
+      end
     end
-    # Set the initial comments for each of the default tasks.
-    def comment_default_tasks
-      self[:rip].comment = "Rip dataset from an origin"
-      self[:parse].comment = "Parse dataset into intermediate form"
-      self[:munge].comment = "Munge dataset's structure into desired form"
-      self[:fix].comment = "Fix and format dataset"
-      self[:package].comment = "Package dataset into a final format"
+    protected
+    def self.included klass
+      STEPS.each do |step|
+        klass.class_eval <<EOF
+def self.#{step}(deps=nil, &block)
+  STEPS_TASKS[:#{step}] << [deps, block]
+end
+EOF
+      end
     end
-    # Creates the task dependency chain <tt>:package => :fix => :munge
-    # => :peel => :rip => :initialize</tt>.
-    def create_workflow_tasks
-      @last_description = "Obtain data from some source."
-      define_task(IMW::Task, :rip     => [:initialize])
-      @last_description = "Extract datafiles from ripped data."
-      define_task(IMW::Task, :peel    => [:rip])
-      @last_description = "Transform records in a dataset."
-      define_task(IMW::Task, :munge   => [:peel])
-      @last_description = "Reconcile records."
-      define_task(IMW::Task, :fix     => [:munge])
-      @last_description = "Package dataset in final form."
-      define_task(IMW::Task, :package => [:fix])
+    def define_workflow_task deps, comment
+      @last_description = comment
+      define_task(IMW::Task, deps)
+      step = deps.respond_to?(:keys) ? deps.keys.first : deps
+      STEPS_TASKS[step].each do |deps, block|
+        self[step].enhance(deps) do
+          self.instance_eval(&block)
+        end
+      end
+    end
+    # Create all the instance variables required by Rake::TaskManager
+    # and define default tasks for this dataset.
+    def initialize_workflow
+      @tasks = Hash.new
+      @rules = Array.new
+      @scope = Array.new
+      @last_description = nil
+      @options = OpenStruct.new(DEFAULT_OPTIONS)
+      define_create_directories_task
+      define_workflow_tasks
+      define_destroy_task
+    end
+    # Creates a task <tt>:create_directories</tt> to create the
+    # directory structure for this dataset.
+    def define_create_directories_task
+      @last_description = "Creates workflow directories for this dataset."
+      define_task(IMW::Task, {:create_directories => []}) do
+        DIRS.each do |dir|
+          FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
+        end
+      end
+    end
+    # Creates a task <tt>:destroy</tt> which removes dataset's
+    # workflow directories.
+    def define_destroy_task
+      @last_description = "Get rid of all traces of this dataset."
+      define_task(IMW::Task, :destroy => [:create_directories]) do
+        DIRS.each do |dir|
+          FileUtils.rm_rf(path_to(dir))
+        end
+      end
+    end
+    # Creates the task dependency chain <tt>:package => :munge =>
+    # :parse => :extract => :rip => :create_directories</tt> of the
+    # IMW::Workflow.
+    def define_workflow_tasks
+      define_workflow_task({:rip     => [:create_directories]}, "Obtain data from some source."           )
+      define_workflow_task({:extract => [:rip]},                "Extract data so it's ready to parse."    )
+      define_workflow_task({:parse   => [:extract]},            "Parse data into a structured form."      )
+      define_workflow_task({:munge   => [:parse]},              "Munge structured data into desired form.")
+      define_workflow_task({:package => [:munge]},              "Package dataset in final form."          )
     end
   end
 end
-# puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree.  Ahhhh."