RubyGems - imw - Versions diffs - 0.1.0 → 0.1.1 - Mend

imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

data/README.rdoc +194 -31
data/VERSION +1 -1
data/bin/imw +5 -0
data/lib/imw/boot.rb +0 -15
data/lib/imw/dataset/paths.rb +38 -0
data/lib/imw/dataset/task.rb +21 -18
data/lib/imw/dataset/workflow.rb +126 -65
data/lib/imw/dataset.rb +56 -82
data/lib/imw/files/basicfile.rb +3 -3
data/lib/imw/files/compressed_files_and_archives.rb +23 -37
data/lib/imw/files/csv.rb +2 -1
data/lib/imw/files/directory.rb +62 -0
data/lib/imw/files/excel.rb +84 -0
data/lib/imw/files/sgml.rb +4 -23
data/lib/imw/files.rb +62 -47
data/lib/imw/packagers/archiver.rb +19 -1
data/lib/imw/packagers/s3_mover.rb +8 -0
data/lib/imw/parsers/html_parser/matchers.rb +251 -268
data/lib/imw/parsers/html_parser.rb +181 -176
data/lib/imw/parsers.rb +1 -1
data/lib/imw/repository.rb +35 -0
data/lib/imw/runner.rb +114 -0
data/lib/imw/utils/extensions/core.rb +0 -16
data/lib/imw/utils/paths.rb +0 -28
data/lib/imw.rb +21 -32
metadata +11 -19
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
data/lib/imw/dataset/datamapper.rb +0 -66
data/lib/imw/dataset/loaddump.rb +0 -50
data/lib/imw/dataset/old/file_collection.rb +0 -88
data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
data/lib/imw/dataset/scaffold.rb +0 -132
data/lib/imw/dataset/scraped_uri.rb +0 -305
data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
data/lib/imw/dataset/scrub/scrub.rb +0 -147
data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
data/lib/imw/dataset/scrub/slug.rb +0 -101
data/lib/imw/dataset/stats/counter.rb +0 -23
data/lib/imw/dataset/stats.rb +0 -73

data/lib/imw/parsers/html_parser.rb CHANGED Viewed

@@ -191,192 +191,197 @@
 require 'imw/parsers/html_parser/matchers'
-class IMW::HTMLParser
+module IMW
+  module Parsers
+    class HtmlParser
-  include IMW::HTMLParserMatcher
+      include IMW::Parsers::HtmlMatchers
-  attr_accessor :parse_tree
+      attr_accessor :parse_tree
-  #
-  # Parse Tree
-  #
-  def initialize arg_spec=nil
-    spec = arg_spec || self.class.parser_spec
-    self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
-  end
+      #
+      # Parse Tree
+      #
+      def initialize arg_spec=nil
+        spec = arg_spec || self.class.parser_spec
+        self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
+      end
-  #
-  # See IMW::HTMLParser for syntax
-  #
-  #
-  def self.parser_spec
-    raise "Override this to create your own parser spec"
-  end
+      #
+      # See IMW::HtmlParser for syntax
+      #
+      #
+      def self.parser_spec
+        raise "Override this to create your own parser spec"
+      end
-  #
-  # Walk
-  #
-  def parse doc
-    self.parse_tree.match(doc)
-  end
+      #
+      # Walk
+      #
+      def parse doc
+        self.parse_tree.match(doc)
+      end
-  # one("hpricot_path")                 first match to hpricot_path
-  # one("hpricot_path", /spec/)         applies spec to first match to hpricot_path
-  #
-  def self.one selector, matcher
-    MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
-  end
-  # match the +attr+ attribute of the first element given by +selector+
-  def self.attr selector, attr, matcher=nil
-    MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
-  end
-  # shorthand for +attr(foo, 'href')+
-  def self.href selector, matcher=nil
-    self.attr(selector, 'href', matcher)
-  end
-  # shorthand for +attr(foo, 'src')+
-  def self.src selector, matcher=nil
-    self.attr(selector, 'src', matcher)
-  end
+      # one("hpricot_path")                 first match to hpricot_path
+      # one("hpricot_path", /spec/)         applies spec to first match to hpricot_path
+      #
+      def self.one selector, matcher
+        MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
+      end
+      # match the +attr+ attribute of the first element given by +selector+
+      def self.attr selector, attr, matcher=nil
+        MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
+      end
+      # shorthand for +attr(foo, 'href')+
+      def self.href selector, matcher=nil
+        self.attr(selector, 'href', matcher)
+      end
+      # shorthand for +attr(foo, 'src')+
+      def self.src selector, matcher=nil
+        self.attr(selector, 'src', matcher)
+      end
-  def self.proc selector, proc, matcher=nil
-    MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
-  end
+      def self.proc selector, proc, matcher=nil
+        MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
+      end
-  # strip ","s (!! thus disrespecting locale !!!)
-  # and convert to int
-  def self.to_num selector, matcher=nil
-    proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
-  end
-  def self.to_json selector, matcher=nil
-    proc selector, lambda{|v| v.to_json if v }, matcher
-  end
+      # strip ","s (!! thus disrespecting locale !!!)
+      # and convert to int
+      def self.to_num selector, matcher=nil
+        proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
+      end
+      def self.to_json selector, matcher=nil
+        proc selector, lambda{|v| v.to_json if v }, matcher
+      end
-  def self.strip selector, matcher=nil
-    proc selector, lambda{|v| v.strip }, matcher
-  end
+      def self.strip selector, matcher=nil
+        proc selector, lambda{|v| v.strip }, matcher
+      end
-  def self.re_group selector, re
-    MatchRegexp.new(selector, re)
-  end
-  def self.re selector, re
-    MatchRegexp.new(selector, re, nil, :capture => 1)
-  end
-  def self.re_all selector, re, matcher=nil
-    MatchRegexpRepeatedly.new(selector, re)
-  end
+      def self.re_group selector, re
+        MatchRegexp.new(selector, re)
+      end
+      def self.re selector, re
+        MatchRegexp.new(selector, re, nil, :capture => 1)
+      end
+      def self.re_all selector, re, matcher=nil
+        MatchRegexpRepeatedly.new(selector, re)
+      end
-  # def self.plain_text selector, matcher=nil
-  #   proc selector, lambda{|el| el.inner_text if el }, matcher
-  # end
+      # def self.plain_text selector, matcher=nil
+      #   proc selector, lambda{|el| el.inner_text if el }, matcher
+      # end
-  # attr_accessor :mapping
-  #
-  # #
-  # # Feed me a hash and I'll semantify HTML
-  # #
-  # # The hash should magically adhere to the too-complicated,
-  # # ever evolving goatrope that works for the below
-  # #
-  # #
-  # def initialize mapping
-  #   self.mapping = mapping
-  # end
-  #
-  # #
-  # # take a document subtree,
-  # # and a mapping of hpricot paths to that subtree's data mapping
-  # # recursively extract that datamapping
-  # #
-  # def extract_tree  hdoc, content, sub_mapping
-  #   data = { }
-  #   sub_mapping.each do |selector, target|
-  #     data[selector] = []
-  #     sub_contents = content/selector
-  #     sub_contents.each do |sub_content|
-  #       sub_data = {}
-  #       extract_node hdoc, sub_content, sub_data, selector, target
-  #       data[selector] << sub_data
-  #     end
-  #   end
-  #   data
-  #   # end
-  #   #   if selector.is_a?(String)
-  #   #     conts = (content)
-  #   #   else
-  #   #     conts = [content]
-  #   #   end
-  #   #   conts[0..0].each do |content|
-  #   #     extract_node hdoc, content, data, selector, target
-  #   #   end
-  #   # end
-  #   data
-  # end
-  #
-  # #
-  # # insert the extracted element into the data mapping
-  # #
-  # def extract_node hdoc, content, data, selector, target
-  #   classification = classify_node(selector, target)
-  #   result = \
-  #   case classification
-  #   when :subtree
-  #     target.each do |sub_selector, sub_target|
-  #       extract_node hdoc, content, data, sub_selector, sub_target
-  #     end
-  #
-  #   when :sub_attribute
-  #     k, v = selector.to_a[0]
-  #     subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
-  #     val  = subcontent.attributes[v.to_s] if subcontent
-  #     data[target] = val unless val.blank?
-  #
-  #   when :attribute then
-  #     val = content.attributes[selector.to_s]
-  #     data[target] = val unless val.blank?
-  #
-  #   when :flatten_list
-  #     subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
-  #     data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
-  #
-  #   when :inner_html
-  #     subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
-  #     data[target] = subcontent.inner_html.strip if subcontent
-  #
-  #   else
-  #     raise "classify_node shouldn't ever return #{classification}"
-  #   end
-  #   # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
-  #   # puts '' if classification == :subtree
-  # end
-  #
-  # def classify_node selector, target
-  #   case
-  #   when target.is_a?(Hash)                             then :subtree
-  #   when selector.is_a?(Hash) && (selector.length == 1) then
-  #     k, v = selector.to_a[0]
-  #     case v
-  #     when Symbol then :sub_attribute
-  #     end
-  #   when selector.is_a?(Symbol)                         then :attribute
-  #   when selector.is_a?(String) && target.is_a?(Array)  then :flatten_list
-  #   when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
-  #   else
-  #     raise "Can't classify mapping: " + [selector, target].join(" - ")
-  #   end
-  # end
-  #
-  # # use #mapping to parse file
-  # def parse link
-  #   begin       hdoc = Hpricot(link.contents)
-  #   rescue;     warn "can't hpricot #{link.to_s}" ; return false;  end
-  #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
-  # end
-  #
-  # # use #mapping to parse file
-  # def parse_file filename
-  #   begin       hdoc = Hpricot(File.open(filename))
-  #   rescue;     warn "can't hpricot #{filename}" ; return false;  end
-  #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
-  # end
+      # attr_accessor :mapping
+      #
+      # #
+      # # Feed me a hash and I'll semantify HTML
+      # #
+      # # The hash should magically adhere to the too-complicated,
+      # # ever evolving goatrope that works for the below
+      # #
+      # #
+      # def initialize mapping
+      #   self.mapping = mapping
+      # end
+      #
+      # #
+      # # take a document subtree,
+      # # and a mapping of hpricot paths to that subtree's data mapping
+      # # recursively extract that datamapping
+      # #
+      # def extract_tree  hdoc, content, sub_mapping
+      #   data = { }
+      #   sub_mapping.each do |selector, target|
+      #     data[selector] = []
+      #     sub_contents = content/selector
+      #     sub_contents.each do |sub_content|
+      #       sub_data = {}
+      #       extract_node hdoc, sub_content, sub_data, selector, target
+      #       data[selector] << sub_data
+      #     end
+      #   end
+      #   data
+      #   # end
+      #   #   if selector.is_a?(String)
+      #   #     conts = (content)
+      #   #   else
+      #   #     conts = [content]
+      #   #   end
+      #   #   conts[0..0].each do |content|
+      #   #     extract_node hdoc, content, data, selector, target
+      #   #   end
+      #   # end
+      #   data
+      # end
+      #
+      # #
+      # # insert the extracted element into the data mapping
+      # #
+      # def extract_node hdoc, content, data, selector, target
+      #   classification = classify_node(selector, target)
+      #   result = \
+      #   case classification
+      #   when :subtree
+      #     target.each do |sub_selector, sub_target|
+      #       extract_node hdoc, content, data, sub_selector, sub_target
+      #     end
+      #
+      #   when :sub_attribute
+      #     k, v = selector.to_a[0]
+      #     subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
+      #     val  = subcontent.attributes[v.to_s] if subcontent
+      #     data[target] = val unless val.blank?
+      #
+      #   when :attribute then
+      #     val = content.attributes[selector.to_s]
+      #     data[target] = val unless val.blank?
+      #
+      #   when :flatten_list
+      #     subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
+      #     data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
+      #
+      #   when :inner_html
+      #     subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
+      #     data[target] = subcontent.inner_html.strip if subcontent
+      #
+      #   else
+      #     raise "classify_node shouldn't ever return #{classification}"
+      #   end
+      #   # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
+      #   # puts '' if classification == :subtree
+      # end
+      #
+      # def classify_node selector, target
+      #   case
+      #   when target.is_a?(Hash)                             then :subtree
+      #   when selector.is_a?(Hash) && (selector.length == 1) then
+      #     k, v = selector.to_a[0]
+      #     case v
+      #     when Symbol then :sub_attribute
+      #     end
+      #   when selector.is_a?(Symbol)                         then :attribute
+      #   when selector.is_a?(String) && target.is_a?(Array)  then :flatten_list
+      #   when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
+      #   else
+      #     raise "Can't classify mapping: " + [selector, target].join(" - ")
+      #   end
+      # end
+      #
+      # # use #mapping to parse file
+      # def parse link
+      #   begin       hdoc = Hpricot(link.contents)
+      #   rescue;     warn "can't hpricot #{link.to_s}" ; return false;  end
+      #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
+      # end
+      #
+      # # use #mapping to parse file
+      # def parse_file filename
+      #   begin       hdoc = Hpricot(File.open(filename))
+      #   rescue;     warn "can't hpricot #{filename}" ; return false;  end
+      #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
+      # end
+    end
+  end
 end

data/lib/imw/parsers.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module IMW
   module Parsers
-    autoload :HTML,         'imw/parsers/html_parser'
+    autoload :HtmlParser,   'imw/parsers/html_parser'
     autoload :LineParser,   'imw/parsers/line_parser'
     autoload :RegexpParser, 'imw/parsers/regexp_parser'
   end

data/lib/imw/repository.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'imw/utils'
+module IMW
+  # A Repository is a collection of datasets.
+  class Repository < Hash
+    # FIXME This should read some configuration settings somewhere and
+    # generate a pool specific to each IMW user.
+    def self.default
+      new
+    end
+  end
+  # The default repository managed by IMW.
+  REPOSITORY = Repository.default
+  # Add a dataset to the IMW::REPOSITORY.  If the dataset has a
+  # +handle+ then it will be used as the key in this repository;
+  # otherwise the dataset's class will be used.
+  def self.add dataset
+    REPOSITORY[dataset.handle] = dataset
+  end
+  # Remove a dataset from the IMW::REPOSITORY.  Can pass in either a
+  # string handle or an instance of the dataset.
+  def self.delete handle
+    handle = handle.handle if handle.respond_to?(:handle)
+    REPOSITORY.delete(handle)
+  end
+end

data/lib/imw/runner.rb ADDED Viewed

@@ -0,0 +1,114 @@
+require 'imw'
+require 'optparse'
+module IMW
+  RunnerError = Class.new(IMW::Error)
+  class Runner
+    DEFAULT_OPTIONS = {
+      :requires  => [],
+      :selectors => [],
+      :dry_run   => false
+    }
+    attr_reader :args, :options
+    def initialize *args
+      @args    = args
+      @options = DEFAULT_OPTIONS.dup
+      parser.parse!(args)       # will trim options from args
+    end
+    def parser
+      OptionParser.new do |opts|
+        opts.banner = "usage: imw [OPTIONS] TASK"
+        opts.separator <<EOF
+  Run TASK for all datasets in the repository.  IMW will read any
+  *.imw files in the current directory by default.
+  Options include
+EOF
+        opts.on('-l', '--list', "List datasets in repository") do
+          options[:list] = true
+        end
+        opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR.  Can be given more than once.") do |selector|
+          options[:selectors] << selector
+        end
+        opts.on('-r', '--require PATH', "Require PATH.  Can be given more than once.") do |path|
+          options[:requires] << path
+        end
+      end
+    end
+    def require_files
+      Dir['*.imw'].each { |path| load File.expand_path(path) }
+      options[:requires].each do |path|
+        IMW.open(path) do |requireable|
+          if requireable.directory?
+            requireable["**/*.rb"].each  { |file| require file }
+            requireable["**/*.imw"].each { |file| load    file }
+          else
+            require requireable.path
+          end
+        end
+      end
+    end
+    def task
+      args.first
+    end
+    def handles
+      matched_handles = Set.new
+      if options[:selectors].blank?
+        matched_handles += IMW::REPOSITORY.keys
+      else
+        keys = IMW::REPOSITORY.keys
+        unless keys.empty?
+          options[:selectors].each do |selector|
+            matched_handles += keys.find_all { |key| key =~ Regexp.new(selector) }
+          end
+        end
+      end
+      matched_handles.to_a.sort
+    end
+    def datasets
+      handles.map { |handle| IMW::REPOSITORY[handle] }
+    end
+    def list!
+      puts handles
+      exit
+    end
+    def run_task!
+      datasets.each do |dataset|
+        dataset[task].invoke
+      end
+      exit
+    end
+    def run!
+      require_files
+      case
+      when options[:list]
+        list!
+      when task.blank?
+        puts parser
+        exit 1
+      else
+        run_task!
+      end
+    end
+  end
+end

data/lib/imw/utils/extensions/core.rb CHANGED Viewed

@@ -1,19 +1,3 @@
-#
-# h2. lib/imw/utils/extensions/core.rb -- extensions to the Ruby core
-#
-# == About
-#
-# Some useful extensions to basic Ruby classes.  This file is required
-# by <tt>imw/utils</tt> so any files required here are automatically
-# required when loading IMW.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Your monkeywrench does a complicated series of core-burning exercises and emerges with ripped, powerful-looking abs."
 require 'imw/utils/extensions/string'
 require 'imw/utils/extensions/array'
 require 'imw/utils/extensions/hash'

data/lib/imw/utils/paths.rb CHANGED Viewed

@@ -1,20 +1,3 @@
-#
-# h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
-#
-# == About
-#
-# IMW uses lots of different directories to keep information on data
-# and datasets separate.  This module interfaces with the
-# configuration files to establish the paths to these IMW directories
-# and provides functions and mixins for IMW objects to use to access
-# these paths.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
 module IMW
   # Implements methods designed to work with an object's
@@ -67,17 +50,6 @@ module IMW
     end
   end
-  class Dataset
-    attr_reader :paths
-    include IMW::Paths
-    private
-    def set_paths
-      @paths = {}
-      add_path :self, File.dirname(eval('__FILE__'))
-    end
-  end
   def self.path_to *pathsegs
     begin
       path = Pathname.new IMW.path_to_helper(*pathsegs)

data/lib/imw.rb CHANGED Viewed

@@ -1,42 +1,31 @@
-#
-# h2. lib/imw.rb -- main imw file
-#
-# == About
-#
-# This file is the entry-point to the IMW library.  It loads a minimal
-# setup.  Optional components can be loaded by calling the function
-# <tt>IMW.imw_components</tt>.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
 require 'rubygems'
-require 'YAML' unless defined?('YAML') # some stupid collision with datamapper makes it double include
 require 'imw/boot'
 require 'imw/utils'
 require 'imw/dataset'
+require 'imw/repository'
 require 'imw/files'
 require 'imw/parsers'
 require 'imw/packagers'
-# The Infinite Monkeywrench (IMW) is a Ruby library for obtaining,
-# parsing, transforming, reconciling, and packaging datasets.
-#
-# Data is obtained via FIXME
-#
-# Data is loaded into IMW using <tt>IMW.open</tt> which provides a
-# uniform interface across a variety of data formats.  The objects
-# returned will each have +load+ method which will return data in the
-# best form for further processing.  If the data is a YAML file, then
-# Ruby's +YAML+ library will be used to return primitive Ruby objects,
-# if it is a CSV, then the +FasterCSV+ library will be used, &c.
-#
-# The main interface to handling data is the <tt>IMW::Dataset</tt>
-# class.  It has methods for summarizing, transforming, and dumping
-# data to a variety of formats.
+# The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
+# extracting, parsing, munging, and packaging datasets.  It allows you
+# to handle different data formats transparently as well as organize
+# transformations of data as a network of dependencies (a la Make or
+# Rake).
+#
+# On first reading of IMW examine the classes within the IMW::Files
+# module, all transparently instantiated when using IMW.open (instead
+# of File.open).  These classes do a lot of work to ensure that all
+# objects returned by IMW.open share methods (write, read, load, dump,
+# parse, compress, extract, &c.) while continuing to use existing
+# implementations of these concepts.
+#
+# Another entrance point is the <tt>IMW::Dataset</tt> class.  It
+# leverages Rake to craft workflows for transforming datasets.  IMW
+# encourages you to organize your data transformations in a step-wise
+# process, managed with dependencies.
+#
+# Utilities to help with one step in particular (ripping, parsing,
+# packaging, &c.) are in their own directories.
 module IMW
 end