RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

data/lib/imw/parsers/html_parser/matchers.rb ADDED

@@ -0,0 +1,306 @@
+#
+# h2. lib/imw/parsers/html_parser/matchers.rb -- utility classes for html parser
+#
+# == About
+#
+# This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
+# abstract class and some concrete subclasses which perform specific
+# kinds of matches against HTML documents using the
+# Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'imw/utils/extensions/hpricot'
+module IMW
+  module HTMLParserMatcher
+    # An abstract class from which to subclass specific HTML matchers.
+    #
+    # A subclass is initialized with a +selector+ and an optional
+    # +matcher+.  The +selector+ is an HTML path specification used to
+    # collect elements from the document.  If initialized with a
+    # +matcher+, the +matcher+ is used to return match information
+    # from the elements; else the inner HTML is returned.  Subclasses
+    # decide how the +selector+ will collect elements.
+    class Matcher
+      attr_accessor :selector
+      attr_accessor :matcher
+      attr_accessor :options
+      def initialize selector, matcher=nil, options={}
+        self.selector = selector
+        self.matcher  = matcher
+        self.options  = options
+      end
+      def match doc
+        raise "Abstract class #{self.class}"
+      end
+    end
+    # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
+    # for matching against the first element of a document matching a
+    # selector.
+    class MatchFirstElement < Matcher
+      # Grab the first element from +doc+ matching the +selector+ this
+      # class was initialized with.  If initialized with a +matcher+,
+      # then return the +matcher+'s match against the first element,
+      # else just return the inner HTML of the first element.
+      #
+      #   m = MatchFirstElement.new('span#bio/a.homepage')
+      #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
+      #   # => 'My Homepage'
+      def match doc
+        doc = Hpricot(doc) if doc.is_a?(String)
+        el = doc.at(selector) or return nil
+        if matcher
+          matcher.match(el)
+        else
+          options[:html] ? el.inner_html : el.inner_text.strip
+        end
+      end
+    end
+    # FIXME is there really a need for this separate class?  why can't
+    # MatchFirstElement.match accept a block?
+    class MatchProc < MatchFirstElement
+      attr_accessor :proc
+      attr_accessor :options
+      def initialize selector, proc, matcher=nil, options={}
+        super selector, matcher
+        self.options = options
+        self.proc = proc
+      end
+      def match doc
+        val = super doc
+        val ? self.proc.call(val) : self.proc.call(doc)
+      end
+    end
+    # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
+    # for matching each element of a document matching a selector.
+    class MatchArray < Matcher
+      # Grab each element from +doc+ matching the +selector+ this
+      # class was initialized with.  If initialized with a +matcher+,
+      # then return an array consisting of the +matcher+'s match
+      # against each element, else just return an array consisting of
+      # the inner HTML of each element.
+      #
+      #   m = MatchArray.new('span#bio/a.homepage')
+      #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
+      #            <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
+      #            <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
+      #   # => ["My Homepage", "Your Homepage", "Their Homepage"]
+      def match doc
+        doc = Hpricot(doc) if doc.is_a?(String)
+        subdoc = (doc/selector) or return nil
+        if matcher
+          subdoc.map{|el| matcher.match(el)}
+        else
+          if options[:html]
+            subdoc.map{|el| el.inner_html }
+          else
+            subdoc.map{|el| el.inner_text.strip }
+          end
+        end
+      end
+    end
+    # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
+    # for matching an attribute of the first element of a document
+    # matching a selector.
+    class MatchAttribute < Matcher
+      attr_accessor :attribute
+      # Unlike <tt>IMW::HTMLParserMatcher::Matcher</tt>,
+      # <tt>IMW::HTMLParserMatcher::MatchAttribute</tt> is initialized
+      # with three arguments: the +selector+ which collects elements
+      # from an HTML document, an +attribute+ to extract, and
+      # (optionally) a +matcher+ to perform the matching.
+      def initialize selector, attribute, matcher=nil
+        super selector, matcher
+        self.attribute = attribute.to_s
+      end
+      # Grab the first element from +doc+ matching the +selector+ this
+      # class was initialized with.  If initialized with a +matcher+,
+      # then return the +matcher+'s match against the value of the
+      # +attribute+ this class was initialized with, else just return
+      # the value of the +attribute+.
+      #
+      #   m = MatchAttribute.new('span#bio/a.homepage', 'href')
+      #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
+      #   # => 'http://foo.bar'
+      def match doc
+        doc = Hpricot(doc) if doc.is_a?(String)
+        val = doc.path_attr(selector, attribute)
+        matcher ? matcher.match(val) : val
+      end
+    end
+    # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
+    # for using a regular expression to match against text in an HTML
+    # document.
+    class MatchRegexp < Matcher
+      attr_accessor :re
+      attr_accessor :options
+      # Use the regular expression +re+ to return captures from the
+      # elements collected by +selector+ (treated as text) used on an
+      # HTML document (if +selector+ is +nil+ then match against the
+      # full text of the document).  If the keyword argument
+      # <tt>:capture</tt> is specified then return the corresponding
+      # group (indexing is that of regular expressions; "1" is the
+      # first capture), else return an array of all captures.  If
+      # +matcher+, then use it on the capture(s) before returning.
+      #
+      # FIXME Shouldn't the matcher come BEFORE the regexp capture,
+      # not after?
+      def initialize selector, re, matcher=nil, options={}
+        super selector, matcher
+        self.options = options
+        self.re = re
+      end
+      # Grab the first element from +doc+ matching the +selector+ this
+      # object was initialized with.  Use the +re+ and the (optional)
+      # capture group this object was initialized with to capture a
+      # string (or array of strings if no capture group was specified)
+      # from the collected element (treated as text). If initialized
+      # with a +matcher+, then return the +matcher+'s match against
+      # the value of the capture(s), else just return the capture(s).
+      #
+      #   m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
+      #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
+      #   # => "John Chimpo"
+      def match doc
+        doc = Hpricot(doc) if doc.is_a?(String)
+        el = selector ? doc.contents_of(selector) : doc
+        m = re.match(el.to_s)
+        val = case
+              when m.nil? then nil
+              when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
+              else m.captures
+              end
+        # pass to matcher, if any
+        matcher ? matcher.match(val) : val
+      end
+    end
+    class MatchRegexpRepeatedly < Matcher
+      attr_accessor :re
+      def initialize selector, re, matcher=nil
+        super selector, matcher
+        self.re = re
+      end
+      def match doc
+        doc = Hpricot(doc) if doc.is_a?(String)
+        # apply selector, if any
+        el = selector ? doc.contents_of(selector) : doc
+        return unless el
+        # get all matches
+        val = el.to_s.scan(re)
+        # if there's only one capture group, flatten the array
+        val = val.flatten if val.first && val.first.length == 1
+        # pass to matcher, if any
+        matcher ? matcher.match(val) : val
+      end
+    end
+    # Class for building a hash of values by using appropriate
+    # matchers against an HTML document.
+    class MatchHash
+      attr_accessor :match_hash
+      # The +match_hash+ must be a +Hash+ of symbols matched to HTML
+      # matchers (subclasses of
+      # <tt>IMW::HTMLParserMatcher::Matcher</tt>).
+      def initialize match_hash
+        # Kludge? maybe.
+        raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
+        self.match_hash = match_hash
+      end
+      # Use the +match_hash+ this +MatchHash+ was initialized with to
+      # select elements from +doc+ and extract information from them:
+      #
+      #   m = MatchHash.new({
+      #       :name         => MatchFirstElement.new('li/span.customer'),
+      #       :order_status => MatchAttribute.new('li/ul[@status]','status'),
+      #       :products     => MatchArray.new('li/ul/li')
+      #     })
+      #   m.match('<li><span class="customer">John Chimpo</span>
+      #                <ul status="shipped">
+      #                  <li>bananas</li>
+      #                  <li>mangos</li>
+      #                  <li>banangos</li>
+      #                </ul></li>')
+      #   # => {
+      #         :name         => "John Chimpo",
+      #         :order_status => "shipped",
+      #         :products     => ["bananas", "mangos", "banangos"]
+      #        }
+      def match doc
+        doc = Hpricot(doc) if doc.is_a?(String)
+        hsh = { }
+        match_hash.each do |attr, m|
+          val = m.match(doc)
+          case attr
+          when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
+          else            hsh[attr] = val  end
+        end
+        self.class.scrub!(hsh)
+      end
+      # kill off keys with nil values
+      def self.scrub! hsh
+        hsh # .reject{|k,v| v.nil? }
+      end
+    end
+    #
+    # construct the downstream part of a hash matcher
+    #
+    def self.build_match_hash spec_hash
+      hsh = { }
+      spec_hash.each do |attr, spec|
+        hsh[attr] = build_parse_tree(spec)
+      end
+      hsh
+    end
+    #
+    # recursively build a tree of matchers
+    #
+    def self.build_parse_tree spec
+      case spec
+      when nil            then nil
+      when Matcher        then spec
+      when Hash           then MatchHash.new(build_match_hash(spec))
+      when Array          then
+        return nil if spec.empty?
+        raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
+        MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
+      when String         then MatchFirstElement.new(spec)
+      when Proc           then MatchProc.new(nil, spec)
+      when Regexp         then MatchRegexp.new(nil, spec, nil, :capture => 1)
+      else raise "Don't know how to parse #{spec.inspect}"
+      end
+    end
+  end
+end

data/lib/imw/parsers/line_parser.rb ADDED

@@ -0,0 +1,87 @@
+module IMW
+  module Parsers
+    # This is an abstract class for a line-oriented parser intended to
+    # read and emit lines sequentially from a file.
+    #
+    # To leverage the functionality of this class, subclass it and
+    # define a +parse_line+ method.
+    class LineParser
+      # The number of lines to skip on each file parsed.
+      attr_accessor :skip_first
+      # The class to parse each line into.  The +new+ method of this
+      # class must accept a hash.
+      attr_accessor :klass
+      # If called with the option <tt>:skip_first</tt> then skip the
+      # corresponding number of lines at the beginning of the file when
+      # parsing.
+      def initialize options={}
+        @skip_first = options[:skip_first] || 0
+        @klass      = options[:of]         || options[:klass]
+      end
+      # Parse the given file.  If the option <tt>:lines</tt> is passed
+      # in then only parse that many lines.  If given a block then
+      # yield the result of each line to the block; else just return
+      # an array of results.
+      #
+      # If this parser has a +klass+ attribute then each parsed line
+      # will first be turned into an instance of that class (the class
+      # must accept a hash of values in its initializer).
+      def parse! file, options={}, &block
+        skip_lines!(file)
+        if options[:lines]
+          case
+          when klass && block_given?
+            options[:lines].times do
+              yield klass.new(parse_line(file.readline))
+            end
+          when block_given?
+            options[:lines].times do
+              yield parse_line(file.readline)
+            end
+          when klass
+            options[:lines].times do
+              klass.new(parse_line(file.readline))
+            end
+          else
+            options[:lines].times.map do
+              parse_line(file.readline)
+            end
+          end
+        else
+          case
+          when klass && block_given?
+            file.each do |line|
+              yield klass.new(parse_line(line))
+            end
+          when block_given?
+            file.each do |line|
+              yield parse_line(line)
+            end
+          when klass
+            file.map do |line|
+              klass.new(parse_line(line))
+            end
+          else
+            file.map do |line|
+              parse_line(line)
+            end
+          end
+        end
+      end
+      def parse_line line
+        raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
+      end
+      protected
+      def skip_lines! file
+        skip_first.times { file.readline }
+      end
+    end
+  end
+end

data/lib/imw/parsers/regexp_parser.rb ADDED

@@ -0,0 +1,72 @@
+require 'imw/parsers/line_parser'
+module IMW
+  module Parsers
+    # A RegexpParser is a line-oriented parser which uses a regular
+    # expression to extract data from a line into either a hash or an
+    # object obeying hash semantics.
+    #
+    # As an example, a flat file with one record per line in the
+    # following format (this is a simplified version of common
+    # webserver log formats)
+    #
+    #   151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
+    #   81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
+    #   81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
+    #   ...
+    #
+    # could be parsed as follows
+    #
+    #   file   = File.new '/path/to/file.log'
+    #   parser = IMW::Parsers::RegexpParser.new :by_regexp   => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
+    #                                           :into_fields => [:ip, :timestamp, :verb, :url, :version]
+    #   parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
+    #
+    # Consecutive captures from the regular expression will be pushed
+    # into a hash with keys given by the +into_fields+ property of
+    # this parser.
+    #
+    # If the parser is instantiated with the <tt>:of</tt> keyword then
+    # the parsed hash from each line is used to instantiate a new
+    # object of the corresponding class:
+    #
+    #   require 'ostruct'
+    #
+    #   PageView = Class.new(OpenStruct)
+    #
+    #   parser = IMW::Parsers::RegexpParser.new :by_regexp   => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
+    #                                           :into_fields => [:ip, :timestamp, :verb, :url, :version],
+    #                                           :of          => PageView
+    #
+    #   parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
+    #
+    # The option <tt>:strictly</tt> can also be set to force the
+    # parser to raise an error if it finds a line which doesn't match
+    # its regexp.
+    class RegexpParser < LineParser
+      attr_accessor :regexp, :fields, :strict
+      def initialize options={}
+        @regexp = options[:regexp] || options[:by_regexp]
+        @fields = options[:fields] || options[:into_fields]
+        @strict = options[:strict] || options[:strictly]
+        super options
+      end
+      def parse_line line
+        match_data = regexp.match(line.chomp)
+        returning({}) do |hsh|
+          if match_data
+            match_data.captures.each_with_index do |capture, index|
+              hsh[fields[index]] = capture
+            end
+          else
+            raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
+          end
+        end
+      end
+    end
+  end
+end