RubyGems - imw - Versions diffs - 0.2.18 → 0.3.0 - Mend

imw 0.2.18 → 0.3.0

Files changed (172) hide show

data/Gemfile +7 -26
data/Gemfile.lock +13 -38
data/{LICENSE → LICENSE.txt} +1 -1
data/README.textile +35 -0
data/Rakefile +45 -22
data/VERSION +1 -1
data/examples/foo.rb +19 -0
data/examples/html_selector.rb +22 -0
data/examples/nes_game_list.csv +625 -0
data/examples/nes_gamespot.csv +1371 -0
data/examples/nes_nintendo.csv +624 -0
data/examples/nes_unlicensed.csv +89 -0
data/examples/nes_wikipedia.csv +710 -0
data/examples/nibbler_test.rb +24 -0
data/examples/script.rb +19 -0
data/lib/imw.rb +28 -140
data/lib/imw/error.rb +9 -0
data/lib/imw/recordizer.rb +8 -0
data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
data/lib/imw/resource.rb +3 -119
data/lib/imw/serializer.rb +7 -0
data/lib/imw/serializer/json_serializer.rb +17 -0
data/lib/imw/uri.rb +41 -0
data/spec/resource_spec.rb +78 -0
data/spec/uri_spec.rb +55 -0
metadata +81 -232
data/README.rdoc +0 -371
data/bin/imw +0 -5
data/bin/tsv_to_json.rb +0 -29
data/etc/imwrc.rb +0 -26
data/examples/dataset.rb +0 -12
data/examples/metadata.yml +0 -10
data/lib/imw/archives.rb +0 -120
data/lib/imw/archives/rar.rb +0 -19
data/lib/imw/archives/tar.rb +0 -19
data/lib/imw/archives/tarbz2.rb +0 -73
data/lib/imw/archives/targz.rb +0 -73
data/lib/imw/archives/zip.rb +0 -51
data/lib/imw/boot.rb +0 -87
data/lib/imw/compressed_files.rb +0 -94
data/lib/imw/compressed_files/bz2.rb +0 -16
data/lib/imw/compressed_files/compressible.rb +0 -75
data/lib/imw/compressed_files/gz.rb +0 -16
data/lib/imw/dataset.rb +0 -125
data/lib/imw/dataset/paths.rb +0 -29
data/lib/imw/dataset/workflow.rb +0 -195
data/lib/imw/formats.rb +0 -33
data/lib/imw/formats/delimited.rb +0 -170
data/lib/imw/formats/excel.rb +0 -100
data/lib/imw/formats/json.rb +0 -41
data/lib/imw/formats/pdf.rb +0 -71
data/lib/imw/formats/sgml.rb +0 -69
data/lib/imw/formats/yaml.rb +0 -41
data/lib/imw/metadata.rb +0 -83
data/lib/imw/metadata/contains_metadata.rb +0 -54
data/lib/imw/metadata/dsl.rb +0 -111
data/lib/imw/metadata/field.rb +0 -37
data/lib/imw/metadata/has_metadata.rb +0 -98
data/lib/imw/metadata/has_summary.rb +0 -57
data/lib/imw/metadata/schema.rb +0 -17
data/lib/imw/parsers.rb +0 -8
data/lib/imw/parsers/flat.rb +0 -44
data/lib/imw/parsers/html_parser.rb +0 -387
data/lib/imw/parsers/html_parser/matchers.rb +0 -289
data/lib/imw/parsers/line_parser.rb +0 -87
data/lib/imw/parsers/regexp_parser.rb +0 -72
data/lib/imw/repository.rb +0 -12
data/lib/imw/runner.rb +0 -118
data/lib/imw/schemes.rb +0 -23
data/lib/imw/schemes/ftp.rb +0 -142
data/lib/imw/schemes/hdfs.rb +0 -251
data/lib/imw/schemes/http.rb +0 -165
data/lib/imw/schemes/local.rb +0 -409
data/lib/imw/schemes/remote.rb +0 -119
data/lib/imw/schemes/s3.rb +0 -143
data/lib/imw/schemes/sql.rb +0 -129
data/lib/imw/tools.rb +0 -12
data/lib/imw/tools/aggregator.rb +0 -148
data/lib/imw/tools/archiver.rb +0 -220
data/lib/imw/tools/downloader.rb +0 -63
data/lib/imw/tools/extension_analyzer.rb +0 -114
data/lib/imw/tools/summarizer.rb +0 -83
data/lib/imw/tools/transferer.rb +0 -167
data/lib/imw/utils.rb +0 -74
data/lib/imw/utils/dynamically_extendable.rb +0 -137
data/lib/imw/utils/error.rb +0 -59
data/lib/imw/utils/extensions/hpricot.rb +0 -34
data/lib/imw/utils/has_uri.rb +0 -131
data/lib/imw/utils/log.rb +0 -92
data/lib/imw/utils/misc.rb +0 -57
data/lib/imw/utils/paths.rb +0 -146
data/lib/imw/utils/uri.rb +0 -59
data/lib/imw/utils/uuid.rb +0 -33
data/lib/imw/utils/validate.rb +0 -38
data/lib/imw/utils/version.rb +0 -11
data/spec/data/formats/delimited/sample.csv +0 -131
data/spec/data/formats/delimited/sample.tsv +0 -131
data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
data/spec/data/formats/excel/sample.xls +0 -0
data/spec/data/formats/json/sample.json +0 -1
data/spec/data/formats/none/sample +0 -650
data/spec/data/formats/sgml/sample.xml +0 -617
data/spec/data/formats/text/sample.txt +0 -650
data/spec/data/formats/yaml/sample.yaml +0 -410
data/spec/data/schema-tabular.yaml +0 -11
data/spec/imw/archives/rar_spec.rb +0 -16
data/spec/imw/archives/tar_spec.rb +0 -16
data/spec/imw/archives/tarbz2_spec.rb +0 -24
data/spec/imw/archives/targz_spec.rb +0 -21
data/spec/imw/archives/zip_spec.rb +0 -16
data/spec/imw/archives_spec.rb +0 -77
data/spec/imw/compressed_files/bz2_spec.rb +0 -15
data/spec/imw/compressed_files/compressible_spec.rb +0 -36
data/spec/imw/compressed_files/gz_spec.rb +0 -15
data/spec/imw/compressed_files_spec.rb +0 -47
data/spec/imw/dataset/paths_spec.rb +0 -32
data/spec/imw/dataset/workflow_spec.rb +0 -41
data/spec/imw/formats/delimited_spec.rb +0 -44
data/spec/imw/formats/excel_spec.rb +0 -55
data/spec/imw/formats/json_spec.rb +0 -18
data/spec/imw/formats/sgml_spec.rb +0 -24
data/spec/imw/formats/yaml_spec.rb +0 -19
data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
data/spec/imw/metadata/field_spec.rb +0 -25
data/spec/imw/metadata/has_metadata_spec.rb +0 -58
data/spec/imw/metadata/has_summary_spec.rb +0 -32
data/spec/imw/metadata/schema_spec.rb +0 -24
data/spec/imw/metadata_spec.rb +0 -86
data/spec/imw/parsers/line_parser_spec.rb +0 -96
data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
data/spec/imw/resource_spec.rb +0 -32
data/spec/imw/schemes/hdfs_spec.rb +0 -67
data/spec/imw/schemes/http_spec.rb +0 -19
data/spec/imw/schemes/local_spec.rb +0 -165
data/spec/imw/schemes/remote_spec.rb +0 -38
data/spec/imw/schemes/s3_spec.rb +0 -31
data/spec/imw/schemes/sql_spec.rb +0 -3
data/spec/imw/tools/aggregator_spec.rb +0 -71
data/spec/imw/tools/archiver_spec.rb +0 -120
data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
data/spec/imw/tools/summarizer_spec.rb +0 -8
data/spec/imw/tools/transferer_spec.rb +0 -195
data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
data/spec/imw/utils/has_uri_spec.rb +0 -61
data/spec/imw/utils/paths_spec.rb +0 -10
data/spec/imw/utils/shared_paths_spec.rb +0 -29
data/spec/imw_spec.rb +0 -14
data/spec/rcov.opts +0 -1
data/spec/spec_helper.rb +0 -31
data/spec/support/custom_matchers.rb +0 -28
data/spec/support/file_contents_matcher.rb +0 -30
data/spec/support/paths_matcher.rb +0 -66
data/spec/support/random.rb +0 -213
data/spec/support/without_regard_to_order_matcher.rb +0 -41

@@ -1,17 +0,0 @@
-module IMW
-  class Metadata
-    # Represents a schema for data.
-    #
-    # FIXME add methods that help couple nicely with Avro schemata.
-    class Schema < Hash
-      def initialize obj=nil
-        super()
-        merge!(obj) if obj.is_a?(Hash) || obj.is_a?(Schema)
-      end
-    end
-  end
-end

data/lib/imw/parsers.rb DELETED

@@ -1,8 +0,0 @@
-module IMW
-  module Parsers
-    autoload :LineParser,   'imw/parsers/line_parser'
-    autoload :RegexpParser, 'imw/parsers/regexp_parser'
-    autoload :HtmlParser,   'imw/parsers/html_parser'
-    autoload :Flat,         'imw/parsers/flat'
-  end
-end

data/lib/imw/parsers/flat.rb DELETED

@@ -1,44 +0,0 @@
-module IMW
-  module Parsers
-    class Flat
-      attr_accessor :io
-      attr_accessor :state
-      attr_accessor :accumulated
-      attr_accessor :current
-      def initialize io
-        self.io          = io
-        self.state       = nil
-        self.accumulated = []
-        self.current     = nil
-      end
-      def read_next!
-        self.current = io.readline.chomp
-      end
-      def parse!
-        while (! complete?)
-          read_next!
-          react_to_input!
-        end
-      end
-      def accumulate!
-        self.accumulated << current
-      end
-      def complete?
-        io.eof?
-      end
-      def react_to_input!
-        raise IMW::NotImplementedError.new("Override the `react_to_input!' method of the #{self.class} class")
-      end
-    end
-  end
-end

data/lib/imw/parsers/html_parser.rb DELETED

@@ -1,387 +0,0 @@
-#
-# h2. lib/imw/parsers/html_parser.rb -- html parser
-#
-# == About
-#
-# h4. HTML Extractor
-#
-# * map repeating HTML elements to intermediate ruby data structure
-# * optimize all the common cases for expressive brevity
-# * output structure will come from HTML structure; map to desired output objects in transform stage.
-# * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
-#
-# If this doesn't yield satisfaction you may enjoy
-# * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
-# * http://scrubyt.org/
-# Note of course that these have quite different goals.  For example, we don't
-# have any interest in "interactive" crawling, eg form submission, or at least
-# that goes elsewhere.
-#
-#
-# == Sample HTML (http://twitter.com):
-#
-#   <ul class="about vcard entry-author">
-#     <li         ><span class="label">Name</span>     <span class="fn" >MarsPhoenix       </span> </li>
-#     <li         ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
-#     <li id="bio"><span class="label">Bio</span>      <span class="bio">I dig Mars!       </span> </li>
-#     <li         ><span class="label">Web</span>
-#        <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
-#   </ul>
-#
-# == Parser Spec:
-#   :hcard        => m_one('//ul.vcard.about',
-#     {
-#       :name     => 'li/span.fn',
-#       :location => 'li/span.adr',
-#       :url      => m_attr('li/a.url[@href]', 'href'),
-#       :bio      => 'li#bio/span.bio',
-#     }
-#   )
-#
-# == Example return:
-#   { :hcard => { :name => 'MarsPhoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
-#
-# == Sample HTML (http://delicious.com):
-#   <ul id="bookmarklist" class="bookmarks NOTHUMB">
-#     <li class="post" id="item-...">
-#       <div class="bookmark NOTHUMB">
-#         <div class="dateGroup">         <span title="23 APR 08">23 APR 08</span>     </div>
-#         <div class="data">
-#           <h4>                          <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
-#                                         <a class="inlinesave" href="...">SAVE</a> </h4>
-#           <h5 class="savers-label">     PEOPLE</h5>
-#           <div class="savers savers2">  <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a>  </div>
-#           <div class="description">     The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
-#         </div>
-#         <div class="meta"></div>
-#         <h5 class="tag-chain-label">TAGS</h5>
-#         <div class="tagdisplay">
-#           <ul class="tag-chain">
-#             <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog"     ><span class="tag-chain-item-span">blog</span>    </a></li>
-#             <li class="tag-chain-item off">      <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus"   ><span class="tag-chain-item-span">corpus</span>  </a></li>
-#             <li class="tag-chain-item off">      <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
-#             <li class="tag-chain-item off">      <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp"      ><span class="tag-chain-item-span">nlp</span>     </a></li>
-#             <li class="tag-chain-item on  last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset"  ><span class="tag-chain-item-span">dataset</span> </a></li>
-#           </ul>
-#         </div>
-#         <div class="clr"></div>
-#       </div>
-#     </li>
-#   </ul>
-#
-# == Parser Specification:
-#   :bookmarks            => [ 'ul#bookmarklist/li.post/.bookmark',
-#     {
-#       :date                     => hash(    '.dateGroup/span',
-#          [:day, :month, :year]  => regexp(  '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
-#          ),
-#       :title                    =>          '.data/h4/a.taggedlink',
-#       :url                      => attr(    '.data/h4/a.taggedlink', 'href'),
-#       :del_link_url             => href(    '.data/.savers/a.delNav'),
-#       :num_savers               => to_i(    '.data/.savers//span.delNavCount'),
-#       :description              =>          '.data/.description',
-#       :tags                     =>         ['.tagdisplay//span.tag-chain-item-span']
-#     }
-#   ]
-#
-# == Example output:
-#   { :bookmarks => [
-#     { :date             => { :year => '08', :month => 'APR', :day => '23' },
-#       :title            => 'Blog Authorship Corpus (Blogger.com 1994)',
-#       :url              => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
-#       :del_link_url     => '/url/7df6661946fca61863312644eb071953',
-#       :num_savers       => 26,
-#       :description      => 'The Blog ... ',
-#       :tags             => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
-#      }
-#    ]}
-#
-# == Implementation:
-#
-# Internally, we take the spec and turn it into a recursive structure of Matcher
-# objects.  These consume Hpricot Elements and return the appropriately extracted
-# object.
-#
-# Note that the /default/ is for a bare selector to match ONE element, and to not
-# complain if there are many.
-#
-# Missing elements are silently ignored -- for example if
-#   :foo => 'li.missing'
-# there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
-# set to nil -- hsh.include?(foo) will be false)
-#
-#
-# == List of Matchers:
-#     { :field => /spec/, ... }           # hash          hash, each field taken from spec.
-#     [ "hpricot_path" ]                  # 1-el array    array: for each element matching
-#                                                         hpricot_path, the inner_html
-#     [ "hpricot_path", /spec/ ]          # 2-el array    array: for each element matching
-#                                                         hpricot_path, pass to spec
-#     "hpricot_path"                      # string        same as one("hpricot_path")
-#     one("hpricot_path")                 # one           first match to hpricot_path
-#     one("hpricot_path", /spec/)         # one           applies spec to first match to hpricot_path
-#     (these all match on one path:)
-#     regexp("hpricot_path", /RE/)        # regexp        capture groups from matching RE against
-#                                                         inner_html of first match to hpricot_path
-#     attr("hpricot_path", 'attr_name')   # attr
-#     href("hpricot_path")                # href          shorthand for attr(foo, 'href')
-#     no_html                             #               strip tags from contents
-#     html_encoded                        #               html encode contents
-#     to_i, to_f, etc                     # convert
-#     lambda{|doc| ... }                  # proc          calls proc on current doc
-#
-# == Complicated HCard example:
-#     :hcards                     =>      [ '//ul.users/li.vcard',
-#       {
-#         :name                   =>      '.fn',
-#         :address                =>      one('.adr',
-#           :street               =>      '.street',
-#           :city                 =>      '.city',
-#           :zip                  =>      '.postal'
-#         ),
-#         :tel                    =>      [ 'span.tel',
-#           {
-#             :type               =>      'span.type',
-#             [:cc, :area, :num]  =>      hp.regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
-#           }
-#         ],
-#         :tags                   =>      [ '.tag' ],
-#       }
-#     ]
-#
-# == Resulting Parser
-#     MatchHash({:hcards  =>      MatchArray('//ul.users/li.vcard',
-#       MatchHash({
-#         :name                   =>      MatchFirst('.fn'),
-#         :address                =>      MatchFirst('.adr',
-#           MatchHash({
-#             :street             =>      MatchFirst('.street'),
-#             :city               =>      MatchFirst('.locality'),
-#             :state              =>      MatchFirst('.region'),
-#             :zip                =>      MatchFirst('.postal'),
-#           })),
-#         :tel                    =>      MatchArray('span.tel',
-#           MatchHash({
-#             :type               =>      MatchFirst('span.type'),
-#             [:cc, :area, :num]  =>      RegexpMatcher('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
-#           })
-#         ),
-#         :tags                   =>      MatchArray('.tag'),
-#       })
-#     )})
-#
-# == Example output
-#     [
-#       {:tel     => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
-#                      {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
-#        :name    => "Bob Dobbs, Jr.",
-#        :tags    => ["church"] },
-#       {:tel     => [ {:type => 'fax',  :cc => '49', :area => '305', :num => '867-5309'}, ],
-#        :name    => "Jenny",
-#        :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
-#        :tags    => ["bathroom", "wall"] },
-#     ]
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
-require 'imw/parsers/html_parser/matchers'
-module IMW
-  module Parsers
-    class HtmlParser
-      include IMW::Parsers::HtmlMatchers
-      attr_accessor :parse_tree
-      #
-      # Parse Tree
-      #
-      def initialize arg_spec=nil
-        spec = arg_spec || self.class.parser_spec
-        self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
-      end
-      #
-      # See the header comment of IMW::Parsers::HtmlParser for the spec syntax
-      #
-      #
-      def self.parser_spec
-        raise "Override this to create your own parser spec"
-      end
-      #
-      # Walk
-      #
-      def parse doc
-        self.parse_tree.match(doc)
-      end
-      # one("hpricot_path")                 first match to hpricot_path
-      # one("hpricot_path", /spec/)         applies spec to first match to hpricot_path
-      #
-      def self.one selector, matcher
-        MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
-      end
-      # match the +attr+ attribute of the first element given by +selector+
-      def self.attr selector, attr, matcher=nil
-        MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
-      end
-      # shorthand for +attr(foo, 'href')+
-      def self.href selector, matcher=nil
-        self.attr(selector, 'href', matcher)
-      end
-      # shorthand for +attr(foo, 'src')+
-      def self.src selector, matcher=nil
-        self.attr(selector, 'src', matcher)
-      end
-      def self.proc selector, proc, matcher=nil
-        MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
-      end
-      # strip ","s (!! thus disrespecting locale !!!)
-      # and convert to int
-      def self.to_num selector, matcher=nil
-        proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
-      end
-      def self.to_json selector, matcher=nil
-        proc selector, lambda{|v| v.to_json if v }, matcher
-      end
-      def self.strip selector, matcher=nil
-        proc selector, lambda{|v| v.strip }, matcher
-      end
-      def self.re_group selector, re
-        MatchRegexp.new(selector, re)
-      end
-      def self.re selector, re
-        MatchRegexp.new(selector, re, nil, :capture => 1)
-      end
-      def self.re_all selector, re, matcher=nil
-        MatchRegexpRepeatedly.new(selector, re)
-      end
-      # def self.plain_text selector, matcher=nil
-      #   proc selector, lambda{|el| el.inner_text if el }, matcher
-      # end
-      # attr_accessor :mapping
-      #
-      # #
-      # # Feed me a hash and I'll semantify HTML
-      # #
-      # # The hash should magically adhere to the too-complicated,
-      # # ever evolving goatrope that works for the below
-      # #
-      # #
-      # def initialize mapping
-      #   self.mapping = mapping
-      # end
-      #
-      # #
-      # # take a document subtree,
-      # # and a mapping of hpricot paths to that subtree's data mapping
-      # # recursively extract that datamapping
-      # #
-      # def extract_tree  hdoc, content, sub_mapping
-      #   data = { }
-      #   sub_mapping.each do |selector, target|
-      #     data[selector] = []
-      #     sub_contents = content/selector
-      #     sub_contents.each do |sub_content|
-      #       sub_data = {}
-      #       extract_node hdoc, sub_content, sub_data, selector, target
-      #       data[selector] << sub_data
-      #     end
-      #   end
-      #   data
-      #   # end
-      #   #   if selector.is_a?(String)
-      #   #     conts = (content)
-      #   #   else
-      #   #     conts = [content]
-      #   #   end
-      #   #   conts[0..0].each do |content|
-      #   #     extract_node hdoc, content, data, selector, target
-      #   #   end
-      #   # end
-      #   data
-      # end
-      #
-      # #
-      # # insert the extracted element into the data mapping
-      # #
-      # def extract_node hdoc, content, data, selector, target
-      #   classification = classify_node(selector, target)
-      #   result = \
-      #   case classification
-      #   when :subtree
-      #     target.each do |sub_selector, sub_target|
-      #       extract_node hdoc, content, data, sub_selector, sub_target
-      #     end
-      #
-      #   when :sub_attribute
-      #     k, v = selector.to_a[0]
-      #     subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
-      #     val  = subcontent.attributes[v.to_s] if subcontent
-      #     data[target] = val unless val.blank?
-      #
-      #   when :attribute then
-      #     val = content.attributes[selector.to_s]
-      #     data[target] = val unless val.blank?
-      #
-      #   when :flatten_list
-      #     subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
-      #     data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
-      #
-      #   when :inner_html
-      #     subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
-      #     data[target] = subcontent.inner_html.strip if subcontent
-      #
-      #   else
-      #     raise "classify_node shouldn't ever return #{classification}"
-      #   end
-      #   # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
-      #   # puts '' if classification == :subtree
-      # end
-      #
-      # def classify_node selector, target
-      #   case
-      #   when target.is_a?(Hash)                             then :subtree
-      #   when selector.is_a?(Hash) && (selector.length == 1) then
-      #     k, v = selector.to_a[0]
-      #     case v
-      #     when Symbol then :sub_attribute
-      #     end
-      #   when selector.is_a?(Symbol)                         then :attribute
-      #   when selector.is_a?(String) && target.is_a?(Array)  then :flatten_list
-      #   when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
-      #   else
-      #     raise "Can't classify mapping: " + [selector, target].join(" - ")
-      #   end
-      # end
-      #
-      # # use #mapping to parse file
-      # def parse link
-      #   begin       hdoc = Hpricot(link.contents)
-      #   rescue;     warn "can't hpricot #{link.to_s}" ; return false;  end
-      #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
-      # end
-      #
-      # # use #mapping to parse file
-      # def parse_file filename
-      #   begin       hdoc = Hpricot(File.open(filename))
-      #   rescue;     warn "can't hpricot #{filename}" ; return false;  end
-      #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
-      # end
-    end
-  end
-end