RubyGems - scrapi - Versions diffs - 1.1.2 → 1.2.0 - Mend

scrapi 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,11 @@
+Version 1.2.0 (August 27, 2006)
+* Added: collect() method called just before result().
+* Changed: Elements are not skipped when processed unless :skip=>true.
+  ! Pay attention to this one, it could affect some scrapers.
+* Fixed: Declaring an array immediately creates an accessor for it.
+* Added: Scraper::Microformats for scraping hAtom and basic hCard.
 Version 1.1.2 (August 13, 2006)
 * Changed: Allows multiple :not pseudo classes to be used with the same

data/Rakefile CHANGED Viewed

@@ -41,14 +41,13 @@ gem_spec = Gem::Specification.new do |spec|
   spec.name = "scrapi"
   spec.version = version
-  spec.summary = "scrAPI toolkit for Ruby"
+  spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
   spec.description = <<-EOF
-A framework for writing scrapers using CSS selectors and simple
-select => extract => store processing rules.
+scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
 EOF
   spec.author = "Assaf Arkin"
   spec.email = "assaf.arkin@gmail.com"
-  spec.homepage = "http://labnotes.org/"
+  spec.homepage = "http://blog.labnotes.org/category/scrapi/"
   spec.files = FileList["{test,lib}/**/*", "README", "CHANGELOG", "Rakefile", "MIT-LICENSE"].to_a
   spec.require_path = "lib"

data/lib/scraper/base.rb CHANGED Viewed

@@ -472,11 +472,19 @@ module Scraper
       #   process "a[href]", "urls[]"=>"@href"
       def array(*symbols)
         @arrays ||= []
-        symbols.each { |sym| @arrays << sym.to_sym }
+        symbols.each do |symbol|
+          symbol = symbol.to_sym
+          @arrays << symbol
+          begin
+            self.instance_method(symbol)
+          rescue NameError
+            attr_accessor symbol
+          end
+        end
       end
-  private
+    private
       # Called by #process and #process_first, see there for
@@ -655,6 +663,7 @@ module Scraper
           begin
             self.instance_method(target)
           rescue NameError
+            instance = "@#{target}".to_sym
             attr_accessor target
           end
           reader = "#{target}=".to_sym
@@ -740,8 +749,9 @@ module Scraper
       prepare document
       # Retrieve the document. This may raise HTTPError or HTMLParseError.
       case document
-      when Array: stack = @document.reverse # see below
-      when HTML::Node:
+      when Array
+        stack = @document.reverse # see below
+      when HTML::Node
         # If a root element is specified, start selecting from there.
         # The stack is empty if we can't find any root element (makes
         # sense). However, the node we're going to process may be
@@ -750,7 +760,8 @@ module Scraper
         root_element = option(:root_element)
         root = root_element ? @document.find(:tag=>root_element) : @document
         stack = root ? (root.tag? ? [root] : root.children.reverse) : []
-      else return
+      else
+        return
       end
       # @skip stores all the elements we want to skip (see #skip).
       # rules stores all the rules we want to process with this
@@ -801,9 +812,10 @@ module Scraper
                 # If it returns true, skip the element and if
                 # the current element, don't process any more
                 # rules. Again, pay attention to descendants.
-                skip = extractor.bind(self).call(element)
-                if (skip || @skip.delete(true)) && @skip.delete(false).nil?
+                if extractor.bind(self).call(element)
                   @extracted = true
+                end
+                if @skip.delete(true)
                   if element.equal?(node)
                     skip_this = true
                   else
@@ -824,6 +836,7 @@ module Scraper
       ensure
         @skip = nil
       end
+      collect
       return result
     end
@@ -895,7 +908,7 @@ module Scraper
       case elements
       when Array: @skip.concat elements
       when HTML::Node: @skip << elements
-      when nil: @skip << self.element
+      when nil: @skip << true
       when true, false: @skip << elements
       end
       # Calling skip(element) as the last statement is
@@ -920,6 +933,13 @@ module Scraper
     end
+    # Called by #scrape after scraping the document, and before calling #result.
+    # Typically used to run any validation, post-processing steps,
+    # resolving referenced elements, etc.
+    def collect()
+    end
     # Returns the result of a successful scrape.
     #
     # This method is called by #scrape after running all the rules on the

data/lib/scraper/microformats.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require "time"
+module Scraper
+  module Microformats
+    class HCard < Scraper::Base
+      process ".fn",          :fn=>:text
+      process ".given-name",  :given_name=>:text
+      process ".family-name", :family_name=>:text
+      process "img.photo",    :photo=>"@src"
+      process "a.url",        :url=>"@href"
+      result :fn, :given_name, :family_name, :photo, :url
+      def collect()
+        unless fn
+          if self.fn = given_name
+            self.given_name << " #{family_name}" if family_name
+          else
+            self.fn = family_name
+          end
+        end
+      end
+    end
+    class HAtom < Scraper::Base
+      class Entry < Scraper::Base
+        array :content, :tags
+        process ".entry-title",                   :title=>:text
+        process ".entry-content",                 :content=>:element
+        process ".entry-summary",                 :summary=>:element
+        process "a[rel~=bookmark]",               :permalink=>["@href"]
+        process ".author.vcard, .author .vcard",  :author=>HCard
+        process ".published",                     :published=>["abbr@title", :text]
+        process ".updated",                       :updated=>["abbr@title", :text]
+        process "a[rel~=tag]",                    :tags=>:text
+        def collect()
+          self.published = Time.parse(published)
+          self.updated = updated ? Time.parse(updated) : published
+        end
+        result :title, :content, :summary, :permalink, :author, :published, :updated, :tags
+      end
+      class Feed < Scraper::Base
+        array :entries
+        process ".hentry", :entries=>Entry
+        def result()
+          entries
+        end
+      end
+      array :feeds, :entries
+      # Skip feeds, so we don't process them twice.
+      process ".hfeed", :skip=>true, :feeds=>Feed
+      # And so we can collect unwrapped entries into a separate feed.
+      process ".hentry", :skip=>true, :entries=>Entry
+      # And collect the first remaining hcard as the default author.
+      process ".vcard", :hcard=>HCard
+      def collect()
+        @feeds ||= []
+        @feeds << entries if entries
+        for feed in feeds
+          for entry in feed
+            entry.author = hcard unless entry.author
+          end
+        end
+      end
+      result :feeds
+    end
+  end
+end

data/test/scraper_test.rb CHANGED Viewed

@@ -14,6 +14,20 @@ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
 class ScraperTest < Test::Unit::TestCase
+  DIVS123 = <<-EOF
+    <div id="1"></div>
+    <div id="2"></div>
+    <div id="3"></div>
+  EOF
+  DIVS1_23 = <<-EOF
+    <div id="1">
+      <div id="2"></div>
+      <div id="3"></div>
+    </div>
+  EOF
   def setup
     Net::HTTP.reset_on_get
   end
@@ -28,8 +42,7 @@ class ScraperTest < Test::Unit::TestCase
   #
   def test_define_selectors
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test, "div"
     end
     assert_equal 3, scraper.test(scraper.document).size
@@ -40,8 +53,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_selector_blocks
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test, "div" do |elements|
         return elements[0..-2]
         elements[0..-2]
@@ -52,18 +64,16 @@ class ScraperTest < Test::Unit::TestCase
   def test_array_selectors
-      html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-      scraper = new_scraper(html) do
-        selector :test, "#?", "2"
-      end
-      assert_equal 1, scraper.test(scraper.document).size
-      assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
+    scraper = new_scraper(DIVS123) do
+      selector :test, "#?", "2"
+    end
+    assert_equal 1,   scraper.test(scraper.document).size
+    assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
   end
   def test_object_selectors
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test, HTML::Selector.new("div")
     end
     assert_equal 3, scraper.test(scraper.document).size
@@ -71,8 +81,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_selector_returns_array
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test0, "#4"
       selector :test1, "#1"
       selector :test3, "div"
@@ -84,26 +93,24 @@ class ScraperTest < Test::Unit::TestCase
   def test_select_in_document_order
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test, "#2,#1"
     end
-    assert_equal 2, scraper.test(scraper.document).size
+    assert_equal 2,   scraper.test(scraper.document).size
     assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
     assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
   end
   def test_selecting_first_element
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test, "div"
     end
-    assert_equal 3, scraper.test(scraper.document).size
+    assert_equal 3,   scraper.test(scraper.document).size
     assert scraper.first_test(scraper.document)
     assert_equal "1", scraper.first_test(scraper.document).attributes["id"]
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :test, "div" do |element|
         element[0].attributes["id"]
       end
@@ -118,8 +125,7 @@ class ScraperTest < Test::Unit::TestCase
   #
   def test_processing_rule
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process "div" do |element|
         @count = (@count || 0) + 1
       end
@@ -131,8 +137,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_processing_rule_with_array
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process "#?", "1" do |element|
         @count = (@count || 0) + 1
       end
@@ -144,8 +149,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_processing_rule_with_selector
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process HTML::Selector.new("div") do |element|
         @count = (@count || 0) + 1
       end
@@ -157,8 +161,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_extracting_in_code
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process "div" do |element|
         @concat = (@concat || "") << element.attributes["id"]
       end
@@ -170,8 +173,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_processing_in_document_order
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process "#2,#1" do |element|
         @concat = (@concat || "") << element.attributes["id"]
       end
@@ -182,68 +184,28 @@ class ScraperTest < Test::Unit::TestCase
   end
-  def test_skip_if_extractor_returns_true
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
-      process "#1" do |element|
-        @first = true
-        false
-      end
-      process "#1" do |element|
-        @second = true
-      end
-      attr :first
-      attr :second
-    end
-    scraper.scrape
-    assert_equal true, scraper.first
-    assert_equal true, scraper.second
-    scraper = new_scraper(html) do
-      process "#1" do |element|
-        @first = true
-        true
-      end
-      process "#1" do |element|
-        @second = true
-      end
-      attr :first
-      attr :second
-    end
-    scraper.scrape
-    assert_equal true, scraper.first
-    assert_equal nil, scraper.second
-  end
   def test_process_once_if_skipped
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
-      process "#1" do |element|
-        @first = true
-        skip element
-        false
+    scraper = new_scraper(DIVS123) do
+      def prepare(document)
+        @found = []
       end
-      process "#1" do |element|
-        @second = true
-      end
-      attr :first
-      attr :second
+      process("#1")             { |element| @found[0] = true }
+      process("#1")             { |element| @found[1] = true ; skip element }
+      process("#1")             { |element| @found[2] = true }
+      process("#2", :skip=>true){ |element| @found[3] = true }
+      process("#2")             { |element| @found[4] = true }
+      attr_reader :found
     end
     scraper.scrape
-    assert_equal true, scraper.first
-    assert_equal nil, scraper.second
+    assert_equal [true, true, nil, true], scraper.found
   end
   def test_skip_children
-    html = %Q{<div><div id="1"></div><div id="2"></div><div id="3"></div></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS1_23) do
       process "div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        if to_skip = id2(element)
-          skip to_skip
-        end
-        false
+        skip id2(element)
       end
       selector :id2, "#2"
       attr :concat
@@ -254,68 +216,68 @@ class ScraperTest < Test::Unit::TestCase
   def test_skip_descendants
-    html = %Q{<div id="1"><div id="2"><div id="3"></div></div</div>}
-    scraper = new_scraper(html) do
+    # Root, child of root, grandchild of root.
+    scraper = new_scraper(DIVS1_23) do
       process "div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        false
       end
       attr :concat
     end
     scraper.scrape
-    # Root, child of root, grandchild of root.
     assert_equal "123", scraper.concat
-    scraper = new_scraper(html) do
+    # Stop at root.
+    scraper = new_scraper(DIVS1_23) do
       process "div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        true
+        skip
       end
       attr :concat
     end
     scraper.scrape
-    # Stop at root.
     assert_equal "1", scraper.concat
-    scraper = new_scraper(html) do
+    scraper.scrape
+    # Child of root, and child of root's child
+    scraper = new_scraper(DIVS1_23) do
       process "div>div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        false
       end
       attr :concat
     end
     scraper.scrape
-    # Child of root, and child of root's child
     assert_equal "23", scraper.concat
-    scraper = new_scraper(html) do
+    # Stop at child of root.
+    scraper = new_scraper(DIVS1_23) do
       process "div>div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        true
+        skip element.next_element
       end
       attr :concat
     end
     scraper.scrape
-    # Stop at child of root.
     assert_equal "2", scraper.concat
-    scraper = new_scraper(html) do
+    # Child of root, the child of child of root.
+    scraper = new_scraper(DIVS1_23) do
       process "div div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        false
       end
       attr :concat
     end
     scraper.scrape
-    # Child of root, the child of child of root.
     assert_equal "23", scraper.concat
-    scraper = new_scraper(html) do
+    # Child of root.
+    scraper = new_scraper(DIVS1_23) do
       process "div div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
-        true
+        skip element.next_element
       end
       attr :concat
     end
     scraper.scrape
-    # Child of root.
     assert_equal "2", scraper.concat
   end
@@ -328,15 +290,15 @@ class ScraperTest < Test::Unit::TestCase
     end
     scraper.scrape
     assert_equal "this", scraper.this1
-    assert_equal nil, scraper.this2
+    assert_equal "this", scraper.this2
     scraper = new_scraper(html) do
       process "#1", :this1=>:text, :skip=>false
       process "#1", :this2=>:text
     end
     scraper.scrape
-    #assert_equal "this", scraper.this1
-    #assert_equal "this", scraper.this2
+    assert_equal "this", scraper.this1
+    assert_equal "this", scraper.this2
     scraper = new_scraper(html) do
       process "#1", :this1=>:text, :skip=>true do
@@ -346,13 +308,12 @@ class ScraperTest < Test::Unit::TestCase
     end
     scraper.scrape
     assert_equal "this", scraper.this1
-    assert_equal nil, scraper.this2
+    assert_equal nil,    scraper.this2
   end
   def test_stop
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process "div" do |element|
         @concat = (@concat || "") << (element.attributes["id"] || "")
         stop
@@ -365,18 +326,14 @@ class ScraperTest < Test::Unit::TestCase
   def test_process_first
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process "div" do |element|
         @all = (@all || 0) + 1
-        false
       end
       process_first "div" do |element|
         @first = (@first || 0) + 1
-        false
       end
-      attr :all
-      attr :first
+      attr_accessor :all, :first
     end
     scraper.scrape
     assert_equal 3, scraper.all
@@ -391,14 +348,17 @@ class ScraperTest < Test::Unit::TestCase
         response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
         response["Last-Modified"] = time
         response["ETag"] = "etag"
-        [response, %Q{
-<html>
-<head>
-  <meta http-equiv="content-type" value="text/html; charset=other-encoding">
-</head>
-<body><div id="x"/></body>
-</html>
-        }]
+        [response, <<-EOF
+          <html>
+            <head>
+              <meta http-equiv="content-type" value="text/html; charset=other-encoding">
+            </head>
+            <body>
+              <div id="x"/>
+            </body>
+          </html>
+        EOF
+        ]
       else
         response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
         response["Location"] = "http://localhost/redirect"
@@ -417,11 +377,15 @@ class ScraperTest < Test::Unit::TestCase
   def test_scraping_end_to_end
     Net::HTTP.on_get do |address, path, headers|
-      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), %Q{
-<html>
-<body><div id="1"/><div id="2"/></body>
-</html>
-      }]
+      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
+        <html>
+          <body>
+            <div id="1"/>
+            <div id="2"/>
+          </body>
+        </html>
+      EOF
+      ]
     end
     scraper = new_scraper(URI.parse("http://localhost/")) do
       process "div" do |element|
@@ -475,7 +439,10 @@ class ScraperTest < Test::Unit::TestCase
   def test_extractors_objects
-    html = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
+    html = <<-EOF
+      <h1 class="header"></h1>
+      <h2 class="header"></h2>
+    EOF
     # Extract both elements based on class, return the second one.
     scraper = new_scraper(html) do
       process ".header", extractor(:header=>:element)
@@ -484,7 +451,10 @@ class ScraperTest < Test::Unit::TestCase
     scraper.scrape
     assert_equal "h2", scraper.header.name
     # Extracting a specific element skips the second match.
-    html = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
+    html = <<-EOF
+      <h1 class="header"></h1>
+      <h2 class="header"></h2>
+    EOF
     scraper = new_scraper(html) do
       process ".header", extractor(:header=>"h1")
       attr :header
@@ -496,7 +466,10 @@ class ScraperTest < Test::Unit::TestCase
   def test_attribute_extractors
     # Extracting the attribute skips the second match.
-    html = %Q{<abbr title="foo">bar</div><abbr>foo</abbr>}
+    html = <<-EOF
+      <abbr title="foo">bar</div>
+      <abbr>foo</abbr>
+    EOF
     scraper = new_scraper(html) do
       process "abbr", extractor(:title=>"@title")
       attr :title
@@ -504,7 +477,10 @@ class ScraperTest < Test::Unit::TestCase
     scraper.scrape
     assert_equal "foo", scraper.title
     # Extracting a specific element skips the second match.
-    html = %Q{<h1 class="header" id="1"></h1><h2 class="header" id="2"></h2>}
+    html = <<-EOF
+      <h1 class="header" id="1"></h1>
+      <h2 class="header" id="2"></h2>
+    EOF
     scraper = new_scraper(html) do
       process ".header", extractor(:header=>"h1@id")
       attr :header
@@ -522,7 +498,12 @@ class ScraperTest < Test::Unit::TestCase
       attr :h1
       attr :h2
     end
-    html = %Q{<div><h1>first</h1><h2>second</h2></div>}
+    html = <<-EOF
+      <div>
+        <h1>first</h1>
+        <h2>second</h2>
+      </div>
+    EOF
     scraper = new_scraper(html) do
       process "div", extractor(:headers=>headers)
       attr :headers
@@ -535,7 +516,12 @@ class ScraperTest < Test::Unit::TestCase
   def test_array_extractors
-    html = %Q{<div><h1>first</h1><h1>second</h1></div>}
+    html = <<-EOF
+      <div>
+        <h1>first</h1>
+        <h1>second</h1>
+      </div>
+    EOF
     scraper = new_scraper(html) do
       process "h1", extractor("headers[]"=>:text)
       attr :headers
@@ -543,13 +529,17 @@ class ScraperTest < Test::Unit::TestCase
     scraper.scrape
     assert scraper.headers.is_a?(Array)
     assert_equal 2, scraper.headers.size
-    assert_equal "first", scraper.headers[0]
-    assert_equal "second", scraper.headers[1]
+    assert_equal "first",   scraper.headers[0]
+    assert_equal "second",  scraper.headers[1]
   end
   def test_hash_extractors
-    html = %Q{<div><h1 id="1" class="header">first</h1></div>}
+    html = <<-EOF
+      <div>
+        <h1 id="1" class="header">first</h1>
+      </div>
+    EOF
     scraper = new_scraper(html) do
       process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
       attr :header
@@ -558,21 +548,25 @@ class ScraperTest < Test::Unit::TestCase
     assert scraper.header.is_a?(Hash)
     assert_equal 3, scraper.header.size
     assert_equal "1", scraper.header[:id]
-    assert_equal "header", scraper.header[:class]
-    assert_equal "first", scraper.header[:text]
+    assert_equal "header",  scraper.header[:class]
+    assert_equal "first",   scraper.header[:text]
   end
   def test_multi_value_extractors
-    html = %Q{<div><h1 id="1" class="header">first</h1></div>}
+    html = <<-EOF
+      <div>
+        <h1 id="1" class="header">first</h1>
+      </div>
+    EOF
     scraper = new_scraper(html) do
       process "h1", [:text, :kls]=>Scraper.define {
         process "*", :text=>:text, :kls=>"@class"
       }
     end
     result = scraper.scrape
-    assert "first", result.text
-    assert "header", result.kls
+    assert "first",   result.text
+    assert "header",  result.kls
   end
@@ -581,7 +575,13 @@ class ScraperTest < Test::Unit::TestCase
     # if not found look for class attribute (first
     # two headers), otherwise just get text (third
     # header).
-    html = %Q{<div><h1 class="foo">first</h1><h1 class="foo" id="bar">second</h1><h1>third</h1></div>}
+    html = <<-EOF
+      <div>
+        <h1 class="foo">first</h1>
+        <h1 class="foo" id="bar">second</h1>
+        <h1>third</h1>
+      </div>
+    EOF
     scraper = new_scraper(html) do
       process "h1", extractor("headers[]"=>["@id", "@class", :text])
       attr :headers
@@ -589,37 +589,41 @@ class ScraperTest < Test::Unit::TestCase
     scraper.scrape
     assert scraper.headers.is_a?(Array)
     assert_equal 3, scraper.headers.size
-    assert_equal "foo", scraper.headers[0]
-    assert_equal "bar", scraper.headers[1]
+    assert_equal "foo",   scraper.headers[0]
+    assert_equal "bar",   scraper.headers[1]
     assert_equal "third", scraper.headers[2]
   end
+  DIVS_ST_ND = <<-EOF
+    <div id="1">first</div>
+    <div id="2">second</div>
+  EOF
   def test_accessors_from_extractor
-    html = %Q{<div id="1">first</div><div id="2">second</div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS_ST_ND) do
       process_first "div", :div_id=>"@id", :div_text=>:text
       result :div_id
     end
     value = scraper.scrape
     assert_equal "1", value
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS_ST_ND) do
       process_first "div", :div_id=>"@id", :div_text=>:text
       result :div_id, :div_text
     end
     value = scraper.scrape
-    assert_equal "1", value.div_id
+    assert_equal "1",     value.div_id
     assert_equal "first", value.div_text
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS_ST_ND) do
       process_first "div", :div_id=>"@id", :div_text=>:text
     end
     value = scraper.scrape
-    assert_equal "1", value.div_id
+    assert_equal "1",     value.div_id
     assert_equal "first", value.div_text
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS_ST_ND) do
       attr_accessor :div_class
       process_first "div", :div_id=>"@id", :div_text=>:text
       result :div_id, :div_class
@@ -628,7 +632,7 @@ class ScraperTest < Test::Unit::TestCase
     assert_equal "1", value.div_id
     assert_raise(NoMethodError) { value.div_text }
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS_ST_ND) do
       process "div", "div_ids[]"=>"@id"
       result :div_ids
     end
@@ -639,8 +643,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_array_accessors
-    html = %Q{<div id="1">first</div><div id="2">second</div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS_ST_ND) do
       array :div_id, :div_text
       process "div", :div_id=>"@id", :div_text=>:text
       result :div_id, :div_text
@@ -650,8 +653,8 @@ class ScraperTest < Test::Unit::TestCase
     assert_equal 2, value.div_text.size
     assert_equal "1", value.div_id[0]
     assert_equal "2", value.div_id[1]
-    assert_equal "first", value.div_text[0]
-    assert_equal "second", value.div_text[1]
+    assert_equal "first",   value.div_text[0]
+    assert_equal "second",  value.div_text[1]
   end
@@ -659,9 +662,17 @@ class ScraperTest < Test::Unit::TestCase
   # Root element tests.
   #
+  HTML_EMPTY = <<-EOF
+    <html>
+      <head>
+      </head>
+      <body>
+      </body>
+    </html>
+  EOF
   def test_scrape_body_by_default
-    html = %Q{<html><head></head><body></body></html>}
-    scraper = Class.new(Scraper::Base).new(html)
+    scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
     scraper.class.instance_eval do
       process "head" do |element| @head = element end
       attr :head
@@ -675,17 +686,16 @@ class ScraperTest < Test::Unit::TestCase
   def test_changing_root_element
-    html = %Q{<html><head></head><body></body></html>}
-    only_header = new_scraper(html) do
+    only_header = new_scraper(HTML_EMPTY) do
       root_element "head"
       process "head" do |element| @head = element end
       attr :head
       process "body" do |element| @body = element end
       attr :body
     end
-    only_body = Class.new(only_header.class).new(html)
+    only_body = Class.new(only_header.class).new(HTML_EMPTY)
     only_body.class.root_element "body"
-    both_parts = Class.new(only_body.class).new(html)
+    both_parts = Class.new(only_body.class).new(HTML_EMPTY)
     both_parts.class.root_element nil
     # We set this scraper to begin with the head element,
     # so we can see the head element, but not the body.
@@ -709,8 +719,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_prepare_and_result
     # Extracting the attribute skips the second match.
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       process("div") { |element| @count +=1 }
       define_method(:prepare) { @count = 1 }
       define_method(:result) { @count }
@@ -722,8 +731,7 @@ class ScraperTest < Test::Unit::TestCase
   def test_changing_document_from_prepare
     # Extracting the attribute skips the second match.
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
-    scraper = new_scraper(html) do
+    scraper = new_scraper(DIVS123) do
       selector :divs, "div"
       define_method :prepare do |document|
         @document = divs(document)[1]
@@ -739,13 +747,12 @@ class ScraperTest < Test::Unit::TestCase
   def test_anonymous_scrapers
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
     scraper = Scraper.define do
       array :ids
       process "div", :ids=>"@id"
       result :ids
     end
-    result = scraper.scrape(html)
+    result = scraper.scrape(DIVS123)
     assert_equal "1", result[0]
     assert_equal "2", result[1]
     assert_equal "3", result[2]
@@ -753,14 +760,13 @@ class ScraperTest < Test::Unit::TestCase
   def test_named_rules
-    html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
     scraper = Scraper.define do
       array :ids1, :ids2
       process :main, "div", :ids1=>"@id"
       process :main, "div", :ids2=>"@id"
       result :ids1, :ids2
     end
-    result = scraper.scrape(html)
+    result = scraper.scrape(DIVS123)
     assert_equal nil, result.ids1
     assert_equal 3, result.ids2.size
     assert_equal "1", result.ids2[0]
@@ -775,7 +781,7 @@ protected
     cls = Class.new(Scraper::Base)
     cls.root_element nil
     cls.parser :html_parser
-    cls.instance_eval &block if block
+    cls.class_eval &block if block
     cls.new(what)
   end
@@ -791,7 +797,7 @@ protected
     cls = Class.new(Scraper::Base)
     cls.root_element nil
     cls.parser :tidy
-    cls.instance_eval &block if block
+    cls.class_eval &block if block
     cls.new(what)
   end

metadata CHANGED Viewed

@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: scrapi
 version: !ruby/object:Gem::Version
-  version: 1.1.2
-date: 2006-08-15 00:00:00 -07:00
-summary: scrAPI toolkit for Ruby
+  version: 1.2.0
+date: 2006-08-27 00:00:00 -07:00
+summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
 require_paths:
 - lib
 email: assaf.arkin@gmail.com
-homepage: http://labnotes.org/
+homepage: http://blog.labnotes.org/category/scrapi/
 rubyforge_project: scrapi
-description: A framework for writing scrapers using CSS selectors and simple select => extract => store processing rules.
+description: scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
 autorequire: scrapi.rb
 default_executable:
 bindir: bin
@@ -40,6 +40,7 @@ files:
 - lib/html
 - lib/scraper/reader.rb
 - lib/scraper/base.rb
+- lib/scraper/microformats.rb
 - lib/tidy/libtidy.so
 - lib/tidy/libtidy.dll
 - lib/html/node_ext.rb