RubyGems - guitsaru-scraper - Versions diffs - 0.1.1 → 0.1.2 - Mend

guitsaru-scraper 0.1.1 → 0.1.2

Files changed (12)

data/VERSION +1 -1
data/lib/scraper/link.rb +35 -7
data/lib/scraper.rb +11 -5
data/scraper.gemspec +3 -2
data/test/fake_pages/first_child_page.html +1 -0
data/test/fake_pages/first_page.html +1 -1
data/test/fake_pages/google.html +19 -0
data/test/fake_pages/main.html +2 -0
data/test/test_helper.rb +1 -0
data/test/test_link.rb +22 -1
data/test/test_scraper.rb +26 -1
metadata +3 -2

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.1.1
+0.1.2

data/lib/scraper/link.rb CHANGED Viewed

@@ -7,10 +7,10 @@ module Scrape
       @visited = false
     end
-    def scrape!(div=nil)
+    def scrape!(options = {})
       return [] if @visited
       @visited = true
-      return get_links(div)
+      return get_links(options)
     end
     def ==(other)
@@ -27,24 +27,52 @@ module Scrape
     end
     private
-    def get_links(div)
+    def get_links(options = {})
+      div = nil
+      ignore = []
+      if options[:div]
+        div = options[:div]
+      end
+      if options[:ignore]
+        ignore = options[:ignore]
+      end
       links = []
-      doc = Hpricot(Net::HTTP.get(URI.parse(url)))
+      doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
       doc.search("#{div} a").each do |link|
         url = link['href']
         if url =~ /^\/(.*)/
           components = URI::split(@url)
           url = "#{components[0] || 'http'}://#{components[2]}#{url}"
-        elsif url =~ /^http:\/\//i
-          url = url
+        elsif url =~ /^https?:\/\//i
+                  url = url
+        elsif url =~ /file:\/\//
+          next
         elsif url =~ /^#/
           url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
         else
           url = (File.dirname(@url) + '/' + (url || ''))
         end
-        links << Link.new(url, link.inner_html)
+        # Don't add this link if it matches a pattern in ignore
+        skip = false
+        ignore.each { |pattern| skip = true if url =~ pattern }
+        skip = true if options[:domain] && !url.include?(options[:domain])
+        if !skip
+          new_link = Link.new(url, link.inner_html.strip)
+          # Don't visit anchors, visit the main page instead.
+          if url =~ /(https?:\/\/.*)#(.*$)/i
+            links << Link.new($1, $2)
+            new_link.visited = true
+          end
+          links << new_link
+        end
       end
       return links.uniq

data/lib/scraper.rb CHANGED Viewed

@@ -8,14 +8,20 @@ class Scraper
   attr_accessor :url
-  def initialize(url)
-    self.url = url
+  # Scrapes a web page, collecting all links on the page and scraping each new link.
+  # Possible options
+  # options[:div] - The container div with the links
+  # options[:domain] - The domain to collect links from, all other domains are ignored
+  # options[:ignore] - An Array of regexes.  Any links matching one will be ignored.
+  def initialize(url, options = {})
+    @url = url
+    @options = options
   end
-  def scrape(div=nil)
+  def scrape(options = {})
     links = [Link.new(self.url)]
-    until (not_visited = links.uniq.select { |link| !link.visited}).empty?
-      not_visited.each { |link| links += link.scrape!(div) }
+    until (not_visited = links.uniq.select { |link| !link.visited }).empty?
+      not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
     end
     return links.uniq

data/scraper.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{scraper}
-  s.version = "0.1.1"
+  s.version = "0.1.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Matt Pruitt"]
-  s.date = %q{2009-06-17}
+  s.date = %q{2009-06-18}
   s.email = %q{guitsaru@gmail.com}
   s.extra_rdoc_files = [
     "LICENSE",
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
      "scraper.gemspec",
      "test/fake_pages/first_child_page.html",
      "test/fake_pages/first_page.html",
+     "test/fake_pages/google.html",
      "test/fake_pages/main.html",
      "test/fake_pages/not_added.html",
      "test/test_helper.rb",

data/test/fake_pages/first_child_page.html CHANGED Viewed

@@ -13,6 +13,7 @@
   <div id="content">
     <a href="/main.html">Main</a>
     <a href="#content">Content</a>
+    <a href="http://google.com">Google</a>
   </div>
 </body>
 </html>

data/test/fake_pages/first_page.html CHANGED Viewed

@@ -14,7 +14,7 @@
     <a href="not_added.html">Not Added</a>
   </div>
   <div id="content">
-    <a href="http://example.com/first_child_page.html">First Child Page</a>
+    <a href="http://example.com/first_child_page.html#content2">First Child Page</a>
   </div>
 </body>
 </html>

data/test/fake_pages/google.html ADDED Viewed

@@ -0,0 +1,19 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <title>untitled</title>
+  <meta name="generator" content="TextMate http://macromates.com/">
+  <meta name="author" content="Matt Pruitt">
+  <!-- Date: 2009-06-17 -->
+</head>
+<body>
+  <div id="header">
+    <a href="not_added.html">Not Added</a>
+  </div>
+  <div id="content">
+  </div>
+</body>
+</html>

data/test/fake_pages/main.html CHANGED Viewed

@@ -15,6 +15,8 @@
   </div>
   <div id="content">
     <a href="first_page.html">First Page</a>
+    <a href="file://fileserver/file.pdf">A File</a>
+    <a href="main.html?action=edit">Edit</a>
   </div>
 </body>
 </html>

data/test/test_helper.rb CHANGED Viewed

@@ -12,5 +12,6 @@ class Test::Unit::TestCase
   FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
   FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
   FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
+  FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
 end

data/test/test_link.rb CHANGED Viewed

@@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase
       assert(@results.is_a?(Array))
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
       assert(@results.include?(Link.new('http://example.com/not_added.html')))
+      assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+      assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
     end
   end
   context "scraping inside a div" do
     setup do
       @link = Link.new('http://example.com/main.html')
-      @results = @link.scrape!('#content')
+      @results = @link.scrape!(:div => '#content')
     end
     should "return an array of links on the page" do
       assert_not_nil(@results)
       assert(@results.is_a?(Array))
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+    end
+    should "not return links not in the div" do
+      assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
+    end
+  end
+  context "scraping with ignore options" do
+    setup do
+      @link = Link.new('http://example.com/main.html')
+      @results = @link.scrape!(:div => '#content', :ignore => [/\?/])
+    end
+    should "return an array of links on the page" do
+      assert_not_nil(@results)
+      assert(@results.is_a?(Array))
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
     end
     should "not return links not in the div" do

data/test/test_scraper.rb CHANGED Viewed

@@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase
   context "scraping" do
     setup do
       @scraper = Scraper.new('http://example.com/main.html')
-      @results = @scraper.scrape('#content')
+      @results = @scraper.scrape(:div => '#content')
     end
     should "Include a list of links on the pages." do
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
       assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
       assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
       assert(@results.include?(Link.new('http://example.com/main.html')))
+      assert(@results.include?(Link.new('http://google.com')))
     end
     should "Not include any links outside of the content div" do
       assert(!@results.include?(Link.new('http://example.com/not_added.html')))
     end
   end
+  context "scraping within domain" do
+    setup do
+      @scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
+      @results = @scraper.scrape(:div => '#content')
+    end
+    should "Include a list of links on the pages." do
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
+      assert(@results.include?(Link.new('http://example.com/main.html')))
+    end
+    should "Not include any links outside of the content div" do
+      assert(!@results.include?(Link.new('http://example.com/not_added.html')))
+    end
+    should "Not include any links outside of the domain" do
+      assert(!@results.include?(Link.new('http://google.com')))
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: guitsaru-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-06-17 00:00:00 -07:00
+date: 2009-06-18 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
 - scraper.gemspec
 - test/fake_pages/first_child_page.html
 - test/fake_pages/first_page.html
+- test/fake_pages/google.html
 - test/fake_pages/main.html
 - test/fake_pages/not_added.html
 - test/test_helper.rb