RubyGems - guitsaru-scraper - Versions diffs - 0.1.1 → 0.1.2 - Mend

guitsaru-scraper 0.1.1 → 0.1.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

data/VERSION +1 -1
data/lib/scraper/link.rb +35 -7
data/lib/scraper.rb +11 -5
data/scraper.gemspec +3 -2
data/test/fake_pages/first_child_page.html +1 -0
data/test/fake_pages/first_page.html +1 -1
data/test/fake_pages/google.html +19 -0
data/test/fake_pages/main.html +2 -0
data/test/test_helper.rb +1 -0
data/test/test_link.rb +22 -1
data/test/test_scraper.rb +26 -1
metadata +3 -2

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.1
1	+ 0.1.2

data/lib/scraper/link.rb CHANGED Viewed

@@ -7,10 +7,10 @@ module Scrape
       @visited = false
     end
-    def scrape!(div=nil)
+    def scrape!(options = {})
       return [] if @visited
       @visited = true
-      return get_links(div)
+      return get_links(options)
     end
     def ==(other)
@@ -27,24 +27,52 @@ module Scrape
     end
     private
-    def get_links(div)
+    def get_links(options = {})
+      div = nil
+      ignore = []
+      if options[:div]
+        div = options[:div]
+      end
+      if options[:ignore]
+        ignore = options[:ignore]
+      end
       links = []
-      doc = Hpricot(Net::HTTP.get(URI.parse(url)))
+      doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
       doc.search("#{div} a").each do |link|
         url = link['href']
         if url =~ /^\/(.*)/
           components = URI::split(@url)
           url = "#{components[0] || 'http'}://#{components[2]}#{url}"
-        elsif url =~ /^http:\/\//i
-          url = url
+        elsif url =~ /^https?:\/\//i
+                  url = url
+        elsif url =~ /file:\/\//
+          next
         elsif url =~ /^#/
           url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
         else
           url = (File.dirname(@url) + '/' + (url || ''))
         end
-        links << Link.new(url, link.inner_html)
+        # Don't add this link if it matches a pattern in ignore
+        skip = false
+        ignore.each { |pattern| skip = true if url =~ pattern }
+        skip = true if options[:domain] && !url.include?(options[:domain])
+        if !skip
+          new_link = Link.new(url, link.inner_html.strip)
+          # Don't visit anchors, visit the main page instead.
+          if url =~ /(https?:\/\/.*)#(.*$)/i
+            links << Link.new($1, $2)
+            new_link.visited = true
+          end
+          links << new_link
+        end
       end
       return links.uniq

data/lib/scraper.rb CHANGED Viewed

@@ -8,14 +8,20 @@ class Scraper
   attr_accessor :url
-  def initialize(url)
-    self.url = url
+  # Scrapes a web page, collecting all links on the page and scraping each new link.
+  # Possible options
+  # options[:div] - The container div with the links
+  # options[:domain] - The domain to collect links from, all other domains are ignored
+  # options[:ignore] - An Array of regexes.  Any links matching one will be ignored.
+  def initialize(url, options = {})
+    @url = url
+    @options = options
   end
-  def scrape(div=nil)
+  def scrape(options = {})
     links = [Link.new(self.url)]
-    until (not_visited = links.uniq.select { |link| !link.visited}).empty?
-      not_visited.each { |link| links += link.scrape!(div) }
+    until (not_visited = links.uniq.select { |link| !link.visited }).empty?
+      not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
     end
     return links.uniq

data/scraper.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{scraper}
-  s.version = "0.1.1"
+  s.version = "0.1.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Matt Pruitt"]
-  s.date = %q{2009-06-17}
+  s.date = %q{2009-06-18}
   s.email = %q{guitsaru@gmail.com}
   s.extra_rdoc_files = [
     "LICENSE",
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
      "scraper.gemspec",
      "test/fake_pages/first_child_page.html",
      "test/fake_pages/first_page.html",
+     "test/fake_pages/google.html",
      "test/fake_pages/main.html",
      "test/fake_pages/not_added.html",
      "test/test_helper.rb",

data/test/fake_pages/first_child_page.html CHANGED Viewed

@@ -13,6 +13,7 @@
   <div id="content">
     <a href="/main.html">Main</a>
     <a href="#content">Content</a>
+    <a href="http://google.com">Google</a>
   </div>
 </body>
 </html>

data/test/fake_pages/first_page.html CHANGED Viewed

@@ -14,7 +14,7 @@
     <a href="not_added.html">Not Added</a>
   </div>
   <div id="content">
-    <a href="http://example.com/first_child_page.html">First Child Page</a>
+    <a href="http://example.com/first_child_page.html#content2">First Child Page</a>
   </div>
 </body>
 </html>

data/test/fake_pages/google.html ADDED Viewed

@@ -0,0 +1,19 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <title>untitled</title>
+  <meta name="generator" content="TextMate http://macromates.com/">
+  <meta name="author" content="Matt Pruitt">
+  <!-- Date: 2009-06-17 -->
+</head>
+<body>
+  <div id="header">
+    <a href="not_added.html">Not Added</a>
+  </div>
+  <div id="content">
+  </div>
+</body>
+</html>

data/test/fake_pages/main.html CHANGED Viewed

@@ -15,6 +15,8 @@
   </div>
   <div id="content">
     <a href="first_page.html">First Page</a>
+    <a href="file://fileserver/file.pdf">A File</a>
+    <a href="main.html?action=edit">Edit</a>
   </div>
 </body>
 </html>

data/test/test_helper.rb CHANGED Viewed

@@ -12,5 +12,6 @@ class Test::Unit::TestCase
   FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
   FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
   FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
+  FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
 end

data/test/test_link.rb CHANGED Viewed

@@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase
       assert(@results.is_a?(Array))
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
       assert(@results.include?(Link.new('http://example.com/not_added.html')))
+      assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+      assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
     end
   end
   context "scraping inside a div" do
     setup do
       @link = Link.new('http://example.com/main.html')
-      @results = @link.scrape!('#content')
+      @results = @link.scrape!(:div => '#content')
     end
     should "return an array of links on the page" do
       assert_not_nil(@results)
       assert(@results.is_a?(Array))
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
+    end
+    should "not return links not in the div" do
+      assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
+    end
+  end
+  context "scraping with ignore options" do
+    setup do
+      @link = Link.new('http://example.com/main.html')
+      @results = @link.scrape!(:div => '#content', :ignore => [/\?/])
+    end
+    should "return an array of links on the page" do
+      assert_not_nil(@results)
+      assert(@results.is_a?(Array))
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
     end
     should "not return links not in the div" do

data/test/test_scraper.rb CHANGED Viewed

@@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase
   context "scraping" do
     setup do
       @scraper = Scraper.new('http://example.com/main.html')
-      @results = @scraper.scrape('#content')
+      @results = @scraper.scrape(:div => '#content')
     end
     should "Include a list of links on the pages." do
       assert(@results.include?(Link.new('http://example.com/first_page.html')))
       assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
       assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
       assert(@results.include?(Link.new('http://example.com/main.html')))
+      assert(@results.include?(Link.new('http://google.com')))
     end
     should "Not include any links outside of the content div" do
       assert(!@results.include?(Link.new('http://example.com/not_added.html')))
     end
   end
+  context "scraping within domain" do
+    setup do
+      @scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
+      @results = @scraper.scrape(:div => '#content')
+    end
+    should "Include a list of links on the pages." do
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
+      assert(@results.include?(Link.new('http://example.com/main.html')))
+    end
+    should "Not include any links outside of the content div" do
+      assert(!@results.include?(Link.new('http://example.com/not_added.html')))
+    end
+    should "Not include any links outside of the domain" do
+      assert(!@results.include?(Link.new('http://google.com')))
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: guitsaru-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-06-17 00:00:00 -07:00
+date: 2009-06-18 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
 - scraper.gemspec
 - test/fake_pages/first_child_page.html
 - test/fake_pages/first_page.html
+- test/fake_pages/google.html
 - test/fake_pages/main.html
 - test/fake_pages/not_added.html
 - test/test_helper.rb