guitsaru-scraper 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/scraper.rb +15 -1
- data/scraper.gemspec +2 -2
- data/test/test_helper.rb +5 -5
- data/test/test_scraper.rb +28 -0
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.2
|
1
|
+
0.2.0
|
data/lib/scraper.rb
CHANGED
@@ -13,17 +13,31 @@ class Scraper
|
|
13
13
|
# options[:div] - The container div with the links
|
14
14
|
# options[:domain] - The domain to collect links from, all other domains are ignored
|
15
15
|
# options[:ignore] - An Array of regexes. Any links matching one will be ignored.
|
16
|
+
# options[:recursive] - A boolean. If false, only get the top level links. Default is true.
|
17
|
+
# options[:self] - A boolean. Whether to include the main page in results. Default is true.
|
16
18
|
def initialize(url, options = {})
|
17
19
|
@url = url
|
18
20
|
@options = options
|
21
|
+
unless @options.has_key?(:recursive)
|
22
|
+
@options.merge!(:recursive => true)
|
23
|
+
end
|
24
|
+
|
25
|
+
unless @options.has_key?(:self)
|
26
|
+
@options.merge!(:self => true)
|
27
|
+
end
|
19
28
|
end
|
20
29
|
|
21
30
|
def scrape(options = {})
|
31
|
+
options.merge!(@options)
|
22
32
|
links = [Link.new(self.url)]
|
33
|
+
|
23
34
|
until (not_visited = links.uniq.select { |link| !link.visited }).empty?
|
24
|
-
not_visited.each { |link| links += link.scrape!(options) }
|
35
|
+
not_visited.each { |link| links += link.scrape!(options) }
|
36
|
+
break unless options[:recursive]
|
25
37
|
end
|
26
38
|
|
39
|
+
links.delete(Link.new(self.url)) unless options[:self]
|
40
|
+
|
27
41
|
return links.uniq
|
28
42
|
end
|
29
43
|
end
|
data/scraper.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scraper}
|
5
|
-
s.version = "0.1.2"
|
5
|
+
s.version = "0.2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Matt Pruitt"]
|
9
|
-
s.date = %q{2009-
|
9
|
+
s.date = %q{2009-07-23}
|
10
10
|
s.email = %q{guitsaru@gmail.com}
|
11
11
|
s.extra_rdoc_files = [
|
12
12
|
"LICENSE",
|
data/test/test_helper.rb
CHANGED
@@ -8,10 +8,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
8
8
|
require 'scraper'
|
9
9
|
|
10
10
|
class Test::Unit::TestCase
|
11
|
-
FakeWeb.register_uri(:get, "http://example.com/main.html", :
|
12
|
-
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :
|
13
|
-
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :
|
14
|
-
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :
|
15
|
-
FakeWeb.register_uri(:get, "http://google.com", :
|
11
|
+
FakeWeb.register_uri(:get, "http://example.com/main.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
|
12
|
+
FakeWeb.register_uri(:get, "http://example.com/first_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
|
13
|
+
FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
|
14
|
+
FakeWeb.register_uri(:get, "http://example.com/not_added.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
|
15
|
+
FakeWeb.register_uri(:get, "http://google.com", :body => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
|
16
16
|
end
|
17
17
|
|
data/test/test_scraper.rb
CHANGED
@@ -55,4 +55,32 @@ class TestScraper < Test::Unit::TestCase
|
|
55
55
|
assert(!@results.include?(Link.new('http://google.com')))
|
56
56
|
end
|
57
57
|
end
|
58
|
+
|
59
|
+
context "Non-recursive scraping" do
|
60
|
+
setup do
|
61
|
+
@scraper = Scraper.new('http://example.com/main.html', :recursive => false)
|
62
|
+
@results = @scraper.scrape(:div => '#content')
|
63
|
+
end
|
64
|
+
|
65
|
+
should "include top level links" do
|
66
|
+
assert(@results.include?(Link.new('http://example.com/first_page.html')))
|
67
|
+
end
|
68
|
+
|
69
|
+
should "not include recursive links" do
|
70
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html')))
|
71
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content')))
|
72
|
+
assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context "Scraping without self" do
|
77
|
+
setup do
|
78
|
+
@scraper = Scraper.new('http://example.com/main.html', :self => false)
|
79
|
+
@results = @scraper.scrape(:div => '#content')
|
80
|
+
end
|
81
|
+
|
82
|
+
should "not include self" do
|
83
|
+
assert(!@results.include?(Link.new('http://example.com/main.html')))
|
84
|
+
end
|
85
|
+
end
|
58
86
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: guitsaru-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Pruitt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-07-23 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|