RubyGems - guitsaru-scraper - Version diff - 0.1.2 → 0.2.0 - Mend

guitsaru-scraper 0.1.2 → 0.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.2
1	+ 0.2.0

data/lib/scraper.rb CHANGED Viewed

@@ -13,17 +13,31 @@ class Scraper
   # options[:div] - The container div with the links
   # options[:domain] - The domain to collect links from, all other domains are ignored
   # options[:ignore] - An Array of regexes.  Any links matching one will be ignored.
+  # options[:recursive] - A boolean.  If false, only get the top level links.  Default is true.
+  # options[:self] - A boolean.  Whether to include the main page in results.  Default is true.
   def initialize(url, options = {})
     @url = url
     @options = options
+    unless @options.has_key?(:recursive)
+      @options.merge!(:recursive => true)
+    end
+    unless @options.has_key?(:self)
+      @options.merge!(:self => true)
+    end
   end
   def scrape(options = {})
+    options.merge!(@options)
     links = [Link.new(self.url)]
     until (not_visited = links.uniq.select { |link| !link.visited }).empty?
-      not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
+      not_visited.each { |link| links += link.scrape!(options) }
+      break unless options[:recursive]
     end
+    links.delete(Link.new(self.url)) unless options[:self]
     return links.uniq
   end
 end

data/scraper.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{scraper}
-  s.version = "0.1.2"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Matt Pruitt"]
-  s.date = %q{2009-06-18}
+  s.date = %q{2009-07-23}
   s.email = %q{guitsaru@gmail.com}
   s.extra_rdoc_files = [
     "LICENSE",

data/test/test_helper.rb CHANGED Viewed

@@ -8,10 +8,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
 require 'scraper'
 class Test::Unit::TestCase
-  FakeWeb.register_uri(:get, "http://example.com/main.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
-  FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
-  FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
-  FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
-  FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
+  FakeWeb.register_uri(:get, "http://example.com/main.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/main.html'))
+  FakeWeb.register_uri(:get, "http://example.com/first_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
+  FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
+  FakeWeb.register_uri(:get, "http://example.com/not_added.html", :body => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
+  FakeWeb.register_uri(:get, "http://google.com", :body => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
 end

data/test/test_scraper.rb CHANGED Viewed

@@ -55,4 +55,32 @@ class TestScraper < Test::Unit::TestCase
       assert(!@results.include?(Link.new('http://google.com')))
     end
   end
+  context "Non-recursive scraping" do
+    setup do
+      @scraper = Scraper.new('http://example.com/main.html', :recursive => false)
+      @results = @scraper.scrape(:div => '#content')
+    end
+    should "include top level links" do
+      assert(@results.include?(Link.new('http://example.com/first_page.html')))
+    end
+    should "not include recursive links" do
+      assert(!@results.include?(Link.new('http://example.com/first_child_page.html')))
+      assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content')))
+      assert(!@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
+    end
+  end
+    context "Scraping without self" do
+    setup do
+      @scraper = Scraper.new('http://example.com/main.html', :self => false)
+      @results = @scraper.scrape(:div => '#content')
+    end
+    should "not include self" do
+      assert(!@results.include?(Link.new('http://example.com/main.html')))
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: guitsaru-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Matt Pruitt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-06-18 00:00:00 -07:00
+date: 2009-07-23 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency