guitsaru-scraper 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/scraper/link.rb +35 -7
- data/lib/scraper.rb +11 -5
- data/scraper.gemspec +3 -2
- data/test/fake_pages/first_child_page.html +1 -0
- data/test/fake_pages/first_page.html +1 -1
- data/test/fake_pages/google.html +19 -0
- data/test/fake_pages/main.html +2 -0
- data/test/test_helper.rb +1 -0
- data/test/test_link.rb +22 -1
- data/test/test_scraper.rb +26 -1
- metadata +3 -2
    
        data/VERSION
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            0.1.1 | 
| 1 | 
            +
            0.1.2
         | 
    
        data/lib/scraper/link.rb
    CHANGED
    
    | @@ -7,10 +7,10 @@ module Scrape | |
| 7 7 | 
             
                  @visited = false
         | 
| 8 8 | 
             
                end
         | 
| 9 9 |  | 
| 10 | 
            -
                def scrape!( | 
| 10 | 
            +
                def scrape!(options = {})
         | 
| 11 11 | 
             
                  return [] if @visited
         | 
| 12 12 | 
             
                  @visited = true
         | 
| 13 | 
            -
                  return get_links( | 
| 13 | 
            +
                  return get_links(options)
         | 
| 14 14 | 
             
                end
         | 
| 15 15 |  | 
| 16 16 | 
             
                def ==(other)
         | 
| @@ -27,24 +27,52 @@ module Scrape | |
| 27 27 | 
             
                end
         | 
| 28 28 |  | 
| 29 29 | 
             
                private
         | 
| 30 | 
            -
                def get_links( | 
| 30 | 
            +
                def get_links(options = {})
         | 
| 31 | 
            +
                  div = nil
         | 
| 32 | 
            +
                  ignore = []
         | 
| 33 | 
            +
                  
         | 
| 34 | 
            +
                  if options[:div]
         | 
| 35 | 
            +
                    div = options[:div]
         | 
| 36 | 
            +
                  end
         | 
| 37 | 
            +
                  
         | 
| 38 | 
            +
                  if options[:ignore]
         | 
| 39 | 
            +
                    ignore = options[:ignore]
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
                  
         | 
| 31 42 | 
             
                  links = []
         | 
| 32 43 |  | 
| 33 | 
            -
                  doc = Hpricot(Net::HTTP.get(URI.parse(url)))
         | 
| 44 | 
            +
                  doc = Hpricot(Net::HTTP.get(URI.parse(@url)))
         | 
| 34 45 | 
             
                  doc.search("#{div} a").each do |link|
         | 
| 35 46 | 
             
                    url = link['href']
         | 
| 36 47 | 
             
                    if url =~ /^\/(.*)/
         | 
| 37 48 | 
             
                      components = URI::split(@url)
         | 
| 38 49 | 
             
                      url = "#{components[0] || 'http'}://#{components[2]}#{url}"
         | 
| 39 | 
            -
                    elsif url =~ /^ | 
| 40 | 
            -
             | 
| 50 | 
            +
                    elsif url =~ /^https?:\/\//i
         | 
| 51 | 
            +
                              url = url
         | 
| 52 | 
            +
                    elsif url =~ /file:\/\//
         | 
| 53 | 
            +
                      next
         | 
| 41 54 | 
             
                    elsif url =~ /^#/
         | 
| 42 55 | 
             
                      url = @url.gsub(/#.*/, '').gsub(/\/$/, '') + url
         | 
| 43 56 | 
             
                    else
         | 
| 44 57 | 
             
                      url = (File.dirname(@url) + '/' + (url || ''))
         | 
| 45 58 | 
             
                    end
         | 
| 46 59 |  | 
| 47 | 
            -
                     | 
| 60 | 
            +
                    # Don't add this link if it matches a pattern in ignore
         | 
| 61 | 
            +
                    skip = false
         | 
| 62 | 
            +
                    ignore.each { |pattern| skip = true if url =~ pattern }
         | 
| 63 | 
            +
                    skip = true if options[:domain] && !url.include?(options[:domain])
         | 
| 64 | 
            +
                    
         | 
| 65 | 
            +
                    if !skip
         | 
| 66 | 
            +
                      new_link = Link.new(url, link.inner_html.strip)
         | 
| 67 | 
            +
                      
         | 
| 68 | 
            +
                      # Don't visit anchors, visit the main page instead.
         | 
| 69 | 
            +
                      if url =~ /(https?:\/\/.*)#(.*$)/i
         | 
| 70 | 
            +
                        links << Link.new($1, $2)
         | 
| 71 | 
            +
                        new_link.visited = true
         | 
| 72 | 
            +
                      end
         | 
| 73 | 
            +
                      
         | 
| 74 | 
            +
                      links << new_link
         | 
| 75 | 
            +
                    end
         | 
| 48 76 | 
             
                  end
         | 
| 49 77 |  | 
| 50 78 | 
             
                  return links.uniq
         | 
    
        data/lib/scraper.rb
    CHANGED
    
    | @@ -8,14 +8,20 @@ class Scraper | |
| 8 8 |  | 
| 9 9 | 
             
              attr_accessor :url
         | 
| 10 10 |  | 
| 11 | 
            -
               | 
| 12 | 
            -
             | 
| 11 | 
            +
              # Scrapes a web page, collecting all links on the page and scraping each new link.
         | 
| 12 | 
            +
              # Possible options
         | 
| 13 | 
            +
              # options[:div] - The container div with the links
         | 
| 14 | 
            +
              # options[:domain] - The domain to collect links from, all other domains are ignored
         | 
| 15 | 
            +
              # options[:ignore] - An Array of regexes.  Any links matching one will be ignored.
         | 
| 16 | 
            +
              def initialize(url, options = {})
         | 
| 17 | 
            +
                @url = url
         | 
| 18 | 
            +
                @options = options
         | 
| 13 19 | 
             
              end
         | 
| 14 20 |  | 
| 15 | 
            -
              def scrape( | 
| 21 | 
            +
              def scrape(options = {})
         | 
| 16 22 | 
             
                links = [Link.new(self.url)]
         | 
| 17 | 
            -
                until (not_visited = links.uniq.select { |link| !link.visited}).empty?
         | 
| 18 | 
            -
                  not_visited.each { |link| links += link.scrape!( | 
| 23 | 
            +
                until (not_visited = links.uniq.select { |link| !link.visited }).empty?
         | 
| 24 | 
            +
                  not_visited.each { |link| links += link.scrape!(options.merge(@options)) }
         | 
| 19 25 | 
             
                end
         | 
| 20 26 |  | 
| 21 27 | 
             
                return links.uniq
         | 
    
        data/scraper.gemspec
    CHANGED
    
    | @@ -2,11 +2,11 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            Gem::Specification.new do |s|
         | 
| 4 4 | 
             
              s.name = %q{scraper}
         | 
| 5 | 
            -
              s.version = "0.1.1" | 
| 5 | 
            +
              s.version = "0.1.2"
         | 
| 6 6 |  | 
| 7 7 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 8 8 | 
             
              s.authors = ["Matt Pruitt"]
         | 
| 9 | 
            -
              s.date = %q{2009-06- | 
| 9 | 
            +
              s.date = %q{2009-06-18}
         | 
| 10 10 | 
             
              s.email = %q{guitsaru@gmail.com}
         | 
| 11 11 | 
             
              s.extra_rdoc_files = [
         | 
| 12 12 | 
             
                "LICENSE",
         | 
| @@ -24,6 +24,7 @@ Gem::Specification.new do |s| | |
| 24 24 | 
             
                 "scraper.gemspec",
         | 
| 25 25 | 
             
                 "test/fake_pages/first_child_page.html",
         | 
| 26 26 | 
             
                 "test/fake_pages/first_page.html",
         | 
| 27 | 
            +
                 "test/fake_pages/google.html",
         | 
| 27 28 | 
             
                 "test/fake_pages/main.html",
         | 
| 28 29 | 
             
                 "test/fake_pages/not_added.html",
         | 
| 29 30 | 
             
                 "test/test_helper.rb",
         | 
| @@ -14,7 +14,7 @@ | |
| 14 14 | 
             
                <a href="not_added.html">Not Added</a>
         | 
| 15 15 | 
             
              </div>
         | 
| 16 16 | 
             
              <div id="content">
         | 
| 17 | 
            -
                <a href="http://example.com/first_child_page.html">First Child Page</a>
         | 
| 17 | 
            +
                <a href="http://example.com/first_child_page.html#content2">First Child Page</a>
         | 
| 18 18 | 
             
              </div>
         | 
| 19 19 | 
             
            </body>
         | 
| 20 20 | 
             
            </html>
         | 
| @@ -0,0 +1,19 @@ | |
| 1 | 
            +
            <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
         | 
| 2 | 
            +
               "http://www.w3.org/TR/html4/strict.dtd">
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            <html lang="en">
         | 
| 5 | 
            +
            <head>
         | 
| 6 | 
            +
              <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
         | 
| 7 | 
            +
              <title>untitled</title>
         | 
| 8 | 
            +
              <meta name="generator" content="TextMate http://macromates.com/">
         | 
| 9 | 
            +
              <meta name="author" content="Matt Pruitt">
         | 
| 10 | 
            +
              <!-- Date: 2009-06-17 -->
         | 
| 11 | 
            +
            </head>
         | 
| 12 | 
            +
            <body>
         | 
| 13 | 
            +
              <div id="header">
         | 
| 14 | 
            +
                <a href="not_added.html">Not Added</a>
         | 
| 15 | 
            +
              </div>
         | 
| 16 | 
            +
              <div id="content">
         | 
| 17 | 
            +
              </div>
         | 
| 18 | 
            +
            </body>
         | 
| 19 | 
            +
            </html>
         | 
    
        data/test/fake_pages/main.html
    CHANGED
    
    
    
        data/test/test_helper.rb
    CHANGED
    
    | @@ -12,5 +12,6 @@ class Test::Unit::TestCase | |
| 12 12 | 
             
              FakeWeb.register_uri(:get, "http://example.com/first_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_page.html'))
         | 
| 13 13 | 
             
              FakeWeb.register_uri(:get, "http://example.com/first_child_page.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/first_child_page.html'))
         | 
| 14 14 | 
             
              FakeWeb.register_uri(:get, "http://example.com/not_added.html", :file => File.join(File.dirname(__FILE__), 'fake_pages/not_added.html'))
         | 
| 15 | 
            +
              FakeWeb.register_uri(:get, "http://google.com", :file => File.join(File.dirname(__FILE__), 'fake_pages/google.html'))
         | 
| 15 16 | 
             
            end
         | 
| 16 17 |  | 
    
        data/test/test_link.rb
    CHANGED
    
    | @@ -37,19 +37,40 @@ class TestLink < Test::Unit::TestCase | |
| 37 37 | 
             
                  assert(@results.is_a?(Array))
         | 
| 38 38 | 
             
                  assert(@results.include?(Link.new('http://example.com/first_page.html')))
         | 
| 39 39 | 
             
                  assert(@results.include?(Link.new('http://example.com/not_added.html')))
         | 
| 40 | 
            +
                  assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
         | 
| 41 | 
            +
                  assert(!@results.include?(Link.new('http://example.com/first_child_page.html/file://fileserver/file.pdf')))
         | 
| 40 42 | 
             
                end
         | 
| 41 43 | 
             
              end
         | 
| 42 44 |  | 
| 43 45 | 
             
              context "scraping inside a div" do
         | 
| 44 46 | 
             
                setup do
         | 
| 45 47 | 
             
                  @link = Link.new('http://example.com/main.html')
         | 
| 46 | 
            -
                  @results = @link.scrape!('#content')
         | 
| 48 | 
            +
                  @results = @link.scrape!(:div => '#content')
         | 
| 47 49 | 
             
                end
         | 
| 48 50 |  | 
| 49 51 | 
             
                should "return an array of links on the page" do
         | 
| 50 52 | 
             
                  assert_not_nil(@results)
         | 
| 51 53 | 
             
                  assert(@results.is_a?(Array))
         | 
| 52 54 | 
             
                  assert(@results.include?(Link.new('http://example.com/first_page.html')))
         | 
| 55 | 
            +
                  assert(@results.include?(Link.new('http://example.com/main.html?action=edit')))
         | 
| 56 | 
            +
                end
         | 
| 57 | 
            +
                
         | 
| 58 | 
            +
                should "not return links not in the div" do
         | 
| 59 | 
            +
                  assert(!@results.include?(Link.new('http://example.com/not_added.html')), "Includes a link outside of the correct div.")
         | 
| 60 | 
            +
                end
         | 
| 61 | 
            +
              end
         | 
| 62 | 
            +
              
         | 
| 63 | 
            +
              context "scraping with ignore options" do
         | 
| 64 | 
            +
                setup do
         | 
| 65 | 
            +
                  @link = Link.new('http://example.com/main.html')
         | 
| 66 | 
            +
                  @results = @link.scrape!(:div => '#content', :ignore => [/\?/])
         | 
| 67 | 
            +
                end
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                should "return an array of links on the page" do
         | 
| 70 | 
            +
                  assert_not_nil(@results)
         | 
| 71 | 
            +
                  assert(@results.is_a?(Array))
         | 
| 72 | 
            +
                  assert(@results.include?(Link.new('http://example.com/first_page.html')))
         | 
| 73 | 
            +
                  assert(!@results.include?(Link.new('http://example.com/main.html?action=edit')))
         | 
| 53 74 | 
             
                end
         | 
| 54 75 |  | 
| 55 76 | 
             
                should "not return links not in the div" do
         | 
    
        data/test/test_scraper.rb
    CHANGED
    
    | @@ -16,18 +16,43 @@ class TestScraper < Test::Unit::TestCase | |
| 16 16 | 
             
              context "scraping" do
         | 
| 17 17 | 
             
                setup do
         | 
| 18 18 | 
             
                  @scraper = Scraper.new('http://example.com/main.html')
         | 
| 19 | 
            -
                  @results = @scraper.scrape('#content')
         | 
| 19 | 
            +
                  @results = @scraper.scrape(:div => '#content')
         | 
| 20 20 | 
             
                end
         | 
| 21 21 |  | 
| 22 22 | 
             
                should "Include a list of links on the pages." do
         | 
| 23 23 | 
             
                  assert(@results.include?(Link.new('http://example.com/first_page.html')))
         | 
| 24 24 | 
             
                  assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
         | 
| 25 25 | 
             
                  assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
         | 
| 26 | 
            +
                  assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
         | 
| 26 27 | 
             
                  assert(@results.include?(Link.new('http://example.com/main.html')))
         | 
| 28 | 
            +
                  assert(@results.include?(Link.new('http://google.com')))
         | 
| 27 29 | 
             
                end
         | 
| 28 30 |  | 
| 29 31 | 
             
                should "Not include any links outside of the content div" do
         | 
| 30 32 | 
             
                  assert(!@results.include?(Link.new('http://example.com/not_added.html')))
         | 
| 31 33 | 
             
                end
         | 
| 32 34 | 
             
              end
         | 
| 35 | 
            +
              
         | 
| 36 | 
            +
              context "scraping within domain" do
         | 
| 37 | 
            +
                setup do
         | 
| 38 | 
            +
                  @scraper = Scraper.new('http://example.com/main.html', :domain => 'example.com')
         | 
| 39 | 
            +
                  @results = @scraper.scrape(:div => '#content')
         | 
| 40 | 
            +
                end
         | 
| 41 | 
            +
                
         | 
| 42 | 
            +
                should "Include a list of links on the pages." do
         | 
| 43 | 
            +
                  assert(@results.include?(Link.new('http://example.com/first_page.html')))
         | 
| 44 | 
            +
                  assert(@results.include?(Link.new('http://example.com/first_child_page.html')))
         | 
| 45 | 
            +
                  assert(@results.include?(Link.new('http://example.com/first_child_page.html#content')))
         | 
| 46 | 
            +
                  assert(@results.include?(Link.new('http://example.com/first_child_page.html#content2')))
         | 
| 47 | 
            +
                  assert(@results.include?(Link.new('http://example.com/main.html')))
         | 
| 48 | 
            +
                end
         | 
| 49 | 
            +
                
         | 
| 50 | 
            +
                should "Not include any links outside of the content div" do
         | 
| 51 | 
            +
                  assert(!@results.include?(Link.new('http://example.com/not_added.html')))
         | 
| 52 | 
            +
                end
         | 
| 53 | 
            +
                
         | 
| 54 | 
            +
                should "Not include any links outside of the domain" do
         | 
| 55 | 
            +
                  assert(!@results.include?(Link.new('http://google.com')))
         | 
| 56 | 
            +
                end
         | 
| 57 | 
            +
              end
         | 
| 33 58 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: guitsaru-scraper
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 0.1.1 | 
| 4 | 
            +
              version: 0.1.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 7 | 
             
            - Matt Pruitt
         | 
| @@ -9,7 +9,7 @@ autorequire: | |
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 11 |  | 
| 12 | 
            -
            date: 2009-06- | 
| 12 | 
            +
            date: 2009-06-18 00:00:00 -07:00
         | 
| 13 13 | 
             
            default_executable: 
         | 
| 14 14 | 
             
            dependencies: 
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -43,6 +43,7 @@ files: | |
| 43 43 | 
             
            - scraper.gemspec
         | 
| 44 44 | 
             
            - test/fake_pages/first_child_page.html
         | 
| 45 45 | 
             
            - test/fake_pages/first_page.html
         | 
| 46 | 
            +
            - test/fake_pages/google.html
         | 
| 46 47 | 
             
            - test/fake_pages/main.html
         | 
| 47 48 | 
             
            - test/fake_pages/not_added.html
         | 
| 48 49 | 
             
            - test/test_helper.rb
         |