rawler 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +10 -5
- data/bin/rawler +25 -3
- data/lib/rawler.rb +10 -1
- data/lib/rawler/base.rb +17 -17
- data/lib/rawler/crawler.rb +26 -4
- data/spec/spec_helper.rb +2 -2
- data/spec/unit/base_spec.rb +30 -0
- data/spec/unit/crawler_spec.rb +75 -19
- metadata +9 -6
data/README.txt
CHANGED
@@ -4,11 +4,19 @@
|
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
7
|
-
Rawler is a Ruby library that crawls your website and
|
7
|
+
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
8
|
+
|
9
|
+
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
8
10
|
|
9
11
|
== SYNOPSIS:
|
10
12
|
|
11
|
-
rawler http://example.com
|
13
|
+
rawler http://example.com [options]
|
14
|
+
|
15
|
+
where [options] are:
|
16
|
+
--username, -u <s>: HTTP Basic Username
|
17
|
+
--password, -p <s>: HTTP Basic Password
|
18
|
+
--version, -v: Print version and exit
|
19
|
+
--help, -h: Show this message
|
12
20
|
|
13
21
|
== INSTALL:
|
14
22
|
|
@@ -16,10 +24,7 @@ gem install rawler
|
|
16
24
|
|
17
25
|
== TODO
|
18
26
|
|
19
|
-
* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
|
20
27
|
* Export to html
|
21
|
-
* Handle multiple urls at once
|
22
|
-
* Add user agent
|
23
28
|
|
24
29
|
== LICENSE:
|
25
30
|
|
data/bin/rawler
CHANGED
@@ -1,11 +1,33 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
|
4
|
+
require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
|
4
5
|
|
5
|
-
|
6
|
+
opts = Trollop::options do
|
7
|
+
version "rawler 0.0.3 (c) 2011 Oscar Del Ben"
|
8
|
+
banner <<-EOS
|
9
|
+
Rawler is a command line utility for parsing links on a website
|
10
|
+
|
11
|
+
Usage:
|
12
|
+
rawler http://example.com [options]
|
13
|
+
|
14
|
+
where [options] are:
|
15
|
+
EOS
|
16
|
+
|
17
|
+
# opt :domain, "domain that you want to test", :type => :string
|
18
|
+
opt :username, "HTTP Basic Username", :type => :string
|
19
|
+
opt :password, "HTTP Basic Password", :type => :string
|
20
|
+
end
|
21
|
+
|
22
|
+
domain = ARGV.shift
|
6
23
|
|
7
24
|
if domain.nil?
|
8
|
-
|
25
|
+
Trollop::die "Domain name is mandatory. Type --help for help"
|
26
|
+
else
|
27
|
+
Trollop::options do
|
28
|
+
opt :domain, "Domain address", :type => :string
|
29
|
+
end
|
9
30
|
end
|
10
31
|
|
11
|
-
Rawler::Base.new(domain, $stdout).validate
|
32
|
+
Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
|
33
|
+
|
data/lib/rawler.rb
CHANGED
@@ -1,13 +1,22 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'net/http'
|
3
|
+
require 'net/https'
|
3
4
|
require 'nokogiri'
|
4
5
|
|
5
6
|
$:.unshift(File.dirname(__FILE__)) unless
|
6
7
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
7
8
|
|
9
|
+
require 'rawler/core_extensions'
|
10
|
+
|
8
11
|
module Rawler
|
9
|
-
VERSION = '0.0.
|
12
|
+
VERSION = '0.0.3'
|
13
|
+
|
14
|
+
mattr_accessor :output
|
15
|
+
mattr_accessor :url
|
16
|
+
|
17
|
+
mattr_accessor :username, :password
|
10
18
|
|
11
19
|
autoload :Base, "rawler/base"
|
12
20
|
autoload :Crawler, "rawler/crawler"
|
21
|
+
autoload :Request, "rawler/request"
|
13
22
|
end
|
data/lib/rawler/base.rb
CHANGED
@@ -2,16 +2,19 @@ module Rawler
|
|
2
2
|
|
3
3
|
class Base
|
4
4
|
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :responses
|
6
6
|
|
7
|
-
def initialize(url, output)
|
8
|
-
@url = url
|
7
|
+
def initialize(url, output, username=nil, password=nil)
|
9
8
|
@responses = {}
|
10
|
-
|
9
|
+
|
10
|
+
Rawler.url = url
|
11
|
+
Rawler.output = output
|
12
|
+
Rawler.username = username
|
13
|
+
Rawler.password = password
|
11
14
|
end
|
12
15
|
|
13
16
|
def validate
|
14
|
-
validate_links_in_page(url)
|
17
|
+
validate_links_in_page(Rawler.url)
|
15
18
|
end
|
16
19
|
|
17
20
|
private
|
@@ -30,32 +33,29 @@ module Rawler
|
|
30
33
|
end
|
31
34
|
|
32
35
|
def add_status_code(link)
|
33
|
-
|
34
|
-
|
35
|
-
response = nil
|
36
|
-
|
37
|
-
Net::HTTP.start(uri.host, uri.port) do |http|
|
38
|
-
path = (uri.path.size == 0) ? "/" : uri.path
|
39
|
-
response = http.head(path, {'User-Agent'=>'Rawler'})
|
40
|
-
end
|
36
|
+
response = Rawler::Request.get(link)
|
41
37
|
|
42
|
-
|
38
|
+
write("#{response.code} - #{link}")
|
43
39
|
responses[link] = { :status => response.code.to_i }
|
44
40
|
rescue Errno::ECONNREFUSED
|
45
|
-
|
41
|
+
write("Connection refused - '#{link}'")
|
46
42
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
47
43
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
48
|
-
|
44
|
+
write("Connection problems - '#{link}'")
|
49
45
|
end
|
50
46
|
|
51
47
|
def same_domain?(link)
|
52
|
-
URI.parse(url).host == URI.parse(link).host
|
48
|
+
URI.parse(Rawler.url).host == URI.parse(link).host
|
53
49
|
end
|
54
50
|
|
55
51
|
def not_yet_parsed?(link)
|
56
52
|
responses[link].nil?
|
57
53
|
end
|
58
54
|
|
55
|
+
def write(message)
|
56
|
+
Rawler.output.puts(message)
|
57
|
+
end
|
58
|
+
|
59
59
|
end
|
60
60
|
|
61
61
|
end
|
data/lib/rawler/crawler.rb
CHANGED
@@ -9,12 +9,16 @@ module Rawler
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def links
|
12
|
-
|
12
|
+
if different_domain?(url, Rawler.url) || not_html?(url)
|
13
|
+
return []
|
14
|
+
end
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
+
response = Rawler::Request.get(url)
|
17
|
+
|
18
|
+
doc = Nokogiri::HTML(response.body)
|
19
|
+
doc.css('a').map { |a| absolute_url(a['href']) }.select { |url| valid_url?(url) }
|
16
20
|
rescue Errno::ECONNREFUSED
|
17
|
-
|
21
|
+
write("Couldn't connect to #{url}")
|
18
22
|
[]
|
19
23
|
end
|
20
24
|
|
@@ -23,6 +27,24 @@ module Rawler
|
|
23
27
|
def absolute_url(path)
|
24
28
|
URI.parse(url).merge(path.to_s).to_s
|
25
29
|
end
|
30
|
+
|
31
|
+
def write(message)
|
32
|
+
Rawler.output.puts(message)
|
33
|
+
end
|
34
|
+
|
35
|
+
def different_domain?(url_1, url_2)
|
36
|
+
URI.parse(url_1).host != URI.parse(url_2).host
|
37
|
+
end
|
38
|
+
|
39
|
+
def not_html?(url)
|
40
|
+
Rawler::Request.head(url).content_type != 'text/html'
|
41
|
+
end
|
42
|
+
|
43
|
+
def valid_url?(url)
|
44
|
+
scheme = URI.parse(url).scheme
|
45
|
+
|
46
|
+
['http', 'https'].include?(scheme)
|
47
|
+
end
|
26
48
|
|
27
49
|
end
|
28
50
|
|
data/spec/spec_helper.rb
CHANGED
@@ -5,6 +5,6 @@ require 'fakeweb'
|
|
5
5
|
|
6
6
|
FakeWeb.allow_net_connect = false
|
7
7
|
|
8
|
-
def register(uri, content, status=200)
|
9
|
-
FakeWeb.register_uri(:any, uri, :body => content, :status => status)
|
8
|
+
def register(uri, content, status=200, options={})
|
9
|
+
FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
|
10
10
|
end
|
data/spec/unit/base_spec.rb
CHANGED
@@ -72,6 +72,36 @@ describe Rawler::Base do
|
|
72
72
|
rawler.responses[url][:status].should == 302
|
73
73
|
end
|
74
74
|
|
75
|
+
it "should save username and password" do
|
76
|
+
rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
|
77
|
+
|
78
|
+
Rawler.username.should == 'my_user'
|
79
|
+
Rawler.password.should == 'secret'
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should rescue from Errno::ECONNREFUSED" do
|
83
|
+
url = 'http://example.com'
|
84
|
+
|
85
|
+
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
86
|
+
|
87
|
+
output.should_receive(:puts).with("Connection refused - '#{url}'")
|
88
|
+
|
89
|
+
rawler.send(:add_status_code, url)
|
90
|
+
end
|
91
|
+
|
92
|
+
[Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
93
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
|
94
|
+
it "should rescue from #{error}" do
|
95
|
+
url = 'http://example.com'
|
96
|
+
|
97
|
+
Rawler::Request.should_receive(:get).and_raise error
|
98
|
+
|
99
|
+
output.should_receive(:puts).with("Connection problems - '#{url}'")
|
100
|
+
|
101
|
+
rawler.send(:add_status_code, url)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
75
105
|
end
|
76
106
|
|
77
107
|
|
data/spec/unit/crawler_spec.rb
CHANGED
@@ -2,40 +2,96 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
|
|
2
2
|
|
3
3
|
describe Rawler::Crawler do
|
4
4
|
|
5
|
+
let(:url) { 'http://example.com' }
|
6
|
+
|
7
|
+
before(:each) do
|
8
|
+
Rawler.stub!(:url).and_return(url)
|
9
|
+
end
|
10
|
+
|
5
11
|
it "should parse all links" do
|
6
|
-
url = 'http://example.com/'
|
7
12
|
register(url, site)
|
8
13
|
|
9
14
|
Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
|
10
15
|
end
|
11
16
|
|
17
|
+
it "should parse relative links" do
|
18
|
+
url = 'http://example.com/path'
|
19
|
+
register(url, '<a href="/foo">foo</a>')
|
20
|
+
|
21
|
+
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should parse links only if the page is in the same domain as the main url" do
|
25
|
+
url = 'http://external.com/path'
|
26
|
+
register(url, '<a href="/foo">foo</a>')
|
27
|
+
|
28
|
+
Rawler.should_receive(:url).and_return('http://example.com')
|
29
|
+
|
30
|
+
Rawler::Crawler.new(url).links.should == []
|
31
|
+
end
|
32
|
+
|
12
33
|
it "should return an empty array when raising Errno::ECONNREFUSED" do
|
13
|
-
url = 'http://example.com'
|
14
34
|
register(url, site)
|
35
|
+
crawler = Rawler::Crawler.new(url)
|
15
36
|
|
16
|
-
|
37
|
+
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
17
38
|
|
18
|
-
crawler
|
39
|
+
crawler.links.should == []
|
19
40
|
end
|
20
41
|
|
21
|
-
it "should
|
22
|
-
|
23
|
-
register(url,
|
42
|
+
it "should print a message when raising Errno::ECONNREFUSED" do
|
43
|
+
output = double('output')
|
44
|
+
register(url, site)
|
45
|
+
|
46
|
+
crawler = Rawler::Crawler.new(url)
|
47
|
+
|
48
|
+
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
49
|
+
Rawler.should_receive(:output).and_return(output)
|
50
|
+
output.should_receive(:puts).with("Couldn't connect to #{url}")
|
51
|
+
|
52
|
+
crawler.links
|
53
|
+
end
|
54
|
+
|
55
|
+
context "should ignore content type other than text/html" do
|
56
|
+
|
57
|
+
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
58
|
+
|
59
|
+
it "should ignore '#{content_type}'" do
|
60
|
+
register(url, site, 200, :content_type => content_type)
|
61
|
+
|
62
|
+
crawler = Rawler::Crawler.new(url)
|
63
|
+
crawler.links.should == []
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
24
67
|
|
25
|
-
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
26
68
|
end
|
27
69
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
70
|
+
it "should ignore links other than http or https" do
|
71
|
+
content = <<-content
|
72
|
+
<a href="http://example.com/valid">foo</a>
|
73
|
+
<a href="mailto:info@example.com">invalid</a>
|
74
|
+
<a href="https://foo.com">valid</a>
|
75
|
+
content
|
76
|
+
|
77
|
+
register(url, content)
|
78
|
+
|
79
|
+
crawler = Rawler::Crawler.new(url)
|
80
|
+
crawler.links.should == ['http://example.com/valid', 'https://foo.com']
|
81
|
+
end
|
82
|
+
|
83
|
+
it "should crawl http basic pages" do
|
84
|
+
content = '<a href="http://example.com/secret-path">foo</a>'
|
85
|
+
|
86
|
+
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
87
|
+
register('http://foo:bar@example.com/secret', content)
|
88
|
+
|
89
|
+
Rawler.stub!(:username).and_return('foo')
|
90
|
+
Rawler.stub!(:password).and_return('bar')
|
91
|
+
|
92
|
+
crawler = Rawler::Crawler.new('http://example.com/secret')
|
93
|
+
crawler.links.should == ['http://example.com/secret-path']
|
94
|
+
end
|
39
95
|
|
40
96
|
private
|
41
97
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-11 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -48,7 +48,10 @@ dependencies:
|
|
48
48
|
version: 2.8.0
|
49
49
|
type: :development
|
50
50
|
version_requirements: *id002
|
51
|
-
description:
|
51
|
+
description: |-
|
52
|
+
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
53
|
+
|
54
|
+
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
52
55
|
email:
|
53
56
|
- info@oscardelben.com
|
54
57
|
executables:
|
@@ -107,6 +110,6 @@ rubyforge_project: oscardelben
|
|
107
110
|
rubygems_version: 1.4.1
|
108
111
|
signing_key:
|
109
112
|
specification_version: 3
|
110
|
-
summary: Rawler is a Ruby library that crawls your website and
|
113
|
+
summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
|
111
114
|
test_files: []
|
112
115
|
|