rawler 0.0.4 → 0.0.5

data/.autotest ADDED
@@ -0,0 +1,23 @@
+ # -*- ruby -*-
+
+ require 'autotest/restart'
+
+ # Autotest.add_hook :initialize do |at|
+ # at.extra_files << "../some/external/dependency.rb"
+ #
+ # at.libs << ":../some/external"
+ #
+ # at.add_exception 'vendor'
+ #
+ # at.add_mapping(/dependency.rb/) do |f, _|
+ # at.files_matching(/test_.*rb$/)
+ # end
+ #
+ # %w(TestA TestB).each do |klass|
+ # at.extra_class_map[klass] = "test/test_misc.rb"
+ # end
+ # end
+
+ # Autotest.add_hook :run_command do |at|
+ # system "rake build"
+ # end
data/Manifest.txt CHANGED
@@ -1,17 +1,22 @@
+ .autotest
  History.txt
  Manifest.txt
  README.txt
  Rakefile
  bin/rawler
+ lib/rawler.rb
  lib/rawler/base.rb
- lib/rawler/core_extensions/module.rb
  lib/rawler/core_extensions.rb
+ lib/rawler/core_extensions/module.rb
  lib/rawler/crawler.rb
  lib/rawler/request.rb
- lib/rawler.rb
  spec/spec.opts
  spec/spec_helper.rb
  spec/unit/base_spec.rb
- spec/unit/crawler_spec.rb
+ spec/unit/crawler/base_spec.rb
+ spec/unit/crawler/content_type_spec.rb
+ spec/unit/crawler/exceptions_spec.rb
+ spec/unit/crawler/http_basic_spec.rb
+ spec/unit/crawler/url_domain_spec.rb
  tasks/rspec.rake
  vendor/lib-trollop.rb
data/README.txt CHANGED
@@ -1,6 +1,6 @@
  = rawler
 
- * http://github.com/#{github_username}/#{project_name}
+ * http://github.com/oscardelben/rawler
 
  == DESCRIPTION:
 
@@ -8,6 +8,8 @@ Rawler is a Ruby library that crawls your website and checks the status code for
 
  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 
+ Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
+
  == SYNOPSIS:
 
  rawler http://example.com [options]
@@ -24,8 +26,15 @@ gem install rawler
 
  == TODO
 
+ * Follow redirects, but still inform about them
+ * Respect robots.txt
  * Export to html
 
+ == CONTRIBUTORS:
+
+ * Vesa Vänskä https://github.com/vesan
+ * Hugh Sasse
+
  == LICENSE:
 
  (The MIT License)
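An aside on the "temporarily removed url encoding" note above: the 0.0.4 code wrapped URLs in URI.encode before parsing them (see the lib/rawler/base.rb, lib/rawler/crawler.rb and lib/rawler/request.rb hunks below, where those calls are dropped). A minimal sketch of the kind of breakage that can cause, assuming the default unsafe-character handling of the old stdlib URI.encode/URI.escape; the example URLs are made up, and the snippet only runs on Rubies that still ship URI.encode (it was removed in Ruby 3.0):

  require 'uri'

  # '#' is not in URI.escape's default safe set, so fragment links get mangled:
  URI.encode('http://example.com/page-with#hashtag')
  # => "http://example.com/page-with%23hashtag"

  # '%' is re-escaped, so already-encoded URLs end up double-encoded:
  URI.encode('http://example.com/a%20b')
  # => "http://example.com/a%2520b"

That would also explain the "links with #hashtags" spec added to spec/unit/base_spec.rb in this release.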
data/lib/rawler.rb CHANGED
@@ -6,7 +6,7 @@ require 'nokogiri'
  require 'rawler/core_extensions'
 
  module Rawler
- VERSION = '0.0.4'
+ VERSION = '0.0.5'
 
  mattr_accessor :output
  mattr_accessor :url
data/lib/rawler/base.rb CHANGED
@@ -22,6 +22,8 @@ module Rawler
  def validate_links_in_page(current_url)
  Rawler::Crawler.new(current_url).links.each do |page_url|
  validate_page(page_url)
+ # Todo: include this in a configuration option
+ sleep(3)
  end
  end
 
@@ -39,13 +41,13 @@ module Rawler
  responses[link] = { :status => response.code.to_i }
  rescue Errno::ECONNREFUSED
  write("Connection refused - '#{link}'")
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
  write("Connection problems - '#{link}'")
  end
 
  def same_domain?(link)
- URI.parse(URI.encode(Rawler.url)).host == URI.parse(URI.encode(link)).host
+ URI.parse(Rawler.url).host == URI.parse(link).host
  end
 
  def not_yet_parsed?(link)
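The hard-coded sleep(3) added in the first base.rb hunk above carries a TODO about making it a configuration option. A sketch of how that could look, reusing the mattr_accessor pattern that lib/rawler.rb already uses for output and url; the wait accessor name and its default are hypothetical, not part of this release:

  # lib/rawler.rb (hypothetical sketch)
  module Rawler
    mattr_accessor :wait   # assumed name: seconds to pause between requests
    self.wait = 3
  end

  # lib/rawler/base.rb (hypothetical sketch)
  def validate_links_in_page(current_url)
    Rawler::Crawler.new(current_url).links.each do |page_url|
      validate_page(page_url)
      sleep(Rawler.wait) if Rawler.wait.to_i > 0
    end
  end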
data/lib/rawler/crawler.rb CHANGED
@@ -20,12 +20,15 @@ module Rawler
  rescue Errno::ECONNREFUSED
  write("Couldn't connect to #{url}")
  []
+ rescue Errno::ETIMEDOUT
+ write("Connection to #{url} timed out")
+ []
  end
 
  private
 
  def absolute_url(path)
- URI.parse(URI.encode(url)).merge(URI.encode(path.to_s)).to_s
+ URI.parse(url).merge(path.to_s).to_s
  end
 
  def write(message)
@@ -33,7 +36,7 @@ module Rawler
  end
 
  def different_domain?(url_1, url_2)
- URI.parse(URI.encode(url_1)).host != URI.parse(URI.encode(url_2)).host
+ URI.parse(url_1).host != URI.parse(url_2).host
  end
 
  def not_html?(url)
@@ -41,7 +44,7 @@ module Rawler
  end
 
  def valid_url?(url)
- scheme = URI.parse(URI.encode(url)).scheme
+ scheme = URI.parse(url).scheme
 
  ['http', 'https'].include?(scheme)
  end
data/lib/rawler/request.rb CHANGED
@@ -15,7 +15,7 @@ module Rawler
  private
 
  def perform_request(method, url)
- uri = URI.parse(URI.encode(url))
+ uri = URI.parse(url)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == 'https')
 
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,11 @@
+ module Kernel
+
+ def sleep(duration)
+ nil
+ end
+
+ end
+
 
  $:.unshift(File.dirname(__FILE__) + '/../lib')
  require 'rawler'
data/spec/unit/base_spec.rb CHANGED
@@ -49,6 +49,15 @@ describe Rawler::Base do
 
  rawler.validate
  end
+
+ it "should validate links with #hashtags" do
+ register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
+ register('http://example.com/page-with', '')
+
+ output.should_receive(:puts).with('200 - http://example.com/page-with#hashtag')
+
+ rawler.validate
+ end
 
  end
 
@@ -89,7 +98,7 @@ describe Rawler::Base do
  rawler.send(:add_status_code, url)
  end
 
- [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+ [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
  Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
  it "should rescue from #{error}" do
  url = 'http://example.com'
data/spec/unit/crawler/base_spec.rb ADDED
@@ -0,0 +1,75 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "basic functionality" do
+
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) {
+ content = <<-content
+ <p><a href="http://example.com/foo">foo</a></p>
+
+ <p><a href="http://external.com/bar">bar</a></p>
+ content
+ }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse all links" do
+ crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
+ end
+
+ end
+
+ context "relative paths" do
+
+ let(:url) { 'http://example.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { '<a href="/foo">foo</a>' }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse relative links" do
+ crawler.links.should == ['http://example.com/foo']
+ end
+
+ end
+
+ context "different domains" do
+
+ let(:url) { 'http://external.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { '<a href="/foo">foo</a>' }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse relative links" do
+ crawler.links.should == []
+ end
+
+ end
+
+ context "urls with hash tags" do
+
+ let(:url) { 'http://example.com/path' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:content) { '<a href="/foo#bar">foo</a>' }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should parse relative links" do
+ crawler.links.should == ['http://example.com/foo#bar']
+ end
+
+ end
+
+ end
data/spec/unit/crawler/content_type_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "content type" do
+
+ ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+
+ before(:each) do
+ register(url, '', 200, :content_type => content_type)
+ end
+
+ it "should ignore '#{content_type}'" do
+ crawler.links.should == []
+ end
+
+ end
+ end
+
+ end
data/spec/unit/crawler/exceptions_spec.rb ADDED
@@ -0,0 +1,54 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "Exceptions" do
+
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+ let(:output) { double('output', :puts => nil) }
+
+ before(:each) do
+ register(url, '')
+ Rawler.stub!(:output).and_return(output)
+ end
+
+ context "Errno::ECONNREFUSED" do
+
+ before(:each) do
+ Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
+ end
+
+ it "should return an empty array" do
+ crawler.links.should == []
+ end
+
+ it "should print a message when raising Errno::ECONNREFUSED" do
+ output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+ crawler.links
+ end
+
+ end
+
+ context "Errno::ETIMEDOUT" do
+
+ before(:each) do
+ Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
+ end
+
+ it "should return an empty array when raising Errno::ETIMEDOUT" do
+ crawler.links.should == []
+ end
+
+ it "should print a message when raising Errno::ETIMEDOUT" do
+ output.should_receive(:puts).with("Connection to #{url} timed out")
+
+ crawler.links
+ end
+
+ end
+
+ end
+
+ end
data/spec/unit/crawler/http_basic_spec.rb ADDED
@@ -0,0 +1,25 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "http basic" do
+
+ let(:url) { 'http://example.com' }
+ let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
+ let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
+
+ before(:each) do
+ register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+ register('http://foo:bar@example.com/secret', content)
+
+ Rawler.stub!(:username).and_return('foo')
+ Rawler.stub!(:password).and_return('bar')
+ end
+
+ it "should crawl http basic pages" do
+ crawler.links.should == ['http://example.com/secret-path']
+ end
+
+ end
+
+ end
data/spec/unit/crawler/url_domain_spec.rb ADDED
@@ -0,0 +1,26 @@
+ require File.dirname(__FILE__) + '/../../spec_helper.rb'
+
+ describe Rawler::Crawler do
+
+ context "url domain" do
+
+ let(:content) {
+ content = <<-content
+ <a href="http://example.com/valid">foo</a>
+ <a href="mailto:info@example.com">invalid</a>
+ <a href="https://foo.com">valid</a>
+ content
+ }
+ let(:url) { 'http://example.com' }
+ let(:crawler) { Rawler::Crawler.new(url) }
+
+ before(:each) do
+ register(url, content)
+ end
+
+ it "should ignore links other than http or https" do
+ crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+ end
+ end
+
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: rawler
  version: !ruby/object:Gem::Version
- hash: 23
+ hash: 21
  prerelease:
  segments:
  - 0
  - 0
- - 4
- version: 0.0.4
+ - 5
+ version: 0.0.5
  platform: ruby
  authors:
  - Oscar Del Ben
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2011-01-11 00:00:00 +01:00
+ date: 2011-01-21 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -52,6 +52,8 @@ description: |-
  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
 
  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
+
+ Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
  email:
  - info@oscardelben.com
  executables:
@@ -63,25 +65,30 @@ extra_rdoc_files:
  - Manifest.txt
  - README.txt
  files:
+ - .autotest
  - History.txt
  - Manifest.txt
  - README.txt
  - Rakefile
  - bin/rawler
+ - lib/rawler.rb
  - lib/rawler/base.rb
- - lib/rawler/core_extensions/module.rb
  - lib/rawler/core_extensions.rb
+ - lib/rawler/core_extensions/module.rb
  - lib/rawler/crawler.rb
  - lib/rawler/request.rb
- - lib/rawler.rb
  - spec/spec.opts
  - spec/spec_helper.rb
  - spec/unit/base_spec.rb
- - spec/unit/crawler_spec.rb
+ - spec/unit/crawler/base_spec.rb
+ - spec/unit/crawler/content_type_spec.rb
+ - spec/unit/crawler/exceptions_spec.rb
+ - spec/unit/crawler/http_basic_spec.rb
+ - spec/unit/crawler/url_domain_spec.rb
  - tasks/rspec.rake
  - vendor/lib-trollop.rb
  has_rdoc: true
- homepage: http://github.com/#{github_username}/#{project_name}
+ homepage: http://github.com/oscardelben/rawler
  licenses: []
 
  post_install_message:
data/spec/unit/crawler_spec.rb DELETED
@@ -1,114 +0,0 @@
- require File.dirname(__FILE__) + '/../spec_helper.rb'
-
- describe Rawler::Crawler do
-
- let(:url) { 'http://example.com' }
-
- before(:each) do
- Rawler.stub!(:url).and_return(url)
- end
-
- it "should parse all links" do
- register(url, site)
-
- Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
- end
-
- it "should parse relative links" do
- url = 'http://example.com/path'
- register(url, '<a href="/foo">foo</a>')
-
- Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
- end
-
- it "should parse links only if the page is in the same domain as the main url" do
- url = 'http://external.com/path'
- register(url, '<a href="/foo">foo</a>')
-
- Rawler.should_receive(:url).and_return('http://example.com')
-
- Rawler::Crawler.new(url).links.should == []
- end
-
- it "should return an empty array when raising Errno::ECONNREFUSED" do
- register(url, site)
- crawler = Rawler::Crawler.new(url)
-
- Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
-
- crawler.links.should == []
- end
-
- it "should print a message when raising Errno::ECONNREFUSED" do
- output = double('output')
- register(url, site)
-
- crawler = Rawler::Crawler.new(url)
-
- Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
- Rawler.should_receive(:output).and_return(output)
- output.should_receive(:puts).with("Couldn't connect to #{url}")
-
- crawler.links
- end
-
- context "should ignore content type other than text/html" do
-
- ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
-
- it "should ignore '#{content_type}'" do
- register(url, site, 200, :content_type => content_type)
-
- crawler = Rawler::Crawler.new(url)
- crawler.links.should == []
- end
-
- end
-
- end
-
- it "should ignore links other than http or https" do
- content = <<-content
- <a href="http://example.com/valid">foo</a>
- <a href="mailto:info@example.com">invalid</a>
- <a href="https://foo.com">valid</a>
- content
-
- register(url, content)
-
- crawler = Rawler::Crawler.new(url)
- crawler.links.should == ['http://example.com/valid', 'https://foo.com']
- end
-
- it "should crawl http basic pages" do
- content = '<a href="http://example.com/secret-path">foo</a>'
-
- register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
- register('http://foo:bar@example.com/secret', content)
-
- Rawler.stub!(:username).and_return('foo')
- Rawler.stub!(:password).and_return('bar')
-
- crawler = Rawler::Crawler.new('http://example.com/secret')
- crawler.links.should == ['http://example.com/secret-path']
- end
-
- private
-
- def site
- <<-site
- <!DOCTYPE html>
- <html>
- <body>
- <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
-
- <p><a href="http://example.com/foo">foo</a></p>
-
- <p><a href="http://external.com/bar">bar</a></p>
-
- </body>
- </html>
- site
- end
-
- end