crawler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /doc/*
+ /log/*
+ /pkg/*
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.2.0
data/bin/crawler ADDED
@@ -0,0 +1,37 @@
+ #! /usr/bin/ruby
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'rubygems'
+ require 'crawler'
+ require 'optparse'
+
+ options = {}
+ logfile = ""
+ optparser = OptionParser.new do |opts|
+   opts.on("-t", "--timeout X", Integer, "Timeout limit in seconds") { |x| options[:timeout] = x }
+   opts.on("-x", "--exclude path", Array, "List of paths to be excluded") { |x| options[:exclude] = x }
+   opts.on("-l", "--log file", String, "Filename to use as a log") { |log| logfile = log }
+   opts.parse!(ARGV)
+ end
+
+ unless logfile.empty?
+   log = File.new(logfile, "w")
+ else
+   log = $stdout
+ end
+
+
+ uri_string = ARGV[0]
+ begin
+   uri = URI.parse(uri_string)
+   raise unless uri.is_a?(URI::HTTP)
+ rescue
+   puts "Error parsing URI: #{uri_string}"
+   Process.exit
+ end
+
+ crawler = Crawler::Webcrawler.new(options)
+ observer = Crawler::Observer.new(log)
+
+ crawler.add_observer(observer)
+
+ crawler.crawl(uri)
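For reference, a crawl started from this executable looks something like: crawler --timeout 60 --exclude /private/ --log crawl.log http://example.com/ (the host, path and filenames here are only placeholders, not part of the gem).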
data/lib/crawler.rb ADDED
@@ -0,0 +1,2 @@
+ require 'crawler/webcrawler'
+ require 'crawler/observer'
data/lib/crawler/observer.rb ADDED
@@ -0,0 +1,22 @@
+ module Crawler
+
+   # Observer watches a Webcrawler and outputs messages to a log object. This defaults to STDOUT but may be anything which responds to +puts+.
+   class Observer
+
+     # Log object. Must respond to +puts+.
+     attr_accessor :log
+
+     # Creates a new Observer object
+     def initialize(log=$stdout)
+       @log = log
+     end
+
+     # Called by the Observable module through Webcrawler.
+     def update(response, url)
+       @log.puts "Scanning: " + url.to_s
+       if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
+         @log.puts "#{response.code} encountered for " + url.to_s
+       end
+     end
+   end
+ end
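The log object documented above only needs to respond to puts, so anything from $stdout to an in-memory buffer works. A minimal sketch, assuming the gem is installed and using a placeholder URL:

    require 'rubygems'
    require 'stringio'
    require 'crawler'

    buffer = StringIO.new                     # collects messages instead of printing them
    observer = Crawler::Observer.new(buffer)  # an open File object would work the same way

    crawler = Crawler::Webcrawler.new
    crawler.add_observer(observer)
    crawler.crawl(URI.parse("http://example.com/"))  # placeholder host

    puts buffer.string                        # everything the observer logged during the crawl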
data/lib/crawler/webcrawler.rb ADDED
@@ -0,0 +1,74 @@
+ require 'set'
+ require 'observer'
+ require 'net/http'
+ require 'nokogiri'
+ require 'timeout'
+
+ module Crawler
+   class Webcrawler
+
+     include Observable
+
+     # Set of all URIs which have been crawled
+     attr_accessor :crawled
+     # Queue of URIs to be crawled. Array which acts as a FIFO queue.
+     attr_accessor :queue
+     # Hash of options
+     attr_accessor :options
+
+     # Accepts the following options:
+     # * timeout -- Time limit for the crawl operation, after which a Timeout::Error exception is raised.
+     def initialize(options={})
+       @crawled = Set.new
+       @queue = []
+       @options = {
+         :timeout => 1.0/0, # Infinity
+         :external => false,
+         :exclude => []
+       }.merge(options)
+
+     end
+
+     # Given a URI object, the crawler will explore every linked page recursively using the Breadth First Search algorithm.
+     def crawl(start_uri)
+       start_uri = start_uri.normalize
+       @queue << start_uri
+
+       timeout(@options[:timeout]) {
+         while(uri = @queue.shift)
+
+           Net::HTTP.start(uri.host, uri.port) do |http|
+
+             head = http.head(uri.path)
+             next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it
+
+             resp = http.get(uri.path)
+
+             changed
+             notify_observers(resp, uri)
+
+             html = Nokogiri.parse(resp.body)
+             a_tags = html.search("a")
+             @queue = @queue + a_tags.collect do |t|
+               begin
+                 next_uri = uri + t.attribute("href").to_s.strip
+               rescue
+                 nil
+               end
+             end
+             @queue = @queue.compact.uniq
+             @queue = @queue.reject {|u|
+               @crawled.include?(u) or
+               u == uri or
+               !(u.kind_of?(URI::HTTP)) or
+               (u.host != uri.host and !@options[:external]) or
+               (@options[:exclude].any? { |excl| u.path.include?(excl) })
+             }
+           end
+           @crawled << uri
+         end
+       }
+
+     end
+   end
+ end
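Tying the options above together, a crawl with a time limit and an exclusion list could be set up as follows; this is an illustrative sketch (host and paths are placeholders), not code shipped with the gem:

    require 'rubygems'
    require 'crawler'

    crawler = Crawler::Webcrawler.new(
      :timeout  => 300,            # raise Timeout::Error if the whole crawl exceeds 5 minutes
      :exclude  => ["/private/"],  # skip URIs whose path contains this substring
      :external => false           # default: never leave the starting host
    )
    crawler.add_observer(Crawler::Observer.new)  # logs to $stdout by default

    crawler.crawl(URI.parse("http://example.com/"))  # placeholder start page
    # crawler.crawled now holds a Set of every URI that was fetched,
    # and crawler.queue is empty.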
data/spec/crawler/crawler_spec.rb ADDED
@@ -0,0 +1,136 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+
+ module Crawler
+   describe Webcrawler do
+
+     before(:all) do
+       @uri_base = 'http://localhost:12000/'
+       www_root = File.join(File.dirname(__FILE__), '..', 'fixtures')
+       @server = Thread.new do
+         s = WEBrick::HTTPServer.new({:Port => 12000, :DocumentRoot => www_root, :AccessLog => []})
+         @port = s.config[:Port]
+         begin
+           s.start
+         ensure
+           s.shutdown
+         end
+       end
+     end
+
+     after(:all) do
+       @server.exit
+     end
+
+     context "before crawl" do
+       it "should have an empty crawl list" do
+         crawler = Webcrawler.new
+         crawler.crawled.should be_empty
+       end
+     end
+
+     context "during a crawl" do
+
+       before(:each) do
+         @crawler = Webcrawler.new
+         @obs = mock("observer", :update => nil, :null_object => true)
+         #@obs = Observer.new
+         @crawler.add_observer(@obs)
+       end
+
+       it "should send notifications" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update)
+         @crawler.crawl(uri)
+       end
+
+       it "should send status code and URL" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), kind_of(URI))
+         @crawler.crawl(uri)
+       end
+
+       it "should send 404 for missing URL" do
+         uri = URI.parse(@uri_base + 'doesnotexist.html')
+         @obs.should_receive(:update).with(instance_of(Net::HTTPNotFound), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should not crawl a page more than once" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html').once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html')
+         @crawler.crawl(uri)
+       end
+
+       it "should not add the current page to the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri).once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should remove nil items from the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).twice
+         @crawler.crawl(uri)
+       end
+
+       it "should convert any exceptions to nil" do
+         uri = URI.parse(@uri_base + 'messed-up.html')
+         lambda { @crawler.crawl(uri) }.should_not raise_error
+       end
+
+       it "should not crawl anything but HTTP web addresses" do
+         uri = URI.parse(@uri_base + 'non-http.html')
+         @obs.should_receive(:update).once
+         @crawler.crawl(uri)
+       end
+
+       it "should not, by default, crawl outside its original host" do
+         uri = URI.parse(@uri_base + 'external.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), URI.parse("http://example.com"))
+         @crawler.crawl(uri)
+       end
+
+       it "should only download HTML content types" do
+         uri = URI.parse(@uri_base + 'non-html.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/pdf.pdf')
+         @crawler.crawl(uri)
+       end
+
+       it "should not download anything in the excluded option" do
+         uri = URI.parse(@uri_base + 'exclusion.html')
+         @crawler.options[:exclude] = ["/excluded/"]
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/excluded/shouldnt-hit.html')
+         @crawler.crawl(uri)
+       end
+
+
+     end
+
+     context "after crawl" do
+       before(:each) do
+         @crawler = Webcrawler.new
+         @uri = URI.parse(@uri_base)
+         @crawler.crawl(@uri)
+       end
+
+       it "should have at least one item in crawled" do
+         @crawler.should have_at_least(1).crawled
+       end
+
+       it "should have put crawled links into crawled" do
+         @crawler.should have_at_least(2).crawled
+       end
+
+       it "should have the children of child pages in crawled" do
+         @crawler.crawled.should include(@uri + "/page4.html")
+       end
+
+       it "should have an empty queue" do
+         @crawler.queue.should be_empty
+       end
+
+     end
+   end
+ end
data/spec/crawler/observer_spec.rb ADDED
@@ -0,0 +1,28 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+ require 'stringio'
+
+ module Crawler
+   describe Observer do
+
+     def test_code(code, log, obs)
+       log.should_receive(:puts).with("#{code} encountered for http://example.com/")
+       resp = Net::HTTPResponse::CODE_TO_OBJ["#{code}"].new("1.1", code, "")
+       obs.update(resp, URI.parse("http://example.com/"))
+     end
+
+     it "should output a warning when an error code is reached" do
+       log = double('log', :null_object => true)
+       obs = Observer.new(log)
+       (400..416).each { |code| test_code(code, log, obs) }
+       (500..505).each { |code| test_code(code, log, obs) }
+     end
+
+     it "should not output a warning when 200 is encountered" do
+       log = double('log')
+       obs = Observer.new(log)
+       log.should_not_receive(:puts).with(/\d{3} encountered/)
+       obs.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
+     end
+
+   end
+ end
data/spec/fixtures/excluded/shouldnt-hit.html ADDED
@@ -0,0 +1,6 @@
+ <html>
+ <head>
+ </head>
+ <body>
+ you should never see this
+ </body>
data/spec/fixtures/exclusion.html ADDED
@@ -0,0 +1,7 @@
+ <html>
+ <head>
+ </head>
+ <body>
+ <a href="/excluded/shouldnt-hit.html">nope</a>
+ </body>
+ </html>
data/spec/fixtures/external.html ADDED
@@ -0,0 +1,8 @@
+ <html>
+ <head>
+ <title>External</title>
+ </head>
+ <body>
+ <a href="http://example.com/">example</a>
+ </body>
+ </html>
data/spec/fixtures/index.html ADDED
@@ -0,0 +1,9 @@
+ <html>
+ <head>
+ <title>Test</title>
+ </head>
+ <body>
+ <a href="page2.html">Page 2</a>
+ <a href="page3.html">Page 3</a>
+ </body>
+ </html>
data/spec/fixtures/messed-up.html ADDED
@@ -0,0 +1,7 @@
+ <html>
+ <head></head>
+ <body>
+ <a href=" flange.html">yes</a>
+ <a href="javascript: void(0);">javascript</a>
+ </body>
+ </html>
data/spec/fixtures/non-html.html ADDED
@@ -0,0 +1,7 @@
+ <html>
+ <head>
+ </head>
+ <body>
+ <a href="/pdf.pdf">pdf</a>
+ </body>
+ </html>
data/spec/fixtures/non-http.html ADDED
@@ -0,0 +1,14 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+ <title>non-http</title>
+ </head>
+ <body>
+ <!-- None of the following should be followed -->
+ <a href="mailto:test@example.com">mailto</a>
+ <a href="ftp://ftp.example.com">ftp</a>
+
+ </body>
+ </html>
data/spec/fixtures/page2.html ADDED
@@ -0,0 +1,9 @@
+ <html>
+ <head>
+ <title>test2</title>
+ </head>
+ <body>
+ <a href="/page4.html">Link</a>
+ <a href="/page5.html">Link</a>
+ </body>
+ </html>
data/spec/fixtures/page3.html ADDED
@@ -0,0 +1,8 @@
+ <html>
+ <head>
+ <title>test2</title>
+ </head>
+ <body>
+ <a href="/page5.html">Link</a>
+ </body>
+ </html>
data/spec/fixtures/page4.html
File without changes
data/spec/fixtures/page5.html ADDED
@@ -0,0 +1,8 @@
+ <html>
+ <head>
+ <title>something</title>
+ </head>
+ <body>
+ <a href="/">Whatever</a>
+ </body>
+ </html>
data/spec/fixtures/pdf.pdf
File without changes
data/spec/fixtures/self-reference.html ADDED
@@ -0,0 +1,13 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+ <title>self-reference</title>
+ </head>
+ <body>
+ <a href="self-reference.html">link</a> <!-- will be converted to nil -->
+ <a href="page5.html">link</a>
+
+ </body>
+ </html>
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,4 @@
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'spec'
+ require 'crawler'
+ require 'WEBrick'
metadata ADDED
@@ -0,0 +1,87 @@
+ --- !ruby/object:Gem::Specification
+ name: crawler
+ version: !ruby/object:Gem::Version
+   version: 0.2.0
+ platform: ruby
+ authors:
+ - Tyler Cunnion
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-01-25 00:00:00 -05:00
+ default_executable: crawler
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ description: BFS webcrawler that implements Observable
+ email: tyler.cunnion@gmail.com
+ executables:
+ - crawler
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - .gitignore
+ - VERSION
+ - bin/crawler
+ - lib/crawler.rb
+ - lib/crawler/observer.rb
+ - lib/crawler/webcrawler.rb
+ - spec/crawler/crawler_spec.rb
+ - spec/crawler/observer_spec.rb
+ - spec/fixtures/excluded/shouldnt-hit.html
+ - spec/fixtures/exclusion.html
+ - spec/fixtures/external.html
+ - spec/fixtures/index.html
+ - spec/fixtures/messed-up.html
+ - spec/fixtures/non-html.html
+ - spec/fixtures/non-http.html
+ - spec/fixtures/page2.html
+ - spec/fixtures/page3.html
+ - spec/fixtures/page4.html
+ - spec/fixtures/page5.html
+ - spec/fixtures/pdf.pdf
+ - spec/fixtures/self-reference.html
+ - spec/spec_helper.rb
+ has_rdoc: true
+ homepage: http://github.com/tylercunnion/crawler
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --charset=UTF-8
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Simple webcrawler
+ test_files:
+ - spec/crawler/crawler_spec.rb
+ - spec/crawler/observer_spec.rb
+ - spec/spec_helper.rb