crawler 0.2

@@ -0,0 +1,24 @@
+ Rakefile
+ bin/crawler
+ features/step_definitions/crawler_steps.rb
+ features/support/env.rb
+ lib/crawler.rb
+ lib/crawler/observer.rb
+ lib/crawler/webcrawler.rb
+ spec/crawler/crawler_spec.rb
+ spec/crawler/observer_spec.rb
+ spec/fixtures/excluded/shouldnt-hit.html
+ spec/fixtures/exclusion.html
+ spec/fixtures/external.html
+ spec/fixtures/index.html
+ spec/fixtures/messed-up.html
+ spec/fixtures/non-html.html
+ spec/fixtures/non-http.html
+ spec/fixtures/page2.html
+ spec/fixtures/page3.html
+ spec/fixtures/page4.html
+ spec/fixtures/page5.html
+ spec/fixtures/pdf.pdf
+ spec/fixtures/self-reference.html
+ spec/spec_helper.rb
+ Manifest
@@ -0,0 +1,10 @@
+ require 'rake'
+ require 'echoe'
+
+ Echoe.new('crawler', '0.2') do |g|
+   g.description = "Simple webcrawler"
+   g.url = "http://github.com/tylercunnion/crawler"
+   g.author = "Tyler Cunnion"
+   g.email = "tyler.cunnion+ruby@gmail.com"
+   g.ignore_pattern = ["tmp/*", "features/*", "log/*"]
+ end
@@ -0,0 +1,37 @@
+ #! /usr/bin/ruby
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'rubygems'
+ require 'crawler'
+ require 'optparse'
+
+ options = {}
+ logfile = ""
+ optparser = OptionParser.new do |opts|
+   opts.on("-t", "--timeout X", Integer, "Timeout limit in seconds") { |x| options[:timeout] = x }
+   opts.on("-x", "--exclude path", Array, "List of paths to be excluded") { |x| options[:exclude] = x }
+   opts.on("-l", "--log file", String, "Filename to use as a log") { |log| logfile = log }
+   opts.parse!(ARGV)
+ end
+
+ unless logfile.empty?
+   log = File.new(logfile, "w")
+ else
+   log = $stdout
+ end
+
+
+ uri_string = ARGV[0]
+ begin
+   uri = URI.parse(uri_string)
+   raise unless uri.is_a?(URI::HTTP)
+ rescue
+   puts "Error parsing URI: #{uri_string}"
+   Process.exit
+ end
+
+ crawler = Crawler::Webcrawler.new(options)
+ observer = Crawler::Observer.new(log)
+
+ crawler.add_observer(observer)
+
+ crawler.crawl(uri)
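
The executable above is a thin wrapper around the library. For orientation, here is a minimal sketch of the equivalent programmatic usage; it is not part of the gem, and the option values and the URL http://example.com/ are placeholders:

    require 'rubygems'
    require 'crawler'

    # Placeholder options: abort after 60 seconds, skip any path containing "/private/".
    options = { :timeout => 60, :exclude => ["/private/"] }

    crawler  = Crawler::Webcrawler.new(options)
    observer = Crawler::Observer.new($stdout)   # same default log the script uses when -l is omitted

    crawler.add_observer(observer)
    crawler.crawl(URI.parse("http://example.com/"))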
@@ -0,0 +1,32 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{crawler}
+   s.version = "0.2"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Tyler Cunnion"]
+   s.date = %q{2010-01-25}
+   s.default_executable = %q{crawler}
+   s.description = %q{Simple webcrawler}
+   s.email = %q{tyler.cunnion+ruby@gmail.com}
+   s.executables = ["crawler"]
+   s.extra_rdoc_files = ["bin/crawler", "lib/crawler.rb", "lib/crawler/observer.rb", "lib/crawler/webcrawler.rb"]
+   s.files = ["Rakefile", "bin/crawler", "features/step_definitions/crawler_steps.rb", "features/support/env.rb", "lib/crawler.rb", "lib/crawler/observer.rb", "lib/crawler/webcrawler.rb", "spec/crawler/crawler_spec.rb", "spec/crawler/observer_spec.rb", "spec/fixtures/excluded/shouldnt-hit.html", "spec/fixtures/exclusion.html", "spec/fixtures/external.html", "spec/fixtures/index.html", "spec/fixtures/messed-up.html", "spec/fixtures/non-html.html", "spec/fixtures/non-http.html", "spec/fixtures/page2.html", "spec/fixtures/page3.html", "spec/fixtures/page4.html", "spec/fixtures/page5.html", "spec/fixtures/pdf.pdf", "spec/fixtures/self-reference.html", "spec/spec_helper.rb", "Manifest", "crawler.gemspec"]
+   s.homepage = %q{http://github.com/tylercunnion/crawler}
+   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Crawler"]
+   s.require_paths = ["lib"]
+   s.rubyforge_project = %q{crawler}
+   s.rubygems_version = %q{1.3.5}
+   s.summary = %q{Simple webcrawler}
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+     else
+     end
+   else
+   end
+ end
@@ -0,0 +1,20 @@
+ Given /^the crawl has not begun$/ do
+ end
+
+ When /^I start a crawl with the URI "([^\"]*)"$/ do |arg1|
+   @obs = Crawler::Observer.new
+   @crawler = Crawler::Webcrawler.new
+
+   @crawler.add_observer(@obs)
+
+   @uri = URI.parse(arg1)
+   @crawler.crawl(@uri)
+ end
+
+ Then /^the page should be downloaded$/ do
+   @crawler.crawled.should include(@uri)
+ end
+
+ Then /^the observer should be updated$/ do
+   @obs.should_receive(:update)
+ end
@@ -0,0 +1,2 @@
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "..", "lib")
+ require 'crawler'
@@ -0,0 +1,2 @@
+ require 'crawler/webcrawler'
+ require 'crawler/observer'
@@ -0,0 +1,22 @@
+ module Crawler
+
+   # Observer watches a Webcrawler and outputs messages to a log object. This defaults to STDOUT but may be anything which responds to +puts+.
+   class Observer
+
+     # Log object. Must respond to +puts+.
+     attr_accessor :log
+
+     # Creates a new Observer object
+     def initialize(log=$stdout)
+       @log = log
+     end
+
+     # Called by the Observable module through Webcrawler.
+     def update(response, url)
+       @log.puts "Scanning: " + url.to_s
+       if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
+         @log.puts "#{response.code} encountered for " + url.to_s
+       end
+     end
+   end
+ end
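
Because the log only has to respond to +puts+, the observer can be exercised on its own. The following sketch is illustrative only (it mirrors what spec/crawler/observer_spec.rb does; the URL is a placeholder) and captures the output in a StringIO instead of STDOUT:

    require 'crawler'
    require 'stringio'

    buffer   = StringIO.new                      # any object responding to #puts works as the log
    observer = Crawler::Observer.new(buffer)

    # Feed the observer a hand-built 404 response, as the crawler would for a missing page.
    response = Net::HTTPNotFound.new("1.1", "404", "Not Found")
    observer.update(response, URI.parse("http://example.com/missing.html"))

    puts buffer.string
    # Scanning: http://example.com/missing.html
    # 404 encountered for http://example.com/missing.html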
@@ -0,0 +1,74 @@
+ require 'set'
+ require 'observer'
+ require 'net/http'
+ require 'nokogiri'
+ require 'timeout'
+
+ module Crawler
+   class Webcrawler
+
+     include Observable
+
+     # Set of all URIs which have been crawled
+     attr_accessor :crawled
+     # Queue of URIs to be crawled. Array which acts as a FIFO queue.
+     attr_accessor :queue
+     # Hash of options
+     attr_accessor :options
+
+     # Accepts the following options:
+     # * timeout -- Time limit for the crawl operation, after which a Timeout::Error exception is raised.
+     def initialize(options={})
+       @crawled = Set.new
+       @queue = []
+       @options = {
+         :timeout => 1.0/0, # Infinity
+         :external => false,
+         :exclude => []
+       }.merge(options)
+
+     end
+
+     # Given a URI object, the crawler will explore every linked page recursively using the Breadth First Search algorithm.
+     def crawl(start_uri)
+       start_uri = start_uri.normalize
+       @queue << start_uri
+
+       timeout(@options[:timeout]) {
+         while(uri = @queue.shift)
+
+           Net::HTTP.start(uri.host, uri.port) do |http|
+
+             head = http.head(uri.path)
+             next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it
+
+             resp = http.get(uri.path)
+
+             changed
+             notify_observers(resp, uri)
+
+             html = Nokogiri.parse(resp.body)
+             a_tags = html.search("a")
+             @queue = @queue + a_tags.collect do |t|
+               begin
+                 next_uri = uri + t.attribute("href").to_s.strip
+               rescue
+                 nil
+               end
+             end
+             @queue = @queue.compact.uniq
+             @queue = @queue.reject { |u|
+               @crawled.include?(u) or
+               u == uri or
+               !(u.kind_of?(URI::HTTP)) or
+               (u.host != uri.host and !@options[:external]) or
+               (@options[:exclude].any? { |excl| u.path.include?(excl) })
+             }
+           end
+           @crawled << uri
+         end
+       }
+
+     end
+   end
+ end
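
Putting the options together, here is a hedged sketch of a constrained crawl. The option keys are the ones defined in the defaults hash of #initialize; the values and the start URL http://example.com/ are illustrative only:

    require 'crawler'

    crawler = Crawler::Webcrawler.new(
      :timeout  => 120,             # raise Timeout::Error if the whole crawl takes longer than 120s
      :external => false,           # default: never leave the starting host
      :exclude  => ["/archive/"]    # skip any URI whose path contains this substring
    )

    crawler.crawl(URI.parse("http://example.com/"))

    crawler.crawled  # Set of every URI that was fetched
    crawler.queue    # empty once the crawl completes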
@@ -0,0 +1,136 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+
+ module Crawler
+   describe Webcrawler do
+
+     before(:all) do
+       @uri_base = 'http://localhost:12000/'
+       www_root = File.join(File.dirname(__FILE__), '..', 'fixtures')
+       @server = Thread.new do
+         s = WEBrick::HTTPServer.new({:Port => 12000, :DocumentRoot => www_root, :AccessLog => []})
+         @port = s.config[:Port]
+         begin
+           s.start
+         ensure
+           s.shutdown
+         end
+       end
+     end
+
+     after(:all) do
+       @server.exit
+     end
+
+     context "before crawl" do
+       it "should have an empty crawl list" do
+         crawler = Webcrawler.new
+         crawler.crawled.should be_empty
+       end
+     end
+
+     context "during a crawl" do
+
+       before(:each) do
+         @crawler = Webcrawler.new
+         @obs = mock("observer", :update => nil, :null_object => true)
+         #@obs = Observer.new
+         @crawler.add_observer(@obs)
+       end
+
+       it "should send notifications" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update)
+         @crawler.crawl(uri)
+       end
+
+       it "should send status code and URL" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), kind_of(URI))
+         @crawler.crawl(uri)
+       end
+
+       it "should send 404 for missing URL" do
+         uri = URI.parse(@uri_base + 'doesnotexist.html')
+         @obs.should_receive(:update).with(instance_of(Net::HTTPNotFound), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should not crawl a page more than once" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html').once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html')
+         @crawler.crawl(uri)
+       end
+
+       it "should not add the current page to the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri).once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should remove nil items from the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).twice
+         @crawler.crawl(uri)
+       end
+
+       it "should convert any exceptions to nil" do
+         uri = URI.parse(@uri_base + 'messed-up.html')
+         lambda { @crawler.crawl(uri) }.should_not raise_error
+       end
+
+       it "should not crawl anything but HTTP web addresses" do
+         uri = URI.parse(@uri_base + 'non-http.html')
+         @obs.should_receive(:update).once
+         @crawler.crawl(uri)
+       end
+
+       it "should not, by default, crawl outside its original host" do
+         uri = URI.parse(@uri_base + 'external.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), URI.parse("http://example.com"))
+         @crawler.crawl(uri)
+       end
+
+       it "should only download HTML content types" do
+         uri = URI.parse(@uri_base + 'non-html.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/pdf.pdf')
+         @crawler.crawl(uri)
+       end
+
+       it "should not download anything in the excluded option" do
+         uri = URI.parse(@uri_base + 'exclusion.html')
+         @crawler.options[:exclude] = ["/excluded/"]
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/excluded/shouldnt-hit.html')
+         @crawler.crawl(uri)
+       end
+
+
+     end
+
+     context "after crawl" do
+       before(:each) do
+         @crawler = Webcrawler.new
+         @uri = URI.parse(@uri_base)
+         @crawler.crawl(@uri)
+       end
+
+       it "should have at least one item in crawled" do
+         @crawler.should have_at_least(1).crawled
+       end
+
+       it "should have put crawled links into crawled" do
+         @crawler.should have_at_least(2).crawled
+       end
+
+       it "should have the children of child pages in crawled" do
+         @crawler.crawled.should include(@uri + "/page4.html")
+       end
+
+       it "should have an empty queue" do
+         @crawler.queue.should be_empty
+       end
+
+     end
+   end
+ end
@@ -0,0 +1,28 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+ require 'stringio'
+
+ module Crawler
+   describe Observer do
+
+     def test_code(code, log, obs)
+       log.should_receive(:puts).with("#{code} encountered for http://example.com/")
+       resp = Net::HTTPResponse::CODE_TO_OBJ["#{code}"].new("1.1", code, "")
+       obs.update(resp, URI.parse("http://example.com/"))
+     end
+
+     it "should output a warning when an error code is reached" do
+       log = double('log', :null_object => true)
+       obs = Observer.new(log)
+       (400..416).each { |code| test_code(code, log, obs) }
+       (500..505).each { |code| test_code(code, log, obs) }
+     end
+
+     it "should not output a warning when 200 is encountered" do
+       log = double('log')
+       obs = Observer.new(log)
+       log.should_not_receive(:puts).with(/\d{3} encountered/)
+       obs.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
+     end
+
+   end
+ end
@@ -0,0 +1,6 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     you should never see this
+   </body>
@@ -0,0 +1,7 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <a href="/excluded/shouldnt-hit.html">nope</a>
+   </body>
+ </html>
@@ -0,0 +1,8 @@
+ <html>
+   <head>
+     <title>External</title>
+   </head>
+   <body>
+     <a href="http://example.com/">example</a>
+   </body>
+ </html>
@@ -0,0 +1,9 @@
+ <html>
+   <head>
+     <title>Test</title>
+   </head>
+   <body>
+     <a href="page2.html">Page 2</a>
+     <a href="page3.html">Page 3</a>
+   </body>
+ </html>
@@ -0,0 +1,7 @@
+ <html>
+   <head></head>
+   <body>
+     <a href=" flange.html">yes</a>
+     <a href="javascript: void(0);">javascript</a>
+   </body>
+ </html>
@@ -0,0 +1,7 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <a href="/pdf.pdf">pdf</a>
+   </body>
+ </html>
@@ -0,0 +1,14 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+   <head>
+     <title>non-http</title>
+   </head>
+   <body>
+     <!-- None of the following should be followed -->
+     <a href="mailto:test@example.com">mailto</a>
+     <a href="ftp://ftp.example.com">ftp</a>
+
+   </body>
+ </html>
@@ -0,0 +1,9 @@
+ <html>
+   <head>
+     <title>test2</title>
+   </head>
+   <body>
+     <a href="/page4.html">Link</a>
+     <a href="/page5.html">Link</a>
+   </body>
+ </html>
@@ -0,0 +1,8 @@
+ <html>
+   <head>
+     <title>test2</title>
+   </head>
+   <body>
+     <a href="/page5.html">Link</a>
+   </body>
+ </html>
File without changes
@@ -0,0 +1,8 @@
+ <html>
+   <head>
+     <title>something</title>
+   </head>
+   <body>
+     <a href="/">Whatever</a>
+   </body>
+ </html>
File without changes
@@ -0,0 +1,13 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+   <head>
+     <title>self-reference</title>
+   </head>
+   <body>
+     <a href="self-reference.html">link</a> <!-- will be converted to nil -->
+     <a href="page5.html">link</a>
+
+   </body>
+ </html>
@@ -0,0 +1,4 @@
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'spec'
+ require 'crawler'
+ require 'webrick'
metadata ADDED
@@ -0,0 +1,85 @@
+ --- !ruby/object:Gem::Specification
+ name: crawler
+ version: !ruby/object:Gem::Version
+   version: "0.2"
+ platform: ruby
+ authors:
+ - Tyler Cunnion
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-01-25 00:00:00 -05:00
+ default_executable:
+ dependencies: []
+
+ description: Simple webcrawler
+ email: tyler.cunnion+ruby@gmail.com
+ executables:
+ - crawler
+ extensions: []
+
+ extra_rdoc_files:
+ - bin/crawler
+ - lib/crawler.rb
+ - lib/crawler/observer.rb
+ - lib/crawler/webcrawler.rb
+ files:
+ - Rakefile
+ - bin/crawler
+ - features/step_definitions/crawler_steps.rb
+ - features/support/env.rb
+ - lib/crawler.rb
+ - lib/crawler/observer.rb
+ - lib/crawler/webcrawler.rb
+ - spec/crawler/crawler_spec.rb
+ - spec/crawler/observer_spec.rb
+ - spec/fixtures/excluded/shouldnt-hit.html
+ - spec/fixtures/exclusion.html
+ - spec/fixtures/external.html
+ - spec/fixtures/index.html
+ - spec/fixtures/messed-up.html
+ - spec/fixtures/non-html.html
+ - spec/fixtures/non-http.html
+ - spec/fixtures/page2.html
+ - spec/fixtures/page3.html
+ - spec/fixtures/page4.html
+ - spec/fixtures/page5.html
+ - spec/fixtures/pdf.pdf
+ - spec/fixtures/self-reference.html
+ - spec/spec_helper.rb
+ - Manifest
+ - crawler.gemspec
+ has_rdoc: true
+ homepage: http://github.com/tylercunnion/crawler
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --line-numbers
+ - --inline-source
+ - --title
+ - Crawler
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "1.2"
+   version:
+ requirements: []
+
+ rubyforge_project: crawler
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Simple webcrawler
+ test_files: []
+