rider 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +0 -0
- data/Rakefile +1 -0
- data/bin/crawl +21 -0
- data/lib/rider.rb +36 -0
- data/lib/rider/crawler.rb +102 -0
- data/lib/rider/part_queue.rb +85 -0
- data/lib/rider/queue.rb +40 -0
- data/spec/crawler_spec.rb +94 -0
- data/spec/data/apples.html +23 -0
- data/spec/data/colors.html +24 -0
- data/spec/data/fruits.html +17 -0
- data/spec/data/notitle.html +14 -0
- data/spec/data/prices.html +34 -0
- data/spec/data/tiny.html +1 -0
- data/spec/part_queue_spec.rb +40 -0
- data/spec/queue_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -0
- data/tasks/deployment.rake +25 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +9 -0
- metadata +95 -0
data/README
ADDED
File without changes
data/Rakefile
ADDED
@@ -0,0 +1 @@
+Dir['tasks/**/*.rake'].each { |rake| load rake }
data/bin/crawl
ADDED
@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+
+require 'lib/rider'
+
+queue_name = ARGV[0]
+queue = Rider::Queue.new(queue_name)
+puts "Crawling URLs from #{queue.filename}"
+
+# will crawl all URLs
+crawler = Rider::Crawler.new(//, queue)
+
+crawler.each_document do |uri, metadata, contents|
+  puts "-"*60
+  puts "URL: #{uri.to_s}"
+  puts "Metadata: #{metadata.inspect}"
+  puts "Contents excerpt: #{contents[0..250]}"
+  puts
+end
+
+puts
+puts "Crawl finished"
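The script takes the queue file name as its only argument, and Rider::Queue stores one URL per line, so a crawl can be seeded with a few lines of Ruby before invoking it. A minimal sketch, assuming Ruby 1.8 with the gem root as the working directory (the queue file name urls.q is hypothetical):

    # Seed a queue file, then run: ruby bin/crawl urls.q
    require 'lib/rider'

    queue = Rider::Queue.new('urls.q')   # hypothetical file name
    queue.push('http://example.com/')    # appends one URL per line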
data/lib/rider.rb
ADDED
@@ -0,0 +1,36 @@
+$:.unshift File.dirname(__FILE__)
+
+require 'rubygems'
+require 'logger'
+require 'mechanize'
+require 'timeout'
+require 'yaml'
+
+require 'rider/queue'
+require 'rider/part_queue'
+require 'rider/crawler'
+
+$KCODE = 'u'
+
+module Rider
+  VERSION = '0.1'
+  LOGGER = Logger.new(STDOUT)
+  LOGGER.level = Logger::DEBUG
+
+
+  def log
+    LOGGER
+  end
+  module_function :log
+
+  def to_absolute(uri, link)
+    link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+    return nil if link.nil? or link.empty?
+
+    relative = URI(link)
+    absolute = uri.merge(relative)
+
+    absolute.path = '/' if absolute.path.nil? or absolute.path.empty?
+    return absolute
+  end
+end
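Note that module_function is applied only to log, so to_absolute as written is reachable only through an include. A sketch of what it computes under that assumption (the LinkResolver class and URLs are illustrative, not part of the gem):

    require 'lib/rider'

    # to_absolute strips the #fragment, then resolves the link against the base URI.
    class LinkResolver
      include Rider   # needed because to_absolute is not a module function
    end

    base = URI.parse('http://example.com/fruits/apples.html')
    LinkResolver.new.to_absolute(base, 'colors.html#red').to_s
    # => "http://example.com/fruits/colors.html"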
data/lib/rider/crawler.rb
ADDED
@@ -0,0 +1,102 @@
+require 'hpricot'
+
+module Rider
+  class Crawler
+    # Creates a new Crawler, with the specified +mask+ (a Regexp) and queue (a +Rider::Queue+ instance).
+    def initialize(mask, queue)
+      @mask = mask
+      @queue = queue
+      @seen_urls = []
+      @www = WWW::Mechanize.new do |a|
+        a.log = Logger.new("tmp/www.log")
+        a.pluggable_parser.default = Hpricot
+      end
+    end
+
+    # Returns true if +url+ passes the +mask+.
+    def match_mask?(url)
+      @mask.match(url) != nil
+    end
+
+    # Crawls documents and passes their URL, response headers, and data to the supplied block.
+    def each_document
+      while doc_data = next_document()
+        follow_urls = yield(doc_data) || []
+        add_follow_urls(follow_urls)
+      end
+    end
+
+    def add_follow_urls(urls)
+      urls.each { |url| @queue.push(url) if follow_url?(url) }
+    end
+
+    def follow_url?(url)
+      match_mask?(url) and !seen_url?(url)
+    end
+
+    SKIPPABLE_EXCEPTIONS = [Errno::ETIMEDOUT, WWW::Mechanize::ResponseCodeError, Errno::EHOSTUNREACH, SocketError,
+                            Errno::ECONNREFUSED, Timeout::Error, Net::HTTPBadResponse, Hpricot::ParseError]
+    # Returns the next retrievable document from the next valid URL in the queue.
+    def next_document
+      begin
+        url = next_url()
+        return nil if url.nil?
+        doc_data = get(url)
+        saw_url(url)
+        return doc_data
+      rescue Exception => ex
+        if SKIPPABLE_EXCEPTIONS.include?(ex.class)
+          Rider.log.debug("EXCEPTION: #{ex.inspect}, skipping...")
+          retry # go on to the next document
+        else
+          raise ex
+        end
+      end
+    end
+
+    # Gets the document at the specified +url+. Returns an Array [uri, metadata, contents]
+    def get(url)
+      uri = URI.parse(url)
+      Timeout::timeout(8, Timeout::Error) do
+        case uri.scheme
+        when 'http'
+          get_http(uri)
+        when 'file'
+          get_file(uri)
+        else
+          raise(ArgumentError, "don't know how to get #{url}")
+        end
+      end
+    end
+
+    def get_file(uri)
+      # uri may arrive as a URI object (from #get) or a String, so normalize first
+      filename = uri.to_s.gsub(/^file:\/\//, '')
+      [uri, {}, File.read(filename)]
+    end
+
+    def get_http(uri)
+      page = @www.get(uri)
+      meta = page.response
+      [uri, meta, page]
+    end
+
+    # Retrieves the next URL in the queue that matches the +mask+.
+    def next_url
+      while url = @queue.shift
+        return url if valid_url?(url)
+      end
+    end
+
+    def valid_url?(url)
+      !seen_url?(url) && match_mask?(url)
+    end
+
+    def seen_url?(url)
+      @seen_urls.include?(url)
+    end
+
+    def saw_url(url)
+      @seen_urls << url
+    end
+  end
+end
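Tying the pieces together: the mask gates both which queued URLs get fetched and which returned URLs are followed, and anything the block yields back is pushed onto the queue. A sketch of the intended loop, assuming a reachable local server, an existing tmp/ directory (Mechanize's log lives there), and a hypothetical queue file tmp/example.q:

    require 'lib/rider'

    queue = Rider::Queue.new('tmp/example.q')
    queue.push('http://localhost/index.html')

    crawler = Rider::Crawler.new(/^http:\/\/localhost\//, queue)
    crawler.each_document do |uri, metadata, contents|
      puts "#{uri}: #{metadata.inspect}"
      []  # return an array of URLs to enqueue for following; none here
    end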
data/lib/rider/part_queue.rb
ADDED
@@ -0,0 +1,85 @@
+module Rider
+  class HostPartitionedQueue
+    attr_reader :name
+
+    def initialize(name)
+      @name = name
+      clear
+    end
+
+    def push(url)
+      host = get_host(url)
+      @hosts << host unless @hosts.include?(host)
+      @urls_by_host[host] ||= []
+      @urls_by_host[host] << url
+      return true
+    end
+
+    def shift
+      if empty?
+        Rider.log.debug("Q #{name} POP nil")
+        return nil
+      end
+      host = @hosts[@current_host_index]
+      url = @urls_by_host[host].shift
+
+      if @urls_by_host[host].empty?
+        @hosts.delete_at(@current_host_index)
+        @urls_by_host.delete(host)
+        # no need to increment @current_host_index since we just effectively pushed every element down by one
+        # by deleting from @hosts, UNLESS it was the last item in the array, in which case that index doesn't
+        # exist anymore
+        increment_current_host_index if @current_host_index == @hosts.length
+      else
+        increment_current_host_index
+      end
+      return url
+    end
+
+    def clear
+      @urls_by_host = {}
+      @hosts = []
+      @current_host_index = 0
+    end
+
+    def empty?
+      @hosts.empty?
+    end
+
+    def ==(another_queue)
+      another_queue.instance_variable_get("@urls_by_host") == @urls_by_host &&
+        another_queue.instance_variable_get("@hosts") == @hosts &&
+        another_queue.instance_variable_get("@current_host_index") == @current_host_index
+    end
+
+    def serialize
+      File.open(filename, 'w') do |file|
+        file.write(self.to_yaml)
+      end
+    end
+
+    def self.unserialize(name)
+      filename = "tmp/#{name}.q"
+      return nil unless File.exist?(filename)
+      YAML.load_file(filename)
+    end
+
+    private
+    def get_host(url)
+      URI.parse(url).host
+    end
+
+    def increment_current_host_index
+      if @hosts.length == 0
+        @current_host_index = 0
+      else
+        # increment by one but go back to 0 if it exceeds the length of the array
+        @current_host_index = (@current_host_index + 1) % @hosts.length
+      end
+    end
+
+    def filename
+      "tmp/#{name}.q"
+    end
+  end
+end
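The host partitioning exists for politeness: consecutive shifts rotate across hosts instead of draining one host's URLs back to back. A small sketch of that rotation (host names are illustrative; requires the gem loaded for Rider.log):

    require 'lib/rider'

    q = Rider::HostPartitionedQueue.new('demo')
    q.push('http://a.example/1')
    q.push('http://a.example/2')
    q.push('http://b.example/1')

    q.shift  # => "http://a.example/1"
    q.shift  # => "http://b.example/1"  (rotates to the next host)
    q.shift  # => "http://a.example/2"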
data/lib/rider/queue.rb
ADDED
@@ -0,0 +1,40 @@
+module Rider
+  class Queue
+    attr_reader :filename
+    def initialize(filename)
+      raise(ArgumentError, "queues must have a filename") if !filename or filename.empty?
+      @filename = filename
+    end
+
+    def push(item)
+      Rider.log.debug("Q #{filename} PUSH #{item}")
+      File.open(filename, "a") do |file|
+        file.puts(item)
+      end
+      return true
+    end
+
+    def shift
+      if empty?
+        Rider.log.debug("Q #{filename} SHIFT nil")
+        return nil
+      end
+      lines = File.readlines(filename)
+      item = lines.shift.strip
+      File.open(filename, "w") do |file|
+        file.write(lines.join)
+      end
+      Rider.log.debug("Q #{filename} SHIFT #{item}")
+      return item
+    end
+
+    def clear
+      File.unlink(filename) if File.exist?(filename)
+      return true
+    end
+
+    def empty?
+      !File.exist?(filename) or File.read(filename) == ""
+    end
+  end
+end
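Because the queue is just a text file with one item per line, its contents survive process restarts. A round-trip sketch (tmp/demo.q is hypothetical, and the tmp/ directory must already exist; the class does not create it):

    require 'lib/rider'

    q = Rider::Queue.new('tmp/demo.q')
    q.push('http://example.com/a')
    q.push('http://example.com/b')

    q.shift   # => "http://example.com/a"  (FIFO: first line in is first out)
    q.clear   # deletes tmp/demo.q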
data/spec/crawler_spec.rb
ADDED
@@ -0,0 +1,94 @@
+require 'spec/spec_helper'
+
+describe Rider::Crawler do
+  before do
+    @queue = Rider::Queue.new('web')
+    @crawler = Rider::Crawler.new(/http:\/\/localhost/, @queue)
+  end
+
+  describe "when checking URLs against mask" do
+    it "should return true for a URL that matches the mask" do
+      @crawler.match_mask?("http://localhost/some/path").should == true
+    end
+
+    it "should return false for a URL that does not match the mask" do
+      @crawler.match_mask?("http://example.com/some/path").should == false
+    end
+  end
+
+  describe "when checking URL validity" do
+    before do
+      @urls = %w(http://example.com/invalid http://localhost/valid http://localhost/valid/unseen)
+    end
+
+    it "should return URLs matching the mask" do
+      @urls.select { |url| @crawler.valid_url?(url) }.should == ["http://localhost/valid", "http://localhost/valid/unseen"]
+    end
+
+    it "should return only unseen URLs" do
+      @crawler.saw_url('http://localhost/valid')
+      @urls.select { |url| @crawler.valid_url?(url) }.should == ['http://localhost/valid/unseen']
+    end
+  end
+
+  describe "when determining URLs to follow" do
+    it "should follow URLs that match the mask" do
+      @crawler.follow_url?('http://localhost/abc').should == true
+    end
+
+    it "should not follow URLs that don't match the mask" do
+      @crawler.follow_url?('http://invalid.com').should == false
+    end
+
+    it "should follow URLs that haven't been seen"
+    it "should not follow URLs that have been seen already"
+  end
+
+  describe "when getting the next document" do
+
+  end
+
+  describe "when getting documents" do
+    it "should raise an error for schemes other than http and file" do
+      lambda { @crawler.get('ftp://example.com') }.should raise_error(ArgumentError)
+    end
+
+    describe "when getting file:// documents" do
+      before do
+        @filename = File.expand_path(File.join(File.dirname(__FILE__), 'data', 'apples.html'))
+        @file_uri = 'file://' + @filename
+      end
+
+      it "should return an array whose first element is the uri" do
+        @crawler.get_file(@file_uri)[0].should == @file_uri
+      end
+
+      it "should return an array whose second element is blank metadata" do
+        @crawler.get_file(@file_uri)[1].should == {}
+      end
+
+      it "should return an array whose third element is the file contents" do
+        @crawler.get_file(@file_uri)[2].should == File.read(@filename)
+      end
+    end
+
+    describe "when getting http:// documents" do
+      before do
+        @doc_uri = 'http://localhost/simplewikipedia/articles/a/l/g/Algebra.html'
+      end
+
+      it "should return an array whose first element is the uri" do
+        @crawler.get_http(@doc_uri)[0].should == @doc_uri
+      end
+
+      it "should return an array whose second element is the response headers" do
+        meta = @crawler.get_http(@doc_uri)[1]
+        meta['Content-type'].should == 'text/html'
+      end
+
+      it "should return an array whose third element is the document contents" do
+        @crawler.get_http(@doc_uri)[2].match(/Algebra is taught in school/).should_not == nil
+      end
+    end
+  end
+end
data/spec/data/apples.html
ADDED
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Apples</title>
+
+</head>
+
+<body>
+
+<h1>Apples</h1>
+
+<p>
+  Some apples are <a href="colors.html">red</a>.
+  Some are <a href="colors.html">green</a>.
+  They <a href="prices.html">do not cost much money</a>.
+  You can <a href="http://buyapples.com/">buy apples</a>.
+</p>
+
+</body>
+</html>
data/spec/data/colors.html
ADDED
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Colors</title>
+
+</head>
+
+<body>
+
+<h1>Colors</h1>
+
+<dl>
+  <dt>Red</dt>
+  <dd><a href="apples.html">Apples</a></dd>
+
+  <dt>Green</dt>
+  <dd><a href="apples.html">Apples</a> or <a href="prices.html">dollars</a>.</dd>
+</dl>
+
+</body>
+</html>
data/spec/data/fruits.html
ADDED
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Fruits</title>
+
+</head>
+
+<body>
+
+<p><a href="apples.html">Apples</a> are a fruit.</a></p>
+
+
+</body>
+</html>
data/spec/data/notitle.html
ADDED
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title></title>
+
+</head>
+
+<body>
+
+</body>
+</html>
data/spec/data/prices.html
ADDED
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Prices</title>
+
+</head>
+
+<body>
+
+<h1>Prices</h1>
+
+<table>
+  <tr>
+    <th>Item</th>
+    <th>Price</th>
+  </tr>
+
+  <tr>
+    <td><a href="apples.html">Apples</a></td>
+    <td>$0.35</td>
+  </tr>
+
+  <tr>
+    <td><a href="colors.html">Colors</a></td>
+    <td>N/A</td>
+  </tr>
+</table>
+
+
+</body>
+</html>
data/spec/data/tiny.html
ADDED
@@ -0,0 +1 @@
+<html><head><title>asdf</title></head><body>asdf</body></html>
data/spec/part_queue_spec.rb
ADDED
@@ -0,0 +1,40 @@
+require 'spec/spec_helper'
+require 'spec/queue_spec'
+
+describe Rider::HostPartitionedQueue do
+  it_should_behave_like "queue"
+
+  before do
+    @q = Rider::HostPartitionedQueue.new('test')
+  end
+
+  it "should alternate among hosts when shifting" do
+    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
+    [@q.shift, @q.shift, @q.shift, @q.shift].should ==
+      %w(http://example.com/path1 http://example.net/ http://localhost/path http://example.com/path2)
+  end
+
+  it "should return the same host if only one distinct host exists" do
+    %w(http://example.com/path1 http://example.com/path2 http://example.com/path3).each { |u| @q.push(u) }
+    [@q.shift, @q.shift, @q.shift].should == %w(http://example.com/path1 http://example.com/path2 http://example.com/path3)
+  end
+
+  it "should be equal to another queue with the same objects and state" do
+    @q2 = Rider::HostPartitionedQueue.new('test2')
+    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
+    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q2.push(u) }
+    @q.should == @q2
+  end
+
+  describe "when serializing" do
+    it "should write and read itself back" do
+      %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
+      @q.serialize
+      Rider::HostPartitionedQueue.unserialize('test').should == @q
+    end
+
+    it "should return nil if asked to unserialize from a nonexistent file" do
+      Rider::HostPartitionedQueue.unserialize('nonexistent').should == nil
+    end
+  end
+end
data/spec/queue_spec.rb
ADDED
@@ -0,0 +1,43 @@
+require 'spec/spec_helper'
+
+shared_examples_for "queue" do
+  it "must not have a blank or nil name" do
+    lambda { Rider::Queue.new(nil) }.should raise_error(ArgumentError)
+    lambda { Rider::Queue.new('') }.should raise_error(ArgumentError)
+  end
+
+  it "should be empty after clearing" do
+    @q.clear
+    @q.empty?.should == true
+  end
+
+  it "should push then shift one item" do
+    @q.push('blue')
+    @q.shift.should == 'blue'
+  end
+
+  describe "when empty" do
+    it "should return nil if shifted" do
+      @q.shift.should == nil
+    end
+  end
+
+  it "should not clobber the queue upon initialization"
+end
+
+describe Rider::Queue do
+  before do
+    @q = Rider::Queue.new('tmp/colors.q')
+    @q.clear
+  end
+
+  after do
+    @q.clear
+  end
+
+  it "should push then shift multiple items" do
+    %w(red green orange).each { |color| @q.push(color) }
+    puts "POP x 3"
+    [@q.shift, @q.shift, @q.shift].should == %w(red green orange)
+  end
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
+require 'lib/rider'
data/tasks/deployment.rake
ADDED
@@ -0,0 +1,25 @@
+namespace "doc" do
+  desc "Generate RDoc docs"
+  task :generate do
+    # Using rake/rdoctask invoked old rdoc 1.x for some reason, but this invokes rdoc 2.x
+    sh "rdoc --all --title 'Rider - Ruby Web crawler' --line-numbers --inline-source --force-update --all --charset utf-8 --main README README lib/"
+  end
+
+  desc "Upload docs to site"
+  task :upload do
+    sh "tar czfv rider-rdoc.tgz doc/"
+    puts
+    puts "Going to upload..."
+    puts
+    sh "scp rider-rdoc.tgz cardinal.stanford.edu:WWW/rider/"
+    sh "ssh cardinal.stanford.edu 'cd WWW/rider;tar xzfv rider-rdoc.tgz'"
+    sh "rm rider-rdoc.tgz"
+    puts
+    puts "Upload complete"
+  end
+
+  desc "Generate & upload"
+  task :update=>[:generate, :upload]
+end
+
+
data/tasks/rspec.rake
ADDED
metadata
ADDED
@@ -0,0 +1,95 @@
+--- !ruby/object:Gem::Specification
+name: rider
+version: !ruby/object:Gem::Version
+  version: "0.2"
+platform: ruby
+authors:
+- Quinn Slack
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-10-07 00:00:00 -03:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.8.1
+    version:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.9.3
+    version:
+description: Ruby Web crawler
+email: me@rafaelss.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- bin/crawl
+- lib/rider/crawler.rb
+- lib/rider/part_queue.rb
+- lib/rider/queue.rb
+- lib/rider.rb
+- Rakefile
+- README
+- spec/crawler_spec.rb
+- spec/data/apples.html
+- spec/data/colors.html
+- spec/data/fruits.html
+- spec/data/notitle.html
+- spec/data/prices.html
+- spec/data/tiny.html
+- spec/part_queue_spec.rb
+- spec/queue_spec.rb
+- spec/spec_helper.rb
+- tasks/deployment.rake
+- tasks/environment.rake
+- tasks/rspec.rake
+has_rdoc: true
+homepage: http://qslack.com/
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Ruby Web crawler
+test_files:
+- spec/crawler_spec.rb
+- spec/part_queue_spec.rb
+- spec/queue_spec.rb