RubyGems - cobweb - Versions diffs - 0.0.1 - Mend

cobweb 0.0.1

Files changed (13) hide show

data/README.textile +63 -0
data/lib/cobweb.rb +130 -0
data/lib/content_link_parser.rb +71 -0
data/lib/content_process_job.rb +13 -0
data/lib/crawl_job.rb +71 -0
data/lib/namespaced_redis.rb +52 -0
data/spec/cobweb/cobweb_spec.rb +189 -0
data/spec/cobweb/content_link_parser_spec.rb +104 -0
data/spec/cobweb/crawl_job_spec.rb +24 -0
data/spec/samples/sample_html_links.html +34 -0
data/spec/spec.opts +2 -0
data/spec/spec_helper.rb +1 -0
metadata +133 -0

data/README.textile ADDED Viewed

@@ -0,0 +1,63 @@
+h1. Cobweb v0.0.1
+h2. Intro
+Crawler that utilises resque jobs to perform the crawl allowing clustering of crawls.
+h2. Installation
+Install crawler as a gem
+bq. gem install cobweb
+h2. Usage
+h4. new(options)
+Creates a new crawler object configured by the given options
+  * options - Options are passed in as a hash,
+    ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash(true)
+    ** :redirect_limit   - sets the maximum number of consecutive redirects to follow (10)
+    ** :processing_queue - specifies the processing queue for content to be sent to (ContentProcessJob)
+    ** :debug            - enables debug output (false)
+    ** :quiet            - hides default output (false)
+    ** :cache            - if set, enables the cache and sets the ttl (300)
+bq. crawler = CobWeb.new(:follow_redirects => false)
+h4. start(base_url)
+  * base_url - the url to start the crawl from
+h4. get(url)
+  * url - url requested
+h2. License
+h3. The MIT License
+Copyright (c) 2010 6Central Limited
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/lib/cobweb.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'rubygems'
+require 'uri'
+require 'resque'
+require 'digest/sha1'
+Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
+  require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
+end
# Fetches single URLs and starts Resque-backed crawls.
#
# Options (all optional):
#   :follow_redirects - follow 3xx responses transparently (default true)
#   :redirect_limit   - maximum number of redirects followed per request (default 10)
#   :processing_queue - job class crawled content is sent to (default ContentProcessJob)
#   :debug            - enables debug output (default false)
#   :quiet            - suppresses progress output
#   :cache            - when set, fetched content is cached in redis; the value is the TTL in seconds
class CobWeb
  # TTL (seconds) used when :cache is truthy but not numeric, e.g. :cache => true.
  DEFAULT_CACHE_TTL = 300

  # options - Hash of the options listed on the class. The hash is copied, so
  #           the defaults filled in here never leak back into the caller's hash.
  def initialize(options = {})
    @options = options.dup
    @options[:follow_redirects] = true if @options[:follow_redirects].nil?
    @options[:redirect_limit] = 10 if @options[:redirect_limit].nil?
    @options[:processing_queue] = ContentProcessJob if @options[:processing_queue].nil?
    @options[:debug] = false unless @options[:debug]
  end

  # Enqueues a CrawlJob for base_url, passing the crawler options along.
  #
  # base_url - the url to start the crawl from.
  #
  # Raises RuntimeError when base_url is nil or false.
  def start(base_url)
    raise ":base_url is required" unless base_url
    request = {
      # Sub-second time plus the url keeps ids distinct for crawls started in the same second.
      :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_f}-#{base_url}"),
      :url => base_url,
      # CrawlJob reads :base_url to decide which links belong to the crawl.
      :base_url => base_url
    }
    request.merge!(@options)
    Resque.enqueue(CrawlJob, request)
  end

  # Retrieves url and returns a content Hash with the keys :url, :status_code,
  # :content_type, :character_set, :content_length, :content_body, :location,
  # :headers, :links and :response_time. When redirects were followed the hash
  # describes the final response, :url is the originally requested url and
  # :redirect_through lists the urls that were followed, in order.
  #
  # url            - String url to request.
  # redirect_limit - number of redirects that may still be followed. Once it
  #                  reaches zero a 3xx response is returned as-is instead of
  #                  being followed, so redirect loops terminate.
  #
  # Raises RuntimeError when url is nil, and URI::InvalidURIError when url (or
  # a redirect target) cannot be parsed.
  def get(url, redirect_limit = @options[:redirect_limit])
    raise "url cannot be nil" if url.nil?

    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)

    # redis is only touched when caching has been asked for
    redis = NamespacedRedis.new(Redis.new, "cobweb") if @options[:cache]
    if redis
      cached = redis.get(unique_id)
      if cached
        puts "Cache hit for #{url}" unless @options[:quiet]
        return JSON.parse(cached).deep_symbolize_keys
      end
    end

    print "Retrieving #{url }... " unless @options[:quiet]
    uri = URI.parse(url)

    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme == "https"
      http.use_ssl = true
      # NOTE(review): certificate verification is switched off here, so https
      # responses are not authenticated.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    request_time = Time.now.to_f
    response = http.request(Net::HTTP::Get.new(uri.request_uri))
    status_code = response.code.to_i
    location = response['location']

    redirected = status_code >= 300 && status_code < 400 && !location.nil?
    if @options[:follow_redirects] && redirected && redirect_limit.to_i > 0
      puts "redirected... " unless @options[:quiet]
      # Location may be relative to the requested url, so resolve it first.
      target = URI.join(uri.to_s, location).to_s
      content = get(target, redirect_limit.to_i - 1)
      content[:url] = uri.to_s
      content[:redirect_through] = [] if content[:redirect_through].nil?
      content[:redirect_through].insert(0, target)
      content[:response_time] = Time.now.to_f - request_time
      return content
    end

    content = {}
    content[:response_time] = Time.now.to_f - request_time
    puts "Retrieved." unless @options[:quiet]

    # create the content container
    content[:url] = uri.to_s
    content[:status_code] = status_code
    content[:content_type] = response.content_type
    content[:character_set] = charset_from(response["Content-Type"])
    content[:content_length] = response.content_length
    content[:content_body] = response.body
    content[:location] = location
    content[:headers] = response.to_hash.symbolize_keys

    # parse data for links
    link_parser = ContentLinkParser.new(content[:url], content[:content_body])
    content[:links] = link_parser.link_data

    # add content to cache if required
    if redis
      redis.set(unique_id, content.to_json)
      redis.expire unique_id, cache_ttl
    end

    content
  end

  private

  # Returns the charset parameter of a Content-Type header value, or nil when
  # the header is missing or carries no charset parameter.
  def charset_from(content_type_header)
    return nil if content_type_header.nil?
    match = content_type_header.match(/;\s*charset\s*=\s*"?([^";\s]+)"?/i)
    match && match[1]
  end

  # Returns the cache TTL in seconds taken from the :cache option.
  def cache_ttl
    setting = @options[:cache]
    setting.respond_to?(:to_i) ? setting.to_i : DEFAULT_CACHE_TTL
  end
end
## add symbolize methods to hash
# Both methods change the receiver in place and return it; callers in this
# gem rely on the in-place behaviour and ignore the return value.
class Hash
  # Replaces every String key with the equivalent Symbol key.
  # Keys of any other class are left untouched. Returns self.
  def symbolize_keys
    # keys returns a snapshot array, so the hash can be modified while looping
    keys.each do |key|
      self[key.to_sym] = delete(key) if key.instance_of?(String)
    end
    self
  end

  # Like symbolize_keys, but also descends into nested hashes, including
  # hashes held inside (possibly nested) arrays, as produced by JSON.parse.
  # Returns self.
  def deep_symbolize_keys
    symbolize_keys
    pending = values
    until pending.empty?
      value = pending.shift
      if value.kind_of?(Hash)
        value.deep_symbolize_keys
      elsif value.kind_of?(Array)
        pending.concat(value)
      end
    end
    self
  end
end

data/lib/content_link_parser.rb ADDED Viewed

@@ -0,0 +1,71 @@
# Extracts the urls referenced by an HTML document, grouped by kind.
#
# Every group (:links, :images, :related, :scripts, :styles plus any
# :additional_tags) is a list of [css_selector, attribute] pairs. When the
# second element is a Regexp it is scanned against the text content of the
# matched tags instead of naming an attribute. Each group can be read through
# a method of the same name (parser.links, parser.images, ...), which is
# served by method_missing.
class ContentLinkParser
  require "nokogiri"
  require "absolutize"

  # url     - url the content was retrieved from; base for relative links
  #           unless the document declares a <base href>.
  # content - HTML source to parse.
  # options - Hash, copied so the caller's hash is not modified:
  #           :ignore_default_tags - when truthy, the built-in groups are dropped
  #           :additional_tags     - Hash of extra groups merged over the defaults
  def initialize(url, content, options = {})
    @options = options.dup
    @url = url
    @doc = Nokogiri::HTML(content)

    base_url = @url.to_s
    base_tag = @doc.at("base[href]")
    base_url = base_tag.attr("href").to_s if base_tag
    @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    @options[:tags] = {}
    unless @options[:ignore_default_tags]
      @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
      @options[:tags][:images] = [["img[src]", "src"]]
      @options[:tags][:related] = [["link[rel]", "href"]]
      @options[:tags][:scripts] = [["script[src]", "src"]]
      @options[:tags][:styles] = [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", /url\("?(.*?)"?\)/]]
    end
    @options[:tags].merge!(@options[:additional_tags]) unless @options[:additional_tags].nil?
  end

  # Returns a Hash of group name (Symbol) => Array of unique url Strings.
  def link_data
    data = {}
    @options[:tags].keys.each do |key|
      # Looked up directly rather than evaluated, so a group name can never
      # run as code or collide with an existing method.
      data[key.to_sym] = links_for(key)
    end
    data
  end

  # Returns every url from every group as one flat Array without duplicates.
  def all_links
    data = link_data
    data.keys.map{|key| data[key]}.flatten.uniq
  end

  # Serves parser.<group>. A name that is not a configured group prints a
  # warning and yields an empty Array instead of raising NoMethodError.
  def method_missing(m, *args, &block)
    if @options[:tags].key?(m)
      links_for(m)
    else
      puts "Warning: There was no configuration on how to find #{m} links"
      []
    end
  end

  # Reports the configured group names as methods this object responds to.
  def respond_to_missing?(name, include_private = false)
    @options[:tags].key?(name) || super
  end

  # Appends to array the absolute url Strings found for one
  # [selector, attribute] pair.
  #
  # attribute - String/Symbol naming the attribute to read, or a Regexp that
  #             is scanned against each matched tag's text content.
  def find_matches(array, selector, attribute)
    if attribute.kind_of? String or attribute.kind_of? Symbol
      @doc.css(selector).each do |tag|
        value = tag[attribute]
        next if value.nil?
        array << @absolutize.url(value).to_s
      end
    elsif attribute.instance_of? Regexp
      @doc.css(selector).each do |tag|
        tag.content.scan(attribute) do |match|
          # scan yields an Array of captures, or the whole match when the
          # pattern has no capture group
          captured = match.kind_of?(Array) ? match[0] : match
          array << @absolutize.url(captured).to_s
        end
      end
    end
  end

  private

  # Returns the unique urls for one configured group.
  def links_for(key)
    links = []
    @options[:tags][key].each do |selector, attribute|
      find_matches(links, selector, attribute)
    end
    links.uniq
  end
end

data/lib/content_process_job.rb ADDED Viewed

@@ -0,0 +1,13 @@
# Default Resque job for crawled content: a placeholder that only reports the
# url it was handed. Supply your own class through the :processing_queue
# option to do real work.
class ContentProcessJob
  # "ap" (presumably the awesome_print pretty-printer) is a debugging aid only
  # and is not among the gem's declared dependencies, so its absence must not
  # stop the job class from loading.
  begin
    require "ap"
  rescue LoadError
    # optional debugging aid not installed; nothing in this class needs it
  end

  @queue = :cobweb_process_job

  # content - content Hash for one crawled url. It is expected to respond to
  #           symbolize_keys, which converts its String keys in place.
  def self.perform(content)
    content.symbolize_keys
    puts "Dummy Processing for #{content[:url]}"
    #ap content.keys
  end
end

data/lib/crawl_job.rb ADDED Viewed

@@ -0,0 +1,71 @@
# Resque job that crawls one url: it retrieves the page, enqueues a CrawlJob
# for every link found below the crawl's base url, and hands the retrieved
# content to the configured processing job.
#
# Per-crawl state is kept in redis under the "cobweb-<crawl_id>" namespace:
#   "crawled"       - set of urls already retrieved
#   "crawl-counter" - number of urls that have entered the crawl
#   "queue-counter" - number of links enqueued so far
#   "base_url"      - prefix a link must start with to be followed
class CrawlJob
  require "net/https"
  require "uri"
  require "redis"

  @queue = :cobweb_crawl_job

  # content_request - Hash built by CobWeb#start. It is read for :crawl_id,
  #                   :url, :base_url, :crawl_limit (nil means no limit),
  #                   :processing_queue and :debug, and is also passed on as
  #                   the CobWeb options.
  def self.perform(content_request)
    # change all hash keys to symbols
    content_request.deep_symbolize_keys
    debug = content_request[:debug]

    redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
    @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    # check we haven't crawled this url before
    if redis.sismember "crawled", content_request[:url]
      puts "Already crawled #{content_request[:url]}" if debug
      return
    end

    crawl_limit = content_request[:crawl_limit]
    crawl_limit = crawl_limit.to_i unless crawl_limit.nil?

    # increment counter and check we haven't hit our crawl limit; the value
    # returned by incr is used directly so no second round trip is needed
    crawl_counter = redis.incr("crawl-counter").to_i
    queue_counter = redis.get("queue-counter").to_i

    if crawl_limit && crawl_counter > crawl_limit
      puts "Crawl Limit Exceeded by #{crawl_counter - crawl_limit} objects" if debug
      return
    end

    # Resolved before CobWeb is built; to_s accepts a class name String (as it
    # arrives from the queue) as well as a Class.
    queue_name = (content_request[:processing_queue] || "ContentProcessJob").to_s

    # CobWeb gets a copy so any defaults it fills in stay out of the request
    # that is cloned for child jobs below.
    content = CobWeb.new(content_request.dup).get(content_request[:url])
    redis.sadd "crawled", content_request[:url]
    set_base_url redis, content, content_request[:base_url] || content_request[:url]

    if crawl_limit.nil? || queue_counter <= crawl_limit
      base_url = redis.get("base_url").to_s
      (content[:links] || {}).values.flatten.each do |link|
        link = link.to_s
        next if redis.sismember "crawled", link
        # literal prefix test: the base url is data, not a pattern
        next unless link.start_with?(base_url)

        new_request = content_request.clone
        new_request[:url] = link
        new_request[:parent] = content_request[:url]
        Resque.enqueue(CrawlJob, new_request)
        redis.incr "queue-counter"
      end
    end

    # enqueue to processing queue
    Resque.enqueue(Object.const_get(queue_name), content)
    puts "#{content_request[:url]} has been sent for processing." if debug
    puts "Crawled: #{crawl_counter} Limit: #{crawl_limit} Queued: #{queue_counter}" if debug
  end

  # Stores the crawl's base url in redis unless one is already stored. When
  # the retrieved content is itself a redirect with a location, the absolute
  # form of that location becomes the base url instead of base_url.
  def self.set_base_url(redis, content, base_url)
    return unless redis.get("base_url").nil?

    if content[:status_code] >= 300 and content[:status_code] < 400 and content[:location]
      #redirect received for first url
      redirected_to = @absolutize.url(content[:location]).to_s
      redis.set("base_url", redirected_to)
      puts "Warning: base_url given redirects to another location, setting base_url to #{redirected_to}"
    else
      redis.set("base_url", base_url)
    end
  end
  # a bare `private` does not apply to methods defined with `def self.`
  private_class_method :set_base_url
end

data/lib/namespaced_redis.rb ADDED Viewed

@@ -0,0 +1,52 @@
# Thin wrapper around a redis client that prefixes every key with
# "<namespace>-", so several crawls can share one redis database.
class NamespacedRedis
  # The namespace String given to the constructor.
  attr_reader :namespace

  # redis     - the client every command is forwarded to.
  # namespace - String placed, followed by "-", in front of every key.
  #
  # Raises RuntimeError when redis is nil.
  def initialize(redis, namespace="")
    raise "redis must be supplied" if redis.nil?
    @redis = redis
    @namespace = namespace
  end

  def sismember(key, member)
    @redis.sismember namespaced(key), member
  end

  def sadd(key, value)
    @redis.sadd namespaced(key), value
  end

  def get(key)
    @redis.get namespaced(key)
  end

  def incr(key)
    @redis.incr namespaced(key)
  end

  # Reports whether the namespaced key exists. The client command is named
  # exists (exists? where the client offers the predicate form); there is no
  # client method called exist.
  def exist(key)
    full_key = namespaced(key)
    return @redis.exists?(full_key) if @redis.respond_to?(:exists?)
    @redis.exists(full_key)
  end

  def set(key, value)
    @redis.set namespaced(key), value
  end

  def del(key)
    @redis.del namespaced(key)
  end

  def expire(key, value)
    @redis.expire namespaced(key), value
  end

  # Returns key with the namespace prefix applied.
  def namespaced(key)
    "#{@namespace}-#{key}"
  end

  # Returns the wrapped client for commands this wrapper does not cover.
  def native
    @redis
  end
end

data/spec/cobweb/cobweb_spec.rb ADDED Viewed

@@ -0,0 +1,189 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
# Specs for CobWeb#get. The "with mock" group replaces Net::HTTP and Redis
# with doubles; the "without mock" group talks to real services.
describe CobWeb do

  before(:each) do
    @base_url = "http://www.baseurl.com/"

    @default_headers = {"Cache-Control" => "private, max-age=0",
                        "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
                        "Expires" => "-1",
                        "Content-Type" => "text/html; charset=UTF-8",
                        "Content-Encoding" => "gzip",
                        "Transfer-Encoding" => "chunked",
                        "Server" => "gws",
                        "X-XSS-Protection" => "1; mode=block"}

    @cobweb = CobWeb.new :quiet => true
  end

  describe "with mock" do
    before(:each) do
      # redis double, so these examples do not need a running redis server
      @mock_redis = mock(Redis)
      @mock_redis.stub!(:get).and_return(nil)
      @mock_redis.stub!(:set).and_return("OK")
      @mock_redis.stub!(:expire).and_return(true)
      Redis.stub!(:new).and_return(@mock_redis)

      @mock_http_client = mock(Net::HTTP)
      @mock_http_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request2 = mock(Net::HTTPRequest)

      @mock_http_response = mock(Net::HTTPResponse)
      @mock_http_redirect_response = mock(Net::HTTPRedirection)
      # must exist before it is handed to and_return and stubbed below
      @mock_http_redirect_response2 = mock(Net::HTTPRedirection)

      # /redirect.html -> /redirect2.html -> /redirected.html (served by the default request)
      Net::HTTP.stub!(:new).and_return(@mock_http_client)
      Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
      Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
      Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)

      @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)

      @mock_http_response.stub!(:code).and_return(200)
      @mock_http_response.stub!(:content_type).and_return("text/html")
      @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
      @mock_http_response.stub!(:content_length).and_return(1024)
      @mock_http_response.stub!(:body).and_return("asdf")
      @mock_http_response.stub!(:to_hash).and_return(@default_headers)

      @mock_http_redirect_response.stub!(:code).and_return(301)
      @mock_http_redirect_response.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
      @mock_http_redirect_response.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)

      @mock_http_redirect_response2.stub!(:code).and_return(301)
      @mock_http_redirect_response2.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
      @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
    end

    it "should generate a cobweb object" do
      CobWeb.new.should be_an_instance_of CobWeb
    end

    describe "get" do
      it "should return a hash with default values" do
        @cobweb.get(@base_url).should be_an_instance_of Hash
      end

      it "should return a hash with default values without quiet option" do
        CobWeb.new(:quiet => false).get(@base_url).should be_an_instance_of Hash
      end

      it "should raise exception if there is no url" do
        lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
      end

      describe "content object" do
        it "should return the url" do
          @cobweb.get(@base_url)[:url].should == @base_url
        end
        it "should return correct content-types" do
          @mock_http_response.stub!(:content_type).and_return("image/jpeg")
          @cobweb.get(@base_url)[:content_type].should == "image/jpeg"
        end
        it "should return correct status-code" do
          @mock_http_response.stub!(:code).and_return(404)
          @cobweb.get(@base_url)[:status_code].should == 404
        end
        it "should return correct character_set" do
          @cobweb.get(@base_url)[:character_set].should == "UTF-8"
        end
        it "should return correct content_length" do
          @cobweb.get(@base_url)[:content_length].should == 1024
        end
        it "should return correct content_body" do
          @cobweb.get(@base_url)[:content_body].should == "asdf"
        end
        it "should return correct location" do
          @cobweb.get(@base_url)[:location].should == nil

          @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
          @cobweb.get(@base_url)[:location].should == "http://google.com/"
        end
        it "should return correct headers" do
          @cobweb.get(@base_url)[:headers].should == @default_headers
        end
        it "should return correct a hash of links" do
          @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
        end
        it "should return the response time for the url" do
          @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
        end
      end

      describe "with redirect" do
        before(:each) do
          @base_url = "http://redirect-me.com/redirect.html"
          @cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
        end

        it "should flow through redirect" do
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content.should be_an_instance_of Hash

          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].length.should == 2
          content[:content_type].should == "text/html"
          content[:content_body].should == "asdf"
        end

        it "should return the path followed" do
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
        end

        it "should not follow with redirect disabled" do
          @cobweb = CobWeb.new(:follow_redirects => false, :quiet => true)
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].should be_nil
          content[:status_code].should == 301
          content[:content_type].should == "text/xml"
          content[:content_body].should == "redirected body"
        end
      end
    end
  end

  # NOTE(review): nothing is stubbed in this group, so these examples depend
  # on network access and on whatever services CobWeb#get connects to.
  describe "without mock" do
    it "should throw invalid url exception for an invalid url" do
      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
    end

    # pending: no block given, so the example is reported as not yet implemented
    it "should throw exception when server is unavailable" #do
    #  lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
    #end

    it "should return a valid content hash when url doesn't exist on a live server" do
      status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
      status_code.should == 404
    end
  end
end

data/spec/cobweb/content_link_parser_spec.rb ADDED Viewed

@@ -0,0 +1,104 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')
# Specs for ContentLinkParser, run against spec/samples/sample_html_links.html.
describe ContentLinkParser do

  # One row per default tag group:
  # [group label, wording used in the example name, parser method, number of urls expected]
  default_tag_expectations = [
    ["general", "links",         :links,   4],
    ["image",   "image links",   :images,  1],
    ["related", "related links", :related, 2],
    ["script",  "script links",  :scripts, 1],
    ["style",   "style links",   :styles,  3]
  ]

  before(:each) do
    @base_url = "http://www.baseurl.com/"
    @content = File.read(File.dirname(__FILE__) + "/../samples/sample_html_links.html")
    @content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
  end

  it "should load the sample document" do
    @content.should_not be_nil
    @content.should_not be_empty
  end

  it "should create a content link parser" do
    @content_parser.should_not be_nil
    @content_parser.should be_an_instance_of ContentLinkParser
  end

  describe "using default tags" do
    # the same pair of examples is generated for every default group
    default_tag_expectations.each do |label, wording, finder, expected_count|
      describe "returning #{label} links" do
        it "should return some #{wording} from the sample data" do
          found = @content_parser.send(finder)
          found.should_not be_nil
          found.should_not be_empty
        end
        it "should return the correct links" do
          found = @content_parser.send(finder)
          found.length.should == expected_count
        end
      end
    end

    describe "returning unknown link type" do
      it "should return an empty array" do
        unknown = @content_parser.asdfasdfsadf
        unknown.should_not be_nil
        unknown.should be_an_instance_of Array
      end
    end
  end

  describe "returning all link data" do
    it "should return a hash with all link data" do
      everything = @content_parser.link_data
      everything.should_not be_nil
      everything.should be_an_instance_of Hash

      everything.keys.length.should == 5
      everything[:links].length.should == 4
    end
  end

  describe "ignoring default tags" do
    it "should not return any links" do
      bare_parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
      bare_parser.links.should be_empty
    end
  end
end

data/spec/cobweb/crawl_job_spec.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
# Smoke specs for the crawl job. Nothing here touches the network.
describe CrawlJob do

  before(:each) do
    @base_url = "http://www.baseurl.com/"
    # CobWeb.new takes an options hash; the url to crawl is given to #start / #get
    @cobweb = CobWeb.new(:quiet => true)
  end

  it "should be a cobweb type" do
    @cobweb.should be_an_instance_of CobWeb
  end

  it "should be queued on the cobweb crawl queue" do
    CrawlJob.instance_variable_get(:@queue).should == :cobweb_crawl_job
  end
end

data/spec/samples/sample_html_links.html ADDED Viewed

@@ -0,0 +1,34 @@
+<html>
+<head>
+<title>Sample HTML Document With all types of links</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+<meta name="description" content="Information for people running web search indexing robots, and web site managers attempting to understand what's going on when a robot visits their site.">
+<meta name="keywords" content="robots, crawlers, crawling, spiders, index, indexing, indexers, gatherers, search engines, searching, FAQ, checklist">
+<meta name="DC.date.modified" content="2003-07-18">
+<meta http-equiv="refresh" content="http://sampleurl-metarefresh.com/"/>
+<link rel="stylesheet" type="text/css" href="http://sampleurl-linkcss/" />
+<link rel="home" type="text/html" href="http://sampleurl-linkhome/" />
+<script type="text/javascript" src="script.js"></script>
+<STYLE TYPE="text/css" MEDIA="screen, projection">
+<!--
+  @import url(http://www.htmlhelp.com/style.css);
+  @import url(/stylesheets/punk.css);
+  DT { background: yellow; color: black }
+-->
+</STYLE>
+</head>
+<body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
+<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
+<frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
+<map id="testmap"><area href="http://sampleurl-area"></area>></map>
+<img src="http://sampleurl-img/"/>
+</body>
+</html>

data/spec/spec.opts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --colour
2	+ --format specdoc

data/spec/spec_helper.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')

metadata ADDED Viewed

@@ -0,0 +1,133 @@
+--- !ruby/object:Gem::Specification
+name: cobweb
+version: !ruby/object:Gem::Version
+  hash: 29
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: ruby
+authors:
+- Stewart McKee
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-11-10 00:00:00 +00:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: resque
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: redis
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: absolutize
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id004
+description:
+email: stewart@rockwellcottage.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.textile
+files:
+- spec/samples/sample_html_links.html
+- spec/spec.opts
+- spec/spec_helper.rb
+- spec/cobweb/content_link_parser_spec.rb
+- spec/cobweb/cobweb_spec.rb
+- spec/cobweb/crawl_job_spec.rb
+- lib/namespaced_redis.rb
+- lib/cobweb.rb
+- lib/content_process_job.rb
+- lib/content_link_parser.rb
+- lib/crawl_job.rb
+- README.textile
+has_rdoc: false
+homepage: http://github.com/stewartmckee/cobweb
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: Crawler utilizing resque
+test_files: []