RubyGems - cobweb - Versions diffs - 0.0.12 → 0.0.13 - Mend

cobweb 0.0.12 → 0.0.13

Files changed (7)

data/README.textile +4 -2
data/lib/cobweb.rb +25 -18
data/lib/cobweb_crawler.rb +120 -0
data/lib/content_link_parser.rb +10 -4
data/spec/cobweb/cobweb_crawler_spec.rb +56 -0
data/spec/cobweb/cobweb_spec.rb +1 -4
metadata +25 -12

data/README.textile CHANGED

@@ -1,12 +1,14 @@
-h1. Cobweb v0.0.6
+h1. Cobweb v0.0.13
 h2. Intro
-  CobWeb has two functions.  Firstly it is a http client that allows get and head requests returning a hash of data relating to the requested resource.  The second main function is to utilize this combined with the power of Resque to cluster the crawls allowing you crawl quickly.
+  CobWeb has three methods of running.  Firstly it is a http client that allows get and head requests returning a hash of data relating to the requested resource.  The second main function is to utilize this combined with the power of Resque to cluster the crawls allowing you crawl quickly.  Lastly you can run the crawler with a block that uses each of the pages found in the crawl.
   When running on resque, passing in a Class and queue name it will enqueue all resources to this queue for processing, passing in the hash it has generated.  You then implement the perform method to process the resource for your own application.
+  Documentation for running with a block will come soon..  Check out the CobwebCrawler spec for hints.
   The data available in the returned hash are:
   * :url - url of the resource requested

data/lib/cobweb.rb CHANGED

@@ -51,10 +51,10 @@ class CobWeb
     raise "url cannot be nil" if url.nil?
-    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
+    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
     # get the unique id for this request
-    unique_id = Digest::SHA1.hexdigest(url)
+    unique_id = Digest::SHA1.hexdigest(url.to_s)
     # connect to redis
     redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
@@ -70,23 +70,26 @@ class CobWeb
       content
     else
       # this url is valid for processing so lets get on with it
-      print "Retrieving #{url }... " unless @options[:quiet]
       uri = Addressable::URI.parse(url.strip)
       # retrieve data
-      http = Net::HTTP.new(uri.host, uri.inferred_port)
+      unless @http && @http.address == uri.host && @http.port == uri.inferred_port
+        puts "Creating connection to #{uri.host}..."
+        @http = Net::HTTP.new(uri.host, uri.inferred_port)
+      end
       if uri.scheme == "https"
-        http.use_ssl = true
-        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        @http.use_ssl = true
+        @http.verify_mode = OpenSSL::SSL::VERIFY_NONE
       end
       request_time = Time.now.to_f
-      http.read_timeout = @options[:timeout].to_i
-      http.open_timeout = @options[:timeout].to_i
+      @http.read_timeout = @options[:timeout].to_i
+      @http.open_timeout = @options[:timeout].to_i
       begin
-        response = http.start() {|http|
-          response = http.get(uri.request_uri)
-        }
+        print "Retrieving #{url }... " unless @options[:quiet]
+        request = Net::HTTP::Get.new uri.request_uri
+        response = @http.request request
         if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
           puts "redirected... " unless @options[:quiet]
@@ -107,9 +110,11 @@ class CobWeb
           content[:url] = uri.to_s
           content[:status_code] = response.code.to_i
           content[:mime_type] = response.content_type.split(";")[0].strip
-          charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
-          charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
-          content[:character_set] = charset
+          if response["Content-Type"].include? ";"
+            charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
+            charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
+            content[:character_set] = charset
+          end
           content[:length] = response.content_length
           if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
             content[:body] = response.body
@@ -216,9 +221,11 @@ class CobWeb
           content[:status_code] = response.code.to_i
           unless response.content_type.nil?
             content[:mime_type] = response.content_type.split(";")[0].strip
-            charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
-            charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
-            content[:character_set] = charset
+            if response["Content-Type"].include? ";"
+              charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
+              charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
+              content[:character_set] = charset
+            end
           end
           # add content to cache if required

data/lib/cobweb_crawler.rb ADDED

@@ -0,0 +1,120 @@
+class CobwebCrawler
+  def initialize(options={})
+    @options = options
+    @statistic = {}
+    @queue = []
+    @crawled = []
+    @cobweb = CobWeb.new(@options)
+  end
+  def crawl(base_url, crawl_options = {}, &block)
+    @options[:base_url] = base_url unless @options.has_key? :base_url
+    @crawl_options = crawl_options
+    @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
+    crawl_counter = @crawled.count
+    @queue << base_url
+    while !@queue.empty? && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
+      url = @queue.first
+      @options[:url] = url
+      unless @crawled.include?(url) || url =~ /\/(.+?)\/\1\/\1/
+        begin
+          content = @cobweb.get(@options[:url])
+          if @statistic[:average_response_time].nil?
+            @statistic[:average_response_time] = content[:response_time].to_f
+          else
+            @statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
+          end
+          @statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
+          @statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
+          if @statistic[:average_length]
+            @statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
+          else
+            @statistic[:average_length] = content[:length].to_i
+          end
+          @statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
+          @statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
+          @statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
+          if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+            @statistic[:page_count] = @statistic[:page_count].to_i + 1
+            @statistic[:page_size] = @statistic[:page_count].to_i + content[:length].to_i
+          else
+            @statistic[:asset_count] = @statistic[:asset_count].to_i + 1
+            @statistic[:asset_size] = @statistic[:asset_count].to_i + content[:length].to_i
+          end
+          mime_counts = {}
+          if @statistic.has_key? :mime_counts
+            mime_counts = @statistic[:mime_counts]
+            if mime_counts.has_key? content[:mime_type]
+              mime_counts[content[:mime_type]] += 1
+            else
+              mime_counts[content[:mime_type]] = 1
+            end
+          else
+            mime_counts = {content[:mime_type] => 1}
+          end
+          @statistic[:mime_counts] = mime_counts
+          status_counts = {}
+          if @statistic.has_key? :status_counts
+            status_counts = @statistic[:status_counts]
+            if status_counts.has_key? content[:status_code].to_i
+              status_counts[content[:status_code].to_i] += 1
+            else
+              status_counts[content[:status_code].to_i] = 1
+            end
+          else
+            status_counts = {content[:status_code].to_i => 1}
+          end
+          @statistic[:status_counts] = status_counts
+          @crawled << url
+          crawl_counter += 1
+          @queue.delete(url)
+          content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
+            unless @crawled.include? link
+              puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
+              if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
+                puts "Matched as #{link} as internal" if @options[:debug]
+                unless @crawled.include? link.to_s or @queue.include? link.to_s
+                  puts "Added #{link.to_s} to queue" if @options[:debug]
+                  @queue << link.to_s
+                end
+              end
+            end
+          end
+          @queue.uniq!
+          puts "Crawled: #{crawl_counter} Limit: #{@options[:crawl_limit]} Queued: #{@queue.count}" if @options[:debug]
+          yield content if block_given?
+        rescue => e
+          puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
+          ap e
+          @queue.delete(url)
+        end
+      else
+        puts "Already crawled #{@options[:url]}" if @options[:debug]
+      end
+    end
+    @statistic
+  end
+end

data/lib/content_link_parser.rb CHANGED

@@ -56,13 +56,19 @@ class ContentLinkParser
   def find_matches(array, selector, attribute)
     if attribute.kind_of? String or attribute.kind_of? Symbol
-      @doc.css(selector).each do |tag|
-        uri = @absolutize.url(tag[attribute])
-        array << uri.to_s
+      @doc.css(selector).each do |tag|
+        begin
+          uri = @absolutize.url(tag[attribute])
+          array << uri.to_s
+        rescue
+        end
       end
     elsif attribute.instance_of? Regexp
       @doc.css(selector).each do |tag|
-        tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
+        begin
+          tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
+        rescue
+        end
       end
     end
   end

data/spec/cobweb/cobweb_crawler_spec.rb ADDED

@@ -0,0 +1,56 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+describe CobwebCrawler do
+  before(:each) do
+    @base_url = "http://www.baseurl.com/"
+    @default_headers = {"Cache-Control" => "private, max-age=0",
+                        "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
+                        "Expires" => "-1",
+                        "Content-Type" => "text/html; charset=UTF-8",
+                        "Content-Encoding" => "gzip",
+                        "Transfer-Encoding" => "chunked",
+                        "Server" => "gws",
+                        "X-XSS-Protection" => "1; mode=block"}
+  end
+  describe "with mock" do
+    it "should generate a cobweb_crawler object" do
+      CobwebCrawler.new.should be_an_instance_of CobwebCrawler
+    end
+    describe "crawl" do
+      it "should crawl a site" do
+        # temporary tests to run crawler - proper specs to follow.. honest
+        crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+        statistics = crawler.crawl("http://www.rockwellcottage.com/")
+        ap statistics
+      end
+      it "should take a block" do
+        # temporary tests to run crawler - proper specs to follow.. honest
+        crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+        statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content|
+          ap content[:url]
+        end
+        ap statistics
+      end
+    end
+  end
+end

data/spec/cobweb/cobweb_spec.rb CHANGED

@@ -1,5 +1,4 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
-require "ap"
 describe CobWeb do
@@ -28,6 +27,7 @@ describe CobWeb do
       @mock_http_response = mock(Net::HTTPResponse)
       @mock_http_redirect_response = mock(Net::HTTPRedirection)
+      @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
       @mock_http_get = mock(Net::HTTP::Get)
       Net::HTTP.stub!(:new).and_return(@mock_http_client)
@@ -171,9 +171,6 @@ describe CobWeb do
   end
   describe "without mock" do
-    it "should throw invalid url exception for an invalid url" do
-      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
-    end
     it "should throw exception when server is unavailable" #do
     #  lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-01-15 00:00:00.000000000 Z
+date: 2012-02-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &70227287202120 !ruby/object:Gem::Requirement
+  requirement: &70125097329480 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70227287202120
+  version_requirements: *70125097329480
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &70227287201020 !ruby/object:Gem::Requirement
+  requirement: &70125097328760 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70227287201020
+  version_requirements: *70125097328760
 - !ruby/object:Gem::Dependency
   name: absolutize
-  requirement: &70227287200100 !ruby/object:Gem::Requirement
+  requirement: &70125097328280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70227287200100
+  version_requirements: *70125097328280
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70227287199600 !ruby/object:Gem::Requirement
+  requirement: &70125097327660 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70227287199600
+  version_requirements: *70125097327660
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &70227287198900 !ruby/object:Gem::Requirement
+  requirement: &70125097327060 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,7 +65,18 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70227287198900
+  version_requirements: *70125097327060
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &70125097326400 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70125097326400
 description:
 email: stewart@rockwellcottage.com
 executables: []
@@ -73,12 +84,14 @@ extensions: []
 extra_rdoc_files:
 - README.textile
 files:
+- spec/cobweb/cobweb_crawler_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
 - spec/samples/sample_html_links.html
 - spec/spec.opts
 - spec/spec_helper.rb
 - lib/cobweb.rb
+- lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_process_job.rb
 - lib/content_link_parser.rb