simplecrawler 0.1.7 → 0.1.8
- data/LICENSE +14 -0
- data/lib/simplecrawler.rb +179 -170
- data/tests/simplecrawler_test.rb +9 -4
- metadata +40 -46
- data/examples/accessibility_report.rb +0 -44
- data/examples/crawl.rb +0 -12
- data/examples/find_broken_links.rb +0 -21
- data/examples/find_pdfs.rb +0 -20
- data/examples/list_site_links.rb +0 -11
- data/examples/result.htm +0 -1282
- data/examples/riksdagen.txt +0 -66
data/LICENSE
ADDED
@@ -0,0 +1,14 @@
+Copright 2007, Peter Krantz
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License (LGPL) as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
data/lib/simplecrawler.rb
CHANGED
@@ -1,12 +1,17 @@
-# == Simple Crawler
+# == Simple Crawler
 # :title: SimpleCrawler - a generic web crawler library in Ruby
 # Author:: Peter Krantz (http://www.peterkrantz.com)
 # License:: LGPL (See LICENSE file)
 #
-# The SimpleCrawler module is a library for crawling web sites. The crawler
+# The SimpleCrawler module is a library for crawling web sites. The crawler
+# provides comprehensive data from the page crawled which can be used for page
+# analysis, indexing, accessibility checks etc. Restrictions can be specified
+# to limit crawling of binary files.
 #
 # == Output
-# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+# instance. This object contains information about a specific URI such as http
+# headers and response data etc.
 #
 # == Contributions
 # None yet :-) Why don't you go ahead and be first?
@@ -16,171 +21,175 @@
 
 module SimpleCrawler
 
[old lines 19-185 removed; content not captured in this view]
+  require 'uri'
+  require 'rubygems'
+  require 'hpricot'
+  require 'open-uri'
+  require File.dirname(__FILE__) + '/document'
+
+  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
+  VERSION = "0.1.8"
+
+  class Crawler
+
+    attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+
+    def initialize(url)
+      @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
+      @site_uri = URI.parse(url)
+      @site_uri.path = "/" if @site_uri.path == ""
+      @visited = Hash.new
+      @queue = Array.new
+      @current_count = 0
+      add_uri(@site_uri)
+    end
+
+
+    # Override this method for your own logging needs.
+    def log(message)
+      puts message
+    end
+
+    # Check if a path should be ignored because it matches a skip pattern or is already visited.
+    def skip_uri?(uri)
+
+      #Check if maxcount is reached
+      if @maxcount
+        if @current_count >= @maxcount
+          return true
+        end
+      end
+
+      #Check if path belongs to site
+      unless (uri.relative? or uri.host == @site_uri.host)
+        return true
+      end
+
+      #Check if fragment identifier (e.g. #content)
+      if uri.request_uri.length == 0 and uri.fragment.length > 0
+        return true
+      end
+
+      #Check if uri already visited in this crawl or if it is queued for crawling
+      if @visited.has_key?(uri.request_uri) or @queue.include?(uri.request_uri)
+        return true
+      end
+
+      #Check if uri is in a skip pattern
+      if @skip_patterns
+        for skip_pattern in @skip_patterns
+          re = Regexp.new(skip_pattern)
+          if re.match(uri.request_uri)
+            return true
+          end
+        end
+      end
+
+      #Check if uri is in at least one of the include patterns
+      if @include_patterns
+        match_found = false
+        for include_pattern in @include_patterns
+          re = Regexp.new(include_pattern)
+          if re.match(uri.request_uri)
+            match_found = true
+          end
+        end
+
+        return true unless match_found
+      end
+
+      return false
+    end
+
+
+    def add_uri(uri)
+
+      if uri.class == String
+        uri = URI.parse(uri.strip)
+      end
+
+      unless skip_uri?(uri)
+        @queue.push uri.request_uri
+        @current_count = @current_count + 1
+        @visited[uri.request_uri] = false
+        log(" Added #{uri}")
+      end
+
+    end
+
+
+    def get_doc(request_uri)
+      doc = Document.new
+      begin
+        log(" Getting #{request_uri}")
+        request_uri = URI.parse(request_uri)
+
+        uri = @site_uri.clone
+        uri.path = request_uri.path #if request_uri.path.to_s != "/"
+        uri.query = request_uri.query
+        doc.uri = uri
+        doc.fetched_at = Time.now
+
+        log("Opening #{uri}")
+
+        file = open(uri)
+
+        mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
+
+        if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+          log("Loading data from #{uri}")
+          doc.data = file.read
+        else
+          log("Skipping data for #{uri}")
+          doc.data = nil
+        end
+
+        doc.headers = file.meta
+        doc.http_status = file.status
+
+      rescue => error
+        log("Error fetching #{uri}: #{error.message}")
+        if error.message[0..2] =~ /\d\d\d/ then
+          doc.http_status = [error.message[0..2], error.message[3..-1]]
+          return doc
+        else
+          raise error
+        end
+      end
+      return doc
+    end
+
+
+    def queue_local_links(doc)
+      return if doc.data == nil
+      log("Queuing links for #{doc.uri}")
+      Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
+      doc = Hpricot(doc.data)
+      links = doc.search("a[@href]")
+      for link in links
+        if link.attributes["href"].length > 0 then
+          begin
+            uri = URI.parse(link.attributes["href"])
+            add_uri(uri)
+          rescue
+            #skip this link
+          end
+        end
+      end
+      doc = nil
+    end
+
+
+    # Initiate crawling.
+    def crawl()
+      while (!@queue.empty?)
+        uri = @queue.shift
+        current_doc = get_doc(uri)
+        yield current_doc
+        queue_local_links(current_doc)
+        @visited[uri] = true
+      end
+    end
+
+  end
 end
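
For orientation, a minimal usage sketch of the 0.1.8 API shown in the diff above (the site URL, skip pattern and page limit are illustrative values, not part of the gem):

    require 'rubygems'
    require 'simplecrawler'

    # Crawl a site, skip Word documents, and stop queuing new URLs after 100 pages.
    crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
    crawler.skip_patterns = ["\\.doc$"]
    crawler.maxcount = 100

    # Crawler#crawl yields one SimpleCrawler::Document per fetched URL, carrying
    # the URI, HTTP status, response headers and (for markup mime types) the body.
    crawler.crawl do |doc|
      puts "#{doc.uri} #{doc.http_status.inspect}"
    end
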
data/tests/simplecrawler_test.rb
CHANGED
@@ -3,11 +3,11 @@ require 'test/unit'
 require 'uri'
 
 class SimpleCrawlerTest < Test::Unit::TestCase
-
+
   def setup
     @simplecrawler = SimpleCrawler::Crawler.new("http://www.example.com/")
   end
-
+
 
   def test_initialize_crawler
     @crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
@@ -30,6 +30,11 @@ class SimpleCrawlerTest < Test::Unit::TestCase
   end
 
 
+  def test_include_pattern_query
+    @simplecrawler.include_patterns = ["\\/test\\?a=b"]
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test?a=b"))
+  end
+
 
   def test_maxcount_limit
     @simplecrawler.maxcount = 2
@@ -49,14 +54,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
   end
-
+
   def test_include_pattern
     @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
   end
-
+
 
 
   def test_addded_paths_shuld_be_distinct
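
The new test_include_pattern_query case pins down that include patterns are matched against the full request URI, i.e. the path plus the query string, since skip_uri? builds each Regexp and tests it against uri.request_uri. A small sketch of the same check outside the test suite (URL and pattern are illustrative):

    require 'uri'
    require 'simplecrawler'

    crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
    # The pattern is a regular expression source string, so "/" and "?" are escaped.
    crawler.include_patterns = ["\\/test\\?a=b"]
    crawler.skip_uri?(URI.parse("http://www.example.com/test?a=b"))  # => false, so the URL would be crawled
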
metadata
CHANGED
@@ -1,72 +1,66 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: simplecrawler
-version: !ruby/object:Gem::Version
-  version: 0.1.7
+version: !ruby/object:Gem::Version
+  version: 0.1.8
+  prerelease:
 platform: ruby
-authors:
+authors:
 - Peter Krantz
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-
-date: 2009-05-04 00:00:00 +02:00
+date: 2011-11-02 00:00:00.000000000 +01:00
 default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+dependencies:
+- !ruby/object:Gem::Dependency
   name: hpricot
+  requirement: &2161765200 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0.5'
   type: :runtime
-
-  version_requirements:
[old lines 20-23 removed; content not captured in this view]
-    version:
-description:
+  prerelease: false
+  version_requirements: *2161765200
+description: ! "The SimpleCrawler module is a library for crawling web\n sites. The
+  crawler provides comprehensive data from the page crawled which\n can be used for
+  page analysis, indexing, accessibility checks etc.\n Restrictions can be specified
+  to limit crawling of binary files."
 email: peter.krantzNODAMNSPAM@gmail.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files:
-- README
+files:
 - lib/document.rb
 - lib/simplecrawler.rb
+- LICENSE
+- README
 - tests/simplecrawler_test.rb
-- examples/accessibility_report.rb
-- examples/crawl.rb
-- examples/find_broken_links.rb
-- examples/find_pdfs.rb
-- examples/list_site_links.rb
-- examples/result.htm
-- examples/riksdagen.txt
 has_rdoc: true
 homepage: http://www.peterkrantz.com/simplecrawler/wiki/
+licenses: []
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
[old lines 53-55 removed; content not captured in this view]
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
       version: 1.8.2
[old lines 57-58 removed; content not captured in this view]
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-      version:
-  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: simplecrawler
-rubygems_version: 1.
+rubygems_version: 1.6.2
 signing_key:
-specification_version:
+specification_version: 3
 summary: A generic library for web crawling.
-test_files:
+test_files:
 - tests/simplecrawler_test.rb
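
The regenerated metadata records the 0.1.8 version, the bundled LICENSE and README, and hpricot (>= 0.5) as a runtime dependency in the newer RubyGems metadata format. A hedged sketch of a gemspec that would produce this kind of metadata (the gemspec file itself is not part of this diff, so its exact layout is an assumption):

    # simplecrawler.gemspec (illustrative; not included in the gem diff)
    Gem::Specification.new do |s|
      s.name    = "simplecrawler"
      s.version = "0.1.8"
      s.summary = "A generic library for web crawling."
      s.authors = ["Peter Krantz"]
      s.files   = ["lib/document.rb", "lib/simplecrawler.rb", "LICENSE", "README", "tests/simplecrawler_test.rb"]
      # Declares the runtime dependency recorded in the metadata above.
      s.add_dependency "hpricot", ">= 0.5"
    end
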