simplecrawler 0.1.4 → 0.1.6
- data/lib/simplecrawler.rb +19 -3
- data/tests/simplecrawler_test.rb +8 -0
- metadata +4 -3
data/lib/simplecrawler.rb
CHANGED
@@ -23,11 +23,11 @@ module SimpleCrawler
   require File.dirname(__FILE__) + '/document'
 
   MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
-  VERSION = "0.1.4"
+  VERSION = "0.1.6"
 
   class Crawler
 
-    attr_accessor :user_agent, :skip_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+    attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
 
     def initialize(url)
       @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
@@ -80,6 +80,19 @@ module SimpleCrawler
         end
       end
 
+      #Check if uri is in at least one of the include patterns
+      if @include_patterns
+        match_found = false
+        for include_pattern in @include_patterns
+          re = Regexp.new(include_pattern)
+          if re.match(uri.path)
+            match_found = true
+          end
+        end
+
+        return true unless match_found
+      end
+
       return false
     end
 
@@ -106,15 +119,17 @@ module SimpleCrawler
       uri.path = path if path != "/"
       doc.uri = uri
 
-      log("
+      log("Opening #{uri}")
 
       file = open(uri)
 
      mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
 
      if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+        log("Loading data from #{uri}")
        doc.data = file.read
      else
+        log("Skipping data for #{uri}")
        doc.data = nil
      end
 
@@ -140,6 +155,7 @@ module SimpleCrawler
       begin
         uri = URI.parse(link.attributes["href"])
         add_uri(uri)
+        log(" Added #{uri}")
       rescue
         #skip this link
       end
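The net effect of these changes is a new include_patterns option that works as the inverse of skip_patterns: when it is set, skip_uri? returns true for any URI whose path matches none of the given regular expressions. A minimal usage sketch based on the diff above; the crawl URL and pattern strings are made-up examples:

    require 'uri'
    require 'simplecrawler'

    crawler = SimpleCrawler::Crawler.new("http://www.example.com/")

    # Only visit URIs whose path matches at least one of these regexes.
    crawler.include_patterns = ["\\/docs\\/", "\\.html?$"]

    crawler.skip_uri?(URI.parse("http://www.example.com/images/logo.png")) # => true  (no pattern matches)
    crawler.skip_uri?(URI.parse("http://www.example.com/docs/index.html")) # => false (matches both patterns)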
data/tests/simplecrawler_test.rb
CHANGED
@@ -50,6 +50,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
   end
 
+  def test_include_pattern
+    @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
+    assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
+  end
+
+
 
   def test_addded_paths_shuld_be_distinct
     @simplecrawler.add_uri("http://www.example.com/") # This path is already in the queue
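The tests make the matching semantics concrete: each entry in include_patterns is compiled with Regexp.new and matched against uri.path only (not the host or query string), and a single match is enough to keep the URI. The loop-and-flag implementation in the diff could also be expressed with Enumerable#any?; a sketch of the equivalent check, not the shipped code:

    # Inside skip_uri?, equivalent to the new block added above.
    if @include_patterns
      return true unless @include_patterns.any? { |p| Regexp.new(p).match(uri.path) }
    end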
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: simplecrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.6
 platform: ruby
 authors:
 - Peter Krantz
@@ -9,11 +9,12 @@ autorequire: simplecrawler
 bindir: bin
 cert_chain: []
 
-date: 2008-
+date: 2008-11-28 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: hpricot
+  type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
@@ -60,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project: simplecrawler
-rubygems_version: 1.
+rubygems_version: 1.3.1
 signing_key:
 specification_version: 2
 summary: A generic library for web crawling.
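The added type: :runtime line marks hpricot as a runtime (rather than development) dependency in the serialized specification, a field newer RubyGems versions emit. In gemspec terms it corresponds to a declaration along these lines (a sketch; the actual gemspec file is not part of this diff):

    Gem::Specification.new do |s|
      s.name    = "simplecrawler"
      s.version = "0.1.6"
      # Serialized into the metadata above as "type: :runtime".
      s.add_runtime_dependency "hpricot"
    end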