simplecrawler 0.1.4 → 0.1.6

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/lib/simplecrawler.rb CHANGED
@@ -23,11 +23,11 @@ module SimpleCrawler
23
23
  require File.dirname(__FILE__) + '/document'
24
24
 
25
25
  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
26
- VERSION = "0.1.4"
26
+ VERSION = "0.1.6"
27
27
 
28
28
  class Crawler
29
29
 
30
- attr_accessor :user_agent, :skip_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
30
+ attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
31
31
 
32
32
  def initialize(url)
33
33
  @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
@@ -80,6 +80,19 @@ module SimpleCrawler
80
80
  end
81
81
  end
82
82
 
83
+ #Check if uri is in at least one of the include patterns
84
+ if @include_patterns
85
+ match_found = false
86
+ for include_pattern in @include_patterns
87
+ re = Regexp.new(include_pattern)
88
+ if re.match(uri.path)
89
+ match_found = true
90
+ end
91
+ end
92
+
93
+ return true unless match_found
94
+ end
95
+
83
96
  return false
84
97
  end
85
98
 
@@ -106,15 +119,17 @@ module SimpleCrawler
106
119
  uri.path = path if path != "/"
107
120
  doc.uri = uri
108
121
 
109
- log("Trying #{uri}")
122
+ log("Opening #{uri}")
110
123
 
111
124
  file = open(uri)
112
125
 
113
126
  mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
114
127
 
115
128
  if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
129
+ log("Loading data from #{uri}")
116
130
  doc.data = file.read
117
131
  else
132
+ log("Skipping data for #{uri}")
118
133
  doc.data = nil
119
134
  end
120
135
 
@@ -140,6 +155,7 @@ module SimpleCrawler
140
155
  begin
141
156
  uri = URI.parse(link.attributes["href"])
142
157
  add_uri(uri)
158
+ log(" Added #{uri}")
143
159
  rescue
144
160
  #skip this link
145
161
  end
@@ -50,6 +50,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
50
50
  assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
51
51
  end
52
52
 
53
+ def test_include_pattern
54
+ @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
55
+ assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
56
+ assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
57
+ assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
58
+ end
59
+
60
+
53
61
 
54
62
  def test_addded_paths_shuld_be_distinct
55
63
  @simplecrawler.add_uri("http://www.example.com/") # This path is already in the queue
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simplecrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Krantz
@@ -9,11 +9,12 @@ autorequire: simplecrawler
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-09-17 00:00:00 +02:00
12
+ date: 2008-11-28 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: hpricot
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -60,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
60
61
  requirements: []
61
62
 
62
63
  rubyforge_project: simplecrawler
63
- rubygems_version: 1.0.0
64
+ rubygems_version: 1.3.1
64
65
  signing_key:
65
66
  specification_version: 2
66
67
  summary: A generic library for web crawling.