simplecrawler 0.1.4 → 0.1.6

data/lib/simplecrawler.rb CHANGED
@@ -23,11 +23,11 @@ module SimpleCrawler
   require File.dirname(__FILE__) + '/document'
 
   MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
-  VERSION = "0.1.4"
+  VERSION = "0.1.6"
 
   class Crawler
 
-    attr_accessor :user_agent, :skip_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+    attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
 
     def initialize(url)
       @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
@@ -80,6 +80,19 @@ module SimpleCrawler
         end
       end
 
+      #Check if uri is in at least one of the include patterns
+      if @include_patterns
+        match_found = false
+        for include_pattern in @include_patterns
+          re = Regexp.new(include_pattern)
+          if re.match(uri.path)
+            match_found = true
+          end
+        end
+
+        return true unless match_found
+      end
+
       return false
     end
 
@@ -106,15 +119,17 @@ module SimpleCrawler
       uri.path = path if path != "/"
       doc.uri = uri
 
-      log("Trying #{uri}")
+      log("Opening #{uri}")
 
       file = open(uri)
 
      mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
 
       if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+        log("Loading data from #{uri}")
         doc.data = file.read
       else
+        log("Skipping data for #{uri}")
         doc.data = nil
       end
 
@@ -140,6 +155,7 @@ module SimpleCrawler
         begin
           uri = URI.parse(link.attributes["href"])
           add_uri(uri)
+          log(" Added #{uri}")
         rescue
           #skip this link
         end
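
The changes to simplecrawler.rb above boil down to an include_patterns filter plus more verbose logging. A minimal usage sketch follows; it is illustrative only and assumes the block-based crawl method from earlier releases (not part of this diff), which yields each fetched page as a Document:

    require 'rubygems'
    require 'simplecrawler'

    # Illustrative settings: only follow paths under /docs/ or ending in .html.
    sc = SimpleCrawler::Crawler.new("http://www.example.com/")
    sc.maxcount = 50                                   # stop after 50 documents
    sc.include_patterns = ["\\/docs\\/", "\\.html$"]   # strings compiled with Regexp.new against uri.path
    sc.load_binary_data = false                        # non-markup responses keep doc.data = nil

    # Assumed API: crawl yields each visited page as a Document with uri and data.
    sc.crawl do |document|
      puts "#{document.uri} (#{document.data ? document.data.length : 0} bytes)"
    end
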
@@ -50,6 +50,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
   end
 
+  def test_include_pattern
+    @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
+    assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
+  end
+
+
 
   def test_addded_paths_shuld_be_distinct
     @simplecrawler.add_uri("http://www.example.com/") # This path is already in the queue
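
Note that each include pattern is a plain string compiled with Regexp.new and matched against uri.path only, so the host and query string play no part in this check. A small illustration under the same assumptions as the test above (fresh crawler, same host):

    require 'uri'
    require 'simplecrawler'

    sc = SimpleCrawler::Crawler.new("http://www.example.com/")
    sc.include_patterns = ["\\.html$"]

    # Only the path is matched, so a query string does not defeat the pattern.
    sc.skip_uri?(URI.parse("http://www.example.com/page.html?lang=en"))  # => false, "/page.html" matches
    sc.skip_uri?(URI.parse("http://www.example.com/image.png"))          # => true, no include pattern matches
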
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: simplecrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.6
 platform: ruby
 authors:
 - Peter Krantz
@@ -9,11 +9,12 @@ autorequire: simplecrawler
 bindir: bin
 cert_chain: []
 
-date: 2008-09-17 00:00:00 +02:00
+date: 2008-11-28 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: hpricot
+  type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
@@ -60,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project: simplecrawler
-rubygems_version: 1.0.0
+rubygems_version: 1.3.1
 signing_key:
 specification_version: 2
 summary: A generic library for web crawling.