simplecrawler 0.1.7 → 0.1.8

data/LICENSE ADDED
@@ -0,0 +1,14 @@
+ Copyright 2007, Peter Krantz
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License (LGPL) as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
data/lib/simplecrawler.rb CHANGED
@@ -1,12 +1,17 @@
- # == Simple Crawler
+ # == Simple Crawler
  # :title: SimpleCrawler - a generic web crawler library in Ruby
  # Author:: Peter Krantz (http://www.peterkrantz.com)
  # License:: LGPL (See LICENSE file)
  #
- # The SimpleCrawler module is a library for crawling web sites. The crawler provides comprehensive data from the page crawled which can be used for page analysis, indexing, accessibility checks etc. Restrictions can be specified to limit crawling of binary files.
+ # The SimpleCrawler module is a library for crawling web sites. The crawler
+ # provides comprehensive data from the page crawled which can be used for page
+ # analysis, indexing, accessibility checks etc. Restrictions can be specified
+ # to limit crawling of binary files.
  #
  # == Output
- # The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object instance. This object contains information about a specific URI such as http headers and response data etc.
+ # The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+ # instance. This object contains information about a specific URI such as http
+ # headers and response data etc.
  #
  # == Contributions
  # None yet :-) Why don't you go ahead and be first?
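
Based on the module documentation above and the Crawler API in the hunk that follows (crawl, maxcount, skip_patterns, and the Document accessors uri, headers, http_status, data), a minimal usage sketch might look like this; the start URL and patterns are illustrative only:

    require 'rubygems'
    require 'simplecrawler'

    # Crawl a site, skipping Word documents and stopping after 100 queued URIs.
    crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
    crawler.maxcount = 100
    crawler.skip_patterns = ["\\.doc$"]

    # Crawler#crawl yields one SimpleCrawler::Document per fetched URI.
    crawler.crawl do |document|
      puts "#{document.uri} #{document.http_status.inspect}"
    end
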
@@ -16,171 +21,175 @@
 
  module SimpleCrawler
 
- require 'uri'
- require 'rubygems'
- require 'hpricot'
- require 'open-uri'
- require File.dirname(__FILE__) + '/document'
-
- MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
- VERSION = "0.1.7"
-
- class Crawler
-
- attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
-
- def initialize(url)
- @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
- @site_uri = URI.parse(url)
- @site_uri.path = "/" if @site_uri.path == ""
- @visited = Hash.new
- @queue = Array.new
- @current_count = 0
- add_uri(@site_uri)
- end
-
-
- # Override this method for your own logging needs.
- def log(message)
- puts message
- end
-
- # Check if a path should be ignored because it matches a skip pattern or is already visited.
- def skip_uri?(uri)
-
- #Check if maxcount is reached
- if @maxcount
- if @current_count >= @maxcount
- return true
- end
- end
-
- #Check if path belongs to site
- unless (uri.relative? or uri.host == @site_uri.host)
- return true
- end
-
- #Check if fragment identifier (e.g. #content)
- if uri.path.length == 0 and uri.fragment.length > 0
- return true
- end
-
- #Check if uri already visited in this crawl or if it is queued for crawling
- if @visited.has_key?(uri.path) or @queue.include?(uri.path)
- return true
- end
-
- #Check if uri is in a skip pattern
- if @skip_patterns
- for skip_pattern in @skip_patterns
- re = Regexp.new(skip_pattern)
- if re.match(uri.path)
- return true
- end
- end
- end
-
- #Check if uri is in at least one of the include patterns
- if @include_patterns
- match_found = false
- for include_pattern in @include_patterns
- re = Regexp.new(include_pattern)
- if re.match(uri.path)
- match_found = true
- end
- end
-
- return true unless match_found
- end
-
- return false
- end
-
-
- def add_uri(uri)
-
- if uri.class == String
- uri = URI.parse(uri.strip)
- end
-
- unless skip_uri?(uri)
- @queue.push uri.path
- @current_count = @current_count + 1
- @visited[uri.path] = false
- log(" Added #{uri}")
- end
-
- end
-
-
- def get_doc(path)
- doc = Document.new
- begin
- uri = @site_uri.clone
- uri.path = path if path != "/"
- doc.uri = uri
- doc.fetched_at = Time.now
-
- log("Opening #{uri}")
-
- file = open(uri)
-
- mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
-
- if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
- log("Loading data from #{uri}")
- doc.data = file.read
- else
- log("Skipping data for #{uri}")
- doc.data = nil
- end
-
- doc.headers = file.meta
- doc.http_status = file.status
-
- rescue => error
- log("Error fetching #{uri}: #{error.message}")
- if error.message[0..2] =~ /\d\d\d/ then
- doc.http_status = [error.message[0..2], error.message[3..-1]]
- return doc
- else
- raise error
- end
- end
- return doc
- end
-
-
- def queue_local_links(doc)
- return if doc.data == nil
- log("Queuing links for #{doc.uri}")
- Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
- doc = Hpricot(doc.data)
- links = doc.search("a[@href]")
- for link in links
- if link.attributes["href"].length > 0 then
- begin
- uri = URI.parse(link.attributes["href"])
- add_uri(uri)
- rescue
- #skip this link
- end
- end
- end
- doc = nil
- end
-
-
- # Initiate crawling.
- def crawl()
- while (!@queue.empty?)
- uri = @queue.shift
- current_doc = get_doc(uri)
- yield current_doc
- queue_local_links(current_doc)
- @visited[uri] = true
- end
- end
-
- end
+ require 'uri'
+ require 'rubygems'
+ require 'hpricot'
+ require 'open-uri'
+ require File.dirname(__FILE__) + '/document'
+
+ MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
+ VERSION = "0.1.8"
+
+ class Crawler
+
+ attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+
+ def initialize(url)
+ @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
+ @site_uri = URI.parse(url)
+ @site_uri.path = "/" if @site_uri.path == ""
+ @visited = Hash.new
+ @queue = Array.new
+ @current_count = 0
+ add_uri(@site_uri)
+ end
+
+
+ # Override this method for your own logging needs.
+ def log(message)
+ puts message
+ end
+
+ # Check if a path should be ignored because it matches a skip pattern or is already visited.
+ def skip_uri?(uri)
+
+ #Check if maxcount is reached
+ if @maxcount
+ if @current_count >= @maxcount
+ return true
+ end
+ end
+
+ #Check if path belongs to site
+ unless (uri.relative? or uri.host == @site_uri.host)
+ return true
+ end
+
+ #Check if fragment identifier (e.g. #content)
+ if uri.request_uri.length == 0 and uri.fragment.length > 0
+ return true
+ end
+
+ #Check if uri already visited in this crawl or if it is queued for crawling
+ if @visited.has_key?(uri.request_uri) or @queue.include?(uri.request_uri)
+ return true
+ end
+
+ #Check if uri is in a skip pattern
+ if @skip_patterns
+ for skip_pattern in @skip_patterns
+ re = Regexp.new(skip_pattern)
+ if re.match(uri.request_uri)
+ return true
+ end
+ end
+ end
+
+ #Check if uri is in at least one of the include patterns
+ if @include_patterns
+ match_found = false
+ for include_pattern in @include_patterns
+ re = Regexp.new(include_pattern)
+ if re.match(uri.request_uri)
+ match_found = true
+ end
+ end
+
+ return true unless match_found
+ end
+
+ return false
+ end
+
+
+ def add_uri(uri)
+
+ if uri.class == String
+ uri = URI.parse(uri.strip)
+ end
+
+ unless skip_uri?(uri)
+ @queue.push uri.request_uri
+ @current_count = @current_count + 1
+ @visited[uri.request_uri] = false
+ log(" Added #{uri}")
+ end
+
+ end
+
+
+ def get_doc(request_uri)
+ doc = Document.new
+ begin
+ log(" Getting #{request_uri}")
+ request_uri = URI.parse(request_uri)
+
+ uri = @site_uri.clone
+ uri.path = request_uri.path #if request_uri.path.to_s != "/"
+ uri.query = request_uri.query
+ doc.uri = uri
+ doc.fetched_at = Time.now
+
+ log("Opening #{uri}")
+
+ file = open(uri)
+
+ mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
+
+ if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+ log("Loading data from #{uri}")
+ doc.data = file.read
+ else
+ log("Skipping data for #{uri}")
+ doc.data = nil
+ end
+
+ doc.headers = file.meta
+ doc.http_status = file.status
+
+ rescue => error
+ log("Error fetching #{uri}: #{error.message}")
+ if error.message[0..2] =~ /\d\d\d/ then
+ doc.http_status = [error.message[0..2], error.message[3..-1]]
+ return doc
+ else
+ raise error
+ end
+ end
+ return doc
+ end
+
+
+ def queue_local_links(doc)
+ return if doc.data == nil
+ log("Queuing links for #{doc.uri}")
+ Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
+ doc = Hpricot(doc.data)
+ links = doc.search("a[@href]")
+ for link in links
+ if link.attributes["href"].length > 0 then
+ begin
+ uri = URI.parse(link.attributes["href"])
+ add_uri(uri)
+ rescue
+ #skip this link
+ end
+ end
+ end
+ doc = nil
+ end
+
+
+ # Initiate crawling.
+ def crawl()
+ while (!@queue.empty?)
+ uri = @queue.shift
+ current_doc = get_doc(uri)
+ yield current_doc
+ queue_local_links(current_doc)
+ @visited[uri] = true
+ end
+ end
+
+ end
  end
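
The functional change in 0.1.8 is that skip_uri?, add_uri and get_doc now key on uri.request_uri instead of uri.path, and get_doc copies both path and query onto the cloned site URI, so query strings survive queuing, de-duplication and pattern matching. A short irb-style sketch of the standard-library URI behaviour this relies on:

    require 'uri'

    uri = URI.parse("http://www.example.com/test?a=b")
    uri.path         # => "/test"      (what 0.1.7 queued, dropping the query)
    uri.request_uri  # => "/test?a=b"  (what 0.1.8 queues instead)
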
data/tests/simplecrawler_test.rb CHANGED
@@ -3,11 +3,11 @@ require 'test/unit'
  require 'uri'
 
  class SimpleCrawlerTest < Test::Unit::TestCase
-
+
  def setup
  @simplecrawler = SimpleCrawler::Crawler.new("http://www.example.com/")
  end
-
+
 
  def test_initialize_crawler
  @crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
@@ -30,6 +30,11 @@ class SimpleCrawlerTest < Test::Unit::TestCase
  end
 
 
+ def test_include_pattern_query
+ @simplecrawler.include_patterns = ["\\/test\\?a=b"]
+ assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test?a=b"))
+ end
+
 
  def test_maxcount_limit
  @simplecrawler.maxcount = 2
@@ -49,14 +54,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
  assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
  assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
  end
-
+
  def test_include_pattern
  @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
  assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
  assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
  assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
  end
-
+
 
 
  def test_addded_paths_shuld_be_distinct
metadata CHANGED
@@ -1,72 +1,66 @@
- --- !ruby/object:Gem::Specification
+ --- !ruby/object:Gem::Specification
  name: simplecrawler
- version: !ruby/object:Gem::Version
- version: 0.1.7
+ version: !ruby/object:Gem::Version
+ version: 0.1.8
+ prerelease:
  platform: ruby
- authors:
+ authors:
  - Peter Krantz
- autorequire: simplecrawler
+ autorequire:
  bindir: bin
  cert_chain: []
-
- date: 2009-05-04 00:00:00 +02:00
+ date: 2011-11-02 00:00:00.000000000 +01:00
  default_executable:
- dependencies:
- - !ruby/object:Gem::Dependency
+ dependencies:
+ - !ruby/object:Gem::Dependency
  name: hpricot
+ requirement: &2161765200 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0.5'
  type: :runtime
- version_requirement:
- version_requirements: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: "0.5"
- version:
- description:
+ prerelease: false
+ version_requirements: *2161765200
+ description: ! "The SimpleCrawler module is a library for crawling web\n sites. The
+ crawler provides comprehensive data from the page crawled which\n can be used for
+ page analysis, indexing, accessibility checks etc.\n Restrictions can be specified
+ to limit crawling of binary files."
  email: peter.krantzNODAMNSPAM@gmail.com
  executables: []
-
  extensions: []
-
  extra_rdoc_files: []
-
- files:
- - README
+ files:
  - lib/document.rb
  - lib/simplecrawler.rb
+ - LICENSE
+ - README
  - tests/simplecrawler_test.rb
- - examples/accessibility_report.rb
- - examples/crawl.rb
- - examples/find_broken_links.rb
- - examples/find_pdfs.rb
- - examples/list_site_links.rb
- - examples/result.htm
- - examples/riksdagen.txt
  has_rdoc: true
  homepage: http://www.peterkrantz.com/simplecrawler/wiki/
+ licenses: []
  post_install_message:
  rdoc_options: []
-
- require_paths:
+ require_paths:
  - lib
- required_ruby_version: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
  version: 1.8.2
- version:
- required_rubygems_version: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: "0"
- version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  requirements: []
-
  rubyforge_project: simplecrawler
- rubygems_version: 1.3.1
+ rubygems_version: 1.6.2
  signing_key:
- specification_version: 2
+ specification_version: 3
  summary: A generic library for web crawling.
- test_files:
+ test_files:
  - tests/simplecrawler_test.rb