simplecrawler 0.1.7 → 0.1.8
- data/LICENSE +14 -0
- data/lib/simplecrawler.rb +179 -170
- data/tests/simplecrawler_test.rb +9 -4
- metadata +40 -46
- data/examples/accessibility_report.rb +0 -44
- data/examples/crawl.rb +0 -12
- data/examples/find_broken_links.rb +0 -21
- data/examples/find_pdfs.rb +0 -20
- data/examples/list_site_links.rb +0 -11
- data/examples/result.htm +0 -1282
- data/examples/riksdagen.txt +0 -66
data/LICENSE
ADDED
@@ -0,0 +1,14 @@
+Copright 2007, Peter Krantz
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License (LGPL) as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
data/lib/simplecrawler.rb
CHANGED
@@ -1,12 +1,17 @@
-# == Simple Crawler
+# == Simple Crawler
 # :title: SimpleCrawler - a generic web crawler library in Ruby
 # Author:: Peter Krantz (http://www.peterkrantz.com)
 # License:: LGPL (See LICENSE file)
 #
-# The SimpleCrawler module is a library for crawling web sites. The crawler
+# The SimpleCrawler module is a library for crawling web sites. The crawler
+# provides comprehensive data from the page crawled which can be used for page
+# analysis, indexing, accessibility checks etc. Restrictions can be specified
+# to limit crawling of binary files.
 #
 # == Output
-# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+# instance. This object contains information about a specific URI such as http
+# headers and response data etc.
 #
 # == Contributions
 # None yet :-) Why don't you go ahead and be first?
@@ -16,171 +21,175 @@
 
 module SimpleCrawler
 
[old lines 19-185 removed; content not captured in this view]
+  require 'uri'
+  require 'rubygems'
+  require 'hpricot'
+  require 'open-uri'
+  require File.dirname(__FILE__) + '/document'
+
+  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
+  VERSION = "0.1.8"
+
+  class Crawler
+
+    attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+
+    def initialize(url)
+      @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
+      @site_uri = URI.parse(url)
+      @site_uri.path = "/" if @site_uri.path == ""
+      @visited = Hash.new
+      @queue = Array.new
+      @current_count = 0
+      add_uri(@site_uri)
+    end
+
+
+    # Override this method for your own logging needs.
+    def log(message)
+      puts message
+    end
+
+    # Check if a path should be ignored because it matches a skip pattern or is already visited.
+    def skip_uri?(uri)
+
+      #Check if maxcount is reached
+      if @maxcount
+        if @current_count >= @maxcount
+          return true
+        end
+      end
+
+      #Check if path belongs to site
+      unless (uri.relative? or uri.host == @site_uri.host)
+        return true
+      end
+
+      #Check if fragment identifier (e.g. #content)
+      if uri.request_uri.length == 0 and uri.fragment.length > 0
+        return true
+      end
+
+      #Check if uri already visited in this crawl or if it is queued for crawling
+      if @visited.has_key?(uri.request_uri) or @queue.include?(uri.request_uri)
+        return true
+      end
+
+      #Check if uri is in a skip pattern
+      if @skip_patterns
+        for skip_pattern in @skip_patterns
+          re = Regexp.new(skip_pattern)
+          if re.match(uri.request_uri)
+            return true
+          end
+        end
+      end
+
+      #Check if uri is in at least one of the include patterns
+      if @include_patterns
+        match_found = false
+        for include_pattern in @include_patterns
+          re = Regexp.new(include_pattern)
+          if re.match(uri.request_uri)
+            match_found = true
+          end
+        end
+
+        return true unless match_found
+      end
+
+      return false
+    end
+
+
+    def add_uri(uri)
+
+      if uri.class == String
+        uri = URI.parse(uri.strip)
+      end
+
+      unless skip_uri?(uri)
+        @queue.push uri.request_uri
+        @current_count = @current_count + 1
+        @visited[uri.request_uri] = false
+        log(" Added #{uri}")
+      end
+
+    end
+
+
+    def get_doc(request_uri)
+      doc = Document.new
+      begin
+        log(" Getting #{request_uri}")
+        request_uri = URI.parse(request_uri)
+
+        uri = @site_uri.clone
+        uri.path = request_uri.path #if request_uri.path.to_s != "/"
+        uri.query = request_uri.query
+        doc.uri = uri
+        doc.fetched_at = Time.now
+
+        log("Opening #{uri}")
+
+        file = open(uri)
+
+        mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
+
+        if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+          log("Loading data from #{uri}")
+          doc.data = file.read
+        else
+          log("Skipping data for #{uri}")
+          doc.data = nil
+        end
+
+        doc.headers = file.meta
+        doc.http_status = file.status
+
+      rescue => error
+        log("Error fetching #{uri}: #{error.message}")
+        if error.message[0..2] =~ /\d\d\d/ then
+          doc.http_status = [error.message[0..2], error.message[3..-1]]
+          return doc
+        else
+          raise error
+        end
+      end
+      return doc
+    end
+
+
+    def queue_local_links(doc)
+      return if doc.data == nil
+      log("Queuing links for #{doc.uri}")
+      Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
+      doc = Hpricot(doc.data)
+      links = doc.search("a[@href]")
+      for link in links
+        if link.attributes["href"].length > 0 then
+          begin
+            uri = URI.parse(link.attributes["href"])
+            add_uri(uri)
+          rescue
+            #skip this link
+          end
+        end
+      end
+      doc = nil
+    end
+
+
+    # Initiate crawling.
+    def crawl()
+      while (!@queue.empty?)
+        uri = @queue.shift
+        current_doc = get_doc(uri)
+        yield current_doc
+        queue_local_links(current_doc)
+        @visited[uri] = true
+      end
+    end
+
+  end
 end
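
For orientation, a minimal usage sketch of the 0.1.8 API shown in the diff above (the site URL, skip pattern and page limit are illustrative values, not part of the gem):

    require 'rubygems'
    require 'simplecrawler'

    # Crawl a site, skip Word documents, and stop queuing new URLs after 100 pages.
    crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
    crawler.skip_patterns = ["\\.doc$"]
    crawler.maxcount = 100

    # Crawler#crawl yields one SimpleCrawler::Document per fetched URL, carrying
    # the URI, HTTP status, response headers and (for markup mime types) the body.
    crawler.crawl do |doc|
      puts "#{doc.uri} #{doc.http_status.inspect}"
    end
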
data/tests/simplecrawler_test.rb
CHANGED
@@ -3,11 +3,11 @@ require 'test/unit'
 require 'uri'
 
 class SimpleCrawlerTest < Test::Unit::TestCase
-
+
   def setup
     @simplecrawler = SimpleCrawler::Crawler.new("http://www.example.com/")
   end
-
+
 
   def test_initialize_crawler
     @crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
@@ -30,6 +30,11 @@ class SimpleCrawlerTest < Test::Unit::TestCase
   end
 
 
+  def test_include_pattern_query
+    @simplecrawler.include_patterns = ["\\/test\\?a=b"]
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test?a=b"))
+  end
+
 
   def test_maxcount_limit
     @simplecrawler.maxcount = 2
@@ -49,14 +54,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
   end
-
+
   def test_include_pattern
     @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
   end
-
+
 
 
   def test_addded_paths_shuld_be_distinct
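
The new test_include_pattern_query case pins down that include patterns are matched against the full request URI, i.e. the path plus the query string, since skip_uri? builds each Regexp and tests it against uri.request_uri. A small sketch of the same check outside the test suite (URL and pattern are illustrative):

    require 'uri'
    require 'simplecrawler'

    crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
    # The pattern is a regular expression source string, so "/" and "?" are escaped.
    crawler.include_patterns = ["\\/test\\?a=b"]
    crawler.skip_uri?(URI.parse("http://www.example.com/test?a=b"))  # => false, so the URL would be crawled
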
metadata
CHANGED
@@ -1,72 +1,66 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: simplecrawler
-version: !ruby/object:Gem::Version
-  version: 0.1.7
+version: !ruby/object:Gem::Version
+  version: 0.1.8
+  prerelease:
 platform: ruby
-authors:
+authors:
 - Peter Krantz
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-
-date: 2009-05-04 00:00:00 +02:00
+date: 2011-11-02 00:00:00.000000000 +01:00
 default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+dependencies:
+- !ruby/object:Gem::Dependency
   name: hpricot
+  requirement: &2161765200 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0.5'
   type: :runtime
-
-  version_requirements:
[old lines 20-23 removed; content not captured in this view]
-    version:
-description:
+  prerelease: false
+  version_requirements: *2161765200
+description: ! "The SimpleCrawler module is a library for crawling web\n sites. The
+  crawler provides comprehensive data from the page crawled which\n can be used for
+  page analysis, indexing, accessibility checks etc.\n Restrictions can be specified
+  to limit crawling of binary files."
 email: peter.krantzNODAMNSPAM@gmail.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files:
-- README
+files:
 - lib/document.rb
 - lib/simplecrawler.rb
+- LICENSE
+- README
 - tests/simplecrawler_test.rb
-- examples/accessibility_report.rb
-- examples/crawl.rb
-- examples/find_broken_links.rb
-- examples/find_pdfs.rb
-- examples/list_site_links.rb
-- examples/result.htm
-- examples/riksdagen.txt
 has_rdoc: true
 homepage: http://www.peterkrantz.com/simplecrawler/wiki/
+licenses: []
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
[old lines 53-55 removed; content not captured in this view]
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
       version: 1.8.2
[old lines 57-58 removed; content not captured in this view]
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-      version:
-  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: simplecrawler
-rubygems_version: 1.
+rubygems_version: 1.6.2
 signing_key:
-specification_version:
+specification_version: 3
 summary: A generic library for web crawling.
-test_files:
+test_files:
 - tests/simplecrawler_test.rb
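
The regenerated metadata records the 0.1.8 version, the bundled LICENSE and README, and hpricot (>= 0.5) as a runtime dependency in the newer RubyGems metadata format. A hedged sketch of a gemspec that would produce this kind of metadata (the gemspec file itself is not part of this diff, so its exact layout is an assumption):

    # simplecrawler.gemspec (illustrative; not included in the gem diff)
    Gem::Specification.new do |s|
      s.name    = "simplecrawler"
      s.version = "0.1.8"
      s.summary = "A generic library for web crawling."
      s.authors = ["Peter Krantz"]
      s.files   = ["lib/document.rb", "lib/simplecrawler.rb", "LICENSE", "README", "tests/simplecrawler_test.rb"]
      # Declares the runtime dependency recorded in the metadata above.
      s.add_dependency "hpricot", ">= 0.5"
    end
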