simplecrawler 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +14 -0
- data/lib/simplecrawler.rb +179 -170
- data/tests/simplecrawler_test.rb +9 -4
- metadata +40 -46
- data/examples/accessibility_report.rb +0 -44
- data/examples/crawl.rb +0 -12
- data/examples/find_broken_links.rb +0 -21
- data/examples/find_pdfs.rb +0 -20
- data/examples/list_site_links.rb +0 -11
- data/examples/result.htm +0 -1282
- data/examples/riksdagen.txt +0 -66
data/LICENSE
ADDED
@@ -0,0 +1,14 @@
+Copright 2007, Peter Krantz
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License (LGPL) as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
data/lib/simplecrawler.rb
CHANGED
@@ -1,12 +1,17 @@
-# == Simple Crawler
+# == Simple Crawler
 # :title: SimpleCrawler - a generic web crawler library in Ruby
 # Author:: Peter Krantz (http://www.peterkrantz.com)
 # License:: LGPL (See LICENSE file)
 #
-# The SimpleCrawler module is a library for crawling web sites. The crawler
+# The SimpleCrawler module is a library for crawling web sites. The crawler
+# provides comprehensive data from the page crawled which can be used for page
+# analysis, indexing, accessibility checks etc. Restrictions can be specified
+# to limit crawling of binary files.
 #
 # == Output
-# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object
+# instance. This object contains information about a specific URI such as http
+# headers and response data etc.
 #
 # == Contributions
 # None yet :-) Why don't you go ahead and be first?

@@ -16,171 +21,175 @@

 module SimpleCrawler

[old lines 19-185, the 0.1.7 implementation of the module body, were removed; their content is not shown in this diff view]
+  require 'uri'
+  require 'rubygems'
+  require 'hpricot'
+  require 'open-uri'
+  require File.dirname(__FILE__) + '/document'
+
+  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
+  VERSION = "0.1.8"
+
+  class Crawler
+
+    attr_accessor :user_agent, :skip_patterns, :include_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+
+    def initialize(url)
+      @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
+      @site_uri = URI.parse(url)
+      @site_uri.path = "/" if @site_uri.path == ""
+      @visited = Hash.new
+      @queue = Array.new
+      @current_count = 0
+      add_uri(@site_uri)
+    end
+
+
+    # Override this method for your own logging needs.
+    def log(message)
+      puts message
+    end
+
+    # Check if a path should be ignored because it matches a skip pattern or is already visited.
+    def skip_uri?(uri)
+
+      #Check if maxcount is reached
+      if @maxcount
+        if @current_count >= @maxcount
+          return true
+        end
+      end
+
+      #Check if path belongs to site
+      unless (uri.relative? or uri.host == @site_uri.host)
+        return true
+      end
+
+      #Check if fragment identifier (e.g. #content)
+      if uri.request_uri.length == 0 and uri.fragment.length > 0
+        return true
+      end
+
+      #Check if uri already visited in this crawl or if it is queued for crawling
+      if @visited.has_key?(uri.request_uri) or @queue.include?(uri.request_uri)
+        return true
+      end
+
+      #Check if uri is in a skip pattern
+      if @skip_patterns
+        for skip_pattern in @skip_patterns
+          re = Regexp.new(skip_pattern)
+          if re.match(uri.request_uri)
+            return true
+          end
+        end
+      end
+
+      #Check if uri is in at least one of the include patterns
+      if @include_patterns
+        match_found = false
+        for include_pattern in @include_patterns
+          re = Regexp.new(include_pattern)
+          if re.match(uri.request_uri)
+            match_found = true
+          end
+        end
+
+        return true unless match_found
+      end
+
+      return false
+    end
+
+
+    def add_uri(uri)
+
+      if uri.class == String
+        uri = URI.parse(uri.strip)
+      end
+
+      unless skip_uri?(uri)
+        @queue.push uri.request_uri
+        @current_count = @current_count + 1
+        @visited[uri.request_uri] = false
+        log(" Added #{uri}")
+      end
+
+    end
+
+
+    def get_doc(request_uri)
+      doc = Document.new
+      begin
+        log(" Getting #{request_uri}")
+        request_uri = URI.parse(request_uri)
+
+        uri = @site_uri.clone
+        uri.path = request_uri.path #if request_uri.path.to_s != "/"
+        uri.query = request_uri.query
+        doc.uri = uri
+        doc.fetched_at = Time.now
+
+        log("Opening #{uri}")
+
+        file = open(uri)
+
+        mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
+
+        if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+          log("Loading data from #{uri}")
+          doc.data = file.read
+        else
+          log("Skipping data for #{uri}")
+          doc.data = nil
+        end
+
+        doc.headers = file.meta
+        doc.http_status = file.status
+
+      rescue => error
+        log("Error fetching #{uri}: #{error.message}")
+        if error.message[0..2] =~ /\d\d\d/ then
+          doc.http_status = [error.message[0..2], error.message[3..-1]]
+          return doc
+        else
+          raise error
+        end
+      end
+      return doc
+    end
+
+
+    def queue_local_links(doc)
+      return if doc.data == nil
+      log("Queuing links for #{doc.uri}")
+      Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
+      doc = Hpricot(doc.data)
+      links = doc.search("a[@href]")
+      for link in links
+        if link.attributes["href"].length > 0 then
+          begin
+            uri = URI.parse(link.attributes["href"])
+            add_uri(uri)
+          rescue
+            #skip this link
+          end
+        end
+      end
+      doc = nil
+    end
+
+
+    # Initiate crawling.
+    def crawl()
+      while (!@queue.empty?)
+        uri = @queue.shift
+        current_doc = get_doc(uri)
+        yield current_doc
+        queue_local_links(current_doc)
+        @visited[uri] = true
+      end
+    end
+
+  end
 end
data/tests/simplecrawler_test.rb
CHANGED
@@ -3,11 +3,11 @@ require 'test/unit'
 require 'uri'

 class SimpleCrawlerTest < Test::Unit::TestCase
-
+
   def setup
     @simplecrawler = SimpleCrawler::Crawler.new("http://www.example.com/")
   end
-
+

   def test_initialize_crawler
     @crawler = SimpleCrawler::Crawler.new("http://www.example.com/")

@@ -30,6 +30,11 @@ class SimpleCrawlerTest < Test::Unit::TestCase
   end


+  def test_include_pattern_query
+    @simplecrawler.include_patterns = ["\\/test\\?a=b"]
+    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test?a=b"))
+  end
+

   def test_maxcount_limit
     @simplecrawler.maxcount = 2

@@ -49,14 +54,14 @@ class SimpleCrawlerTest < Test::Unit::TestCase
     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
   end
-
+
   def test_include_pattern
     @simplecrawler.include_patterns = ["\\/test\\/", "docs"]
     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/test/doc.htm"))
     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/docs/doc.htm"))
   end
-
+


   def test_addded_paths_shuld_be_distinct
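
The new test_include_pattern_query case pins down that include patterns are applied by skip_uri? to the full request URI, query string included. A small illustration under the same assumptions as the test (hypothetical URLs):

```ruby
require 'rubygems'
require 'simplecrawler'
require 'uri'

sc = SimpleCrawler::Crawler.new("http://www.example.com/")
sc.include_patterns = ["\\/test\\?a=b"]

# "/test?a=b" matches the include pattern, so the URI is not skipped.
sc.skip_uri?(URI.parse("http://www.example.com/test?a=b"))  # => false
# "/other" matches no include pattern, so skip_uri? returns true.
sc.skip_uri?(URI.parse("http://www.example.com/other"))     # => true
```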
metadata
CHANGED
@@ -1,72 +1,66 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: simplecrawler
-version: !ruby/object:Gem::Version
-  version: 0.1.7
+version: !ruby/object:Gem::Version
+  version: 0.1.8
+  prerelease:
 platform: ruby
-authors:
+authors:
 - Peter Krantz
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-
-date: 2009-05-04 00:00:00 +02:00
+date: 2011-11-02 00:00:00.000000000 +01:00
 default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+dependencies:
+- !ruby/object:Gem::Dependency
   name: hpricot
+  requirement: &2161765200 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0.5'
   type: :runtime
-
-  version_requirements:
-
-
-
-
-  version:
-description:
+  prerelease: false
+  version_requirements: *2161765200
+description: ! "The SimpleCrawler module is a library for crawling web\n  sites. The
+  crawler provides comprehensive data from the page crawled which\n  can be used for
+  page analysis, indexing, accessibility checks etc.\n  Restrictions can be specified
+  to limit crawling of binary files."
 email: peter.krantzNODAMNSPAM@gmail.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files:
-- README
+files:
 - lib/document.rb
 - lib/simplecrawler.rb
+- LICENSE
+- README
 - tests/simplecrawler_test.rb
-- examples/accessibility_report.rb
-- examples/crawl.rb
-- examples/find_broken_links.rb
-- examples/find_pdfs.rb
-- examples/list_site_links.rb
-- examples/result.htm
-- examples/riksdagen.txt
 has_rdoc: true
 homepage: http://www.peterkrantz.com/simplecrawler/wiki/
+licenses: []
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-
-
-
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
       version: 1.8.2
-
-
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-      version:
-  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: simplecrawler
-rubygems_version: 1.
+rubygems_version: 1.6.2
 signing_key:
-specification_version:
+specification_version: 3
 summary: A generic library for web crawling.
-test_files:
+test_files:
 - tests/simplecrawler_test.rb