simplecrawler 0.1.0
- data/README +4 -0
- data/examples/accessibility_report.rb +23 -0
- data/examples/crawl.rb +12 -0
- data/examples/list_site_links.rb +11 -0
- data/lib/simplecrawler.rb +173 -0
- data/tests/simplecrawler_test.rb +101 -0
- metadata +59 -0
data/README
ADDED
data/examples/accessibility_report.rb
ADDED
@@ -0,0 +1,23 @@
require '../lib/simplecrawler.rb'
require 'raakt'
require 'ruport'

# Set up a new crawler
sc = SimpleCrawler::Crawler.new(ARGV[0])
sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.zip$"]
sc.maxcount = 10

report_data = Ruport::Data::Table.new :column_names => ["Url", "Error count"]

sc.crawl { |document|

  # Run basic accessibility check
  raakt = Raakt::Test.new(document.data)
  result = raakt.all
  puts "#{result.length}\t#{document.uri}"
  if result.length > 0
    report_data << [document.uri, result.length]
  end
}

puts report_data
data/examples/crawl.rb
ADDED
@@ -0,0 +1,12 @@
require '../lib/simplecrawler.rb'

# Set up a new crawler
sc = SimpleCrawler::Crawler.new(ARGV[0])
sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.zip$"]

sc.crawl { |document|

  # Print links for entire site
  puts document.uri

}
data/examples/list_site_links.rb
ADDED
@@ -0,0 +1,11 @@
require '../lib/simplecrawler.rb'

# Set up a new crawler
sc = SimpleCrawler::Crawler.new(ARGV[0])
sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.zip$"]

sc.crawl { |document|
  # List links for entire site
  puts document.uri
}
data/lib/simplecrawler.rb
ADDED
@@ -0,0 +1,173 @@
# == Simple Crawler
# :title: SimpleCrawler - a generic web crawler library in Ruby
# Author:: Peter Krantz (http://www.peterkrantz.com)
# License:: LGPL (See LICENSE file)
#
# The SimpleCrawler module is a library for crawling web sites. The crawler provides comprehensive data about each page crawled, which can be used for page analysis, indexing, accessibility checks and more. Restrictions can be specified to limit the crawling of binary files.
#
# == Output
# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object instance for each page crawled. This object holds information about a specific URI, such as the HTTP headers and the response data.
#
# == Contributions
# None yet :-) Why don't you go ahead and be first?
#
# == Example usage
# See the "Simple Crawler wiki"[http://www.peterkrantz.com/simplecrawler/wiki/].
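#
# A minimal sketch, adapted from examples/crawl.rb in this gem. The URL, skip
# patterns and maxcount below are placeholder values:
#
#   require 'simplecrawler'
#
#   sc = SimpleCrawler::Crawler.new("http://www.example.com/")
#   sc.skip_patterns = ["\\.pdf$", "\\.zip$"]   # regexps matched against each URI path
#   sc.maxcount = 100                           # stop queueing new URIs after 100
#
#   sc.crawl { |document|
#     puts document.uri                         # URI that was fetched
#     puts document.headers["content-type"]     # HTTP response headers (a hash)
#     # document.data holds the response body for markup mime types, nil otherwise
#   }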

module SimpleCrawler

  require 'uri'
  require 'rubygems'
  require 'hpricot'
  require 'open-uri'

  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]

  VERSION = "0.1.0"

  class Document
    attr_accessor :uri, :data, :headers, :fetched_at

    def to_s
      puts "Document"
      puts " .uri:\t\t#{uri}"
      puts " .fetched_at:\t#{fetched_at}"
      puts " .headers:"
      for header in headers
        puts "   #{header[0]}: #{header[1]}"
      end
      puts " .data.length:\t#{data.length}"
    end
  end


  class Crawler

    attr_accessor :user_agent, :skip_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count

    def initialize(url)
      @load_binary_data = false # default: skip loading of page data for binary files into Document.data
      @site_uri = URI.parse(url)
      @site_uri.path = "/" if @site_uri.path == ""
      @visited = Hash.new
      @queue = Array.new
      @current_count = 0
      add_uri(@site_uri)
    end


    # Check if a path should be ignored because it matches a skip pattern or is already visited.
    def skip_uri?(uri)

      # Check if maxcount is reached
      if @maxcount
        if @current_count >= @maxcount
          return true
        end
      end

      # Check if path belongs to site
      unless (uri.relative? or uri.host == @site_uri.host)
        return true
      end

      # Check if fragment identifier (e.g. #content)
      if uri.path.length == 0 and uri.fragment.length > 0
        return true
      end

      # Check if uri already visited in this crawl or if it is queued for crawling
      if @visited.has_key?(uri.path) or @queue.include?(uri.path)
        return true
      end

      # Check if uri matches a skip pattern
      if @skip_patterns
        for skip_pattern in @skip_patterns
          re = Regexp.new(skip_pattern)
          if re.match(uri.path)
            return true
          end
        end
      end

      return false
    end


    # Queue a URI (string or URI object) for crawling unless it should be skipped.
    def add_uri(uri)

      if uri.class == String
        uri = URI.parse(uri.strip)
      end

      unless skip_uri?(uri)
        @queue.push uri.path
        @current_count = @current_count + 1
        @visited[uri.path] = false
      end

    end


    # Fetch a path and return a populated Document instance.
    def get_doc(path)
      doc = Document.new
      begin
        uri = @site_uri.clone
        uri.path = uri.path + path if path != "/"
        doc.uri = uri

        file = open(uri)

        mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]

        if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
          doc.data = file.read
        else
          doc.data = nil
        end

        doc.headers = file.meta
        doc.fetched_at = Time.now
      rescue Exception
        puts "Error: #{$!}"
        return doc # return the document as-is; data stays nil on a failed fetch
      end
      return doc
    end


    # Extract links from a markup document and queue those that belong to the site.
    def queue_local_links(doc)
      return if doc.data == nil
      Hpricot.buffer_size = 262144 # Allow for asp.net bastard-sized viewstate attributes...
      doc = Hpricot(doc.data)
      links = doc.search("a[@href]")
      for link in links
        if link.attributes["href"].length > 0 then
          begin
            uri = URI.parse(link.attributes["href"])
            add_uri(uri)
          rescue
            # skip this link
          end
        end
      end
      doc = nil
    end


    # Initiate crawling.
    def crawl()
      while (!@queue.empty?)
        uri = @queue.shift
        current_doc = get_doc(uri)
        yield current_doc
        queue_local_links(current_doc)
        @visited[uri] = true
      end
    end

  end
end
data/tests/simplecrawler_test.rb
ADDED
@@ -0,0 +1,101 @@
require File.dirname(__FILE__) + '/../lib/simplecrawler'
require 'test/unit'
require 'uri'

class SimpleCrawlerTest < Test::Unit::TestCase

  def setup
    @simplecrawler = SimpleCrawler::Crawler.new("http://www.example.com/")
  end


  def test_initialize_crawler
    @crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
    assert @crawler.queue.length == 1
  end


  def test_initialize_crawler_without_uri_path
    @crawler = SimpleCrawler::Crawler.new("http://www.example.com")
    assert @crawler.queue.length == 1

    # Default path (/) should be appended
    assert @crawler.queue[0][-1..-1] == "/"
  end


  def test_maxcount_limit
    @simplecrawler.maxcount = 2
    @simplecrawler.add_uri("http://www.example.com/second/")
    @simplecrawler.add_uri("http://www.example.com/third/")
    assert_equal 2, @simplecrawler.queue.length
  end

  def test_maxcount_unlimited
    @simplecrawler.add_uri("http://www.example.com/second/")
    @simplecrawler.add_uri("http://www.example.com/third/")
    assert @simplecrawler.queue.length == 3
  end

  def test_skip_uri
    @simplecrawler.skip_patterns = ["\\.doc$"]
    assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
    assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
  end


  def test_added_paths_should_be_distinct
    @simplecrawler.add_uri("http://www.example.com/") # This path is already in the queue
    assert_equal 1, @simplecrawler.queue.length
  end

  def test_add_uri
    @simplecrawler.add_uri("http://www.example.com/new/")

    # The queue should now contain the initial base url and the newly added path
    assert_equal 2, @simplecrawler.queue.length
  end


  def test_add_uri_with_space
    @simplecrawler.add_uri("http://www.example.com/new/ ")

    # The queue should now contain the initial base url and the newly added path without spaces
    assert_equal 2, @simplecrawler.queue.length
    assert @simplecrawler.queue[1][-1..-1] != " "
  end


  def test_queue_local_link
    doc = SimpleCrawler::Document.new
    doc.data = "<html><head></head><body><a href=\"http://www.example.com/new/\">Test</a></body></html>"
    @simplecrawler.queue_local_links(doc)
    assert_equal 2, @simplecrawler.queue.length
  end


  def test_queue_local_fragment_identifier_skipped
    doc = SimpleCrawler::Document.new
    doc.data = "<html><head></head><body><a href=\"#new\">Test</a></body></html>"
    @simplecrawler.queue_local_links(doc)
    assert_equal 1, @simplecrawler.queue.length
  end


  def test_queue_local_links_for_empty_doc
    doc = SimpleCrawler::Document.new
    doc.data = ""
    @simplecrawler.queue_local_links(doc)
    assert_equal 1, @simplecrawler.queue.length
  end


  def test_queue_local_links_for_nil_doc
    doc = SimpleCrawler::Document.new
    doc.data = nil
    @simplecrawler.queue_local_links(doc)
    assert_equal 1, @simplecrawler.queue.length
  end
end
metadata
ADDED
@@ -0,0 +1,59 @@
--- !ruby/object:Gem::Specification
rubygems_version: 0.9.2
specification_version: 1
name: simplecrawler
version: !ruby/object:Gem::Version
  version: 0.1.0
date: 2007-08-27 00:00:00 +02:00
summary: A generic library for web crawling.
require_paths:
- lib
email: peter.krantzNODAMNSPAM@gmail.com
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
rubyforge_project: simplecrawler
description:
autorequire: simplecrawler
default_executable:
bindir: bin
has_rdoc: true
required_ruby_version: !ruby/object:Gem::Version::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 1.8.2
  version:
platform: ruby
signing_key:
cert_chain:
post_install_message:
authors:
- Peter Krantz
files:
- README
- lib/simplecrawler.rb
- tests/simplecrawler_test.rb
- examples/accessibility_report.rb
- examples/crawl.rb
- examples/list_site_links.rb
test_files:
- tests/simplecrawler_test.rb
rdoc_options: []

extra_rdoc_files: []

executables: []

extensions: []

requirements: []

dependencies:
- !ruby/object:Gem::Dependency
  name: hpricot
  version_requirement:
  version_requirements: !ruby/object:Gem::Version::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: "0.5"
    version: