simplecrawler 0.1.0

data/README ADDED
@@ -0,0 +1,4 @@
+ SimpleCrawler - a web crawling library for Ruby
+
+ see http://www.peterkrantz.com/simplecrawler/wiki/ for more information
+
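The README above is terse, so here is a minimal usage sketch assembled from the Crawler API defined in lib/simplecrawler.rb further down this page; the target URL and skip patterns are placeholders, and the guard on the yielded object reflects the fact that get_doc returns an empty string when a fetch fails.

    # Crawl a site and print each fetched URI with its content type (placeholder host).
    require 'rubygems'
    require 'simplecrawler'

    sc = SimpleCrawler::Crawler.new("http://www.example.com/")
    sc.skip_patterns = ["\\.pdf$", "\\.zip$"]  # regexps matched against each URI path
    sc.maxcount = 50                           # stop queueing new URIs after 50

    sc.crawl do |document|
      next unless document.kind_of?(SimpleCrawler::Document)  # get_doc yields "" on fetch errors
      puts "#{document.uri} (#{document.headers["content-type"]})"
    end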
data/examples/accessibility_report.rb ADDED
@@ -0,0 +1,23 @@
+ require '../lib/simplecrawler.rb'
+ require 'raakt'
+ require 'ruport'
+
+ # Set up a new crawler
+ sc = SimpleCrawler::Crawler.new(ARGV[0])
+ sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
+ sc.maxcount = 10
+
+ report_data = Ruport::Data::Table.new :column_names => ["Url", "Error count"]
+
+ sc.crawl { |document|
+
+   # Run basic accessibility check
+   raakt = Raakt::Test.new(document.data)
+   result = raakt.all
+   puts "#{result.length}\t#{document.uri}"
+   if result.length > 0
+     report_data << [document.uri, result.length]
+   end
+ }
+
+ puts report_data
data/examples/crawl.rb ADDED
@@ -0,0 +1,12 @@
+ require '../lib/simplecrawler.rb'
+
+ # Set up a new crawler
+ sc = SimpleCrawler::Crawler.new(ARGV[0])
+ sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
+
+ sc.crawl { |document|
+
+   # Print links for entire site
+   puts document.uri
+
+ }
data/examples/list_site_links.rb ADDED
@@ -0,0 +1,11 @@
+ require '../lib/simplecrawler.rb'
+
+ # Set up a new crawler
+ sc = SimpleCrawler::Crawler.new(ARGV[0])
+ sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
+
+ sc.crawl { |document|
+   # List links for entire site
+   puts document.uri
+ }
+
data/lib/simplecrawler.rb ADDED
@@ -0,0 +1,173 @@
+ # == Simple Crawler
+ # :title: SimpleCrawler - a generic web crawler library in Ruby
+ # Author:: Peter Krantz (http://www.peterkrantz.com)
+ # License:: LGPL (See LICENSE file)
+ #
+ # The SimpleCrawler module is a library for crawling web sites. The crawler provides comprehensive data from the page crawled which can be used for page analysis, indexing, accessibility checks etc. Restrictions can be specified to limit crawling of binary files.
+ #
+ # == Output
+ # The SimpleCrawler::Crawler class yields a SimpleCrawler::Document object instance. This object contains information about a specific URI such as http headers and response data etc.
+ #
+ # == Contributions
+ # None yet :-) Why don't you go ahead and be first?
+ #
+ # == Example usage
+ # See the "Simple Crawler wiki"[http://www.peterkrantz.com/simplecrawler/wiki/].
+
+ module SimpleCrawler
+
+   require 'uri'
+   require 'rubygems'
+   require 'hpricot'
+   require 'open-uri'
+
+   MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
+
+   VERSION = "0.1.0"
+
+   class Document
+     attr_accessor :uri, :data, :headers, :fetched_at
+
+     def to_s
+       puts "Document"
+       puts " .uri:\t\t#{uri}"
+       puts " .fetched_at:\t#{fetched_at}"
+       puts " .headers:"
+       for header in headers
+         puts " #{header[0]}: #{header[1]}"
+       end
+       puts " .data.length:\t#{(data.length)}"
+     end
+   end
+
+
+
+
+   class Crawler
+
+     attr_accessor :user_agent, :skip_patterns, :load_binary_data, :queue, :visited, :site_uri, :maxcount, :current_count
+
+     def initialize(url)
+       @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data
+       @site_uri = URI.parse(url)
+       @site_uri.path = "/" if @site_uri.path == ""
+       @visited = Hash.new
+       @queue = Array.new
+       @current_count = 0
+       add_uri(@site_uri)
+     end
+
+
+     # Check if a path should be ignored because it matches a skip pattern or is already visited.
+     def skip_uri?(uri)
+
+       #Check if maxcount is reached
+       if @maxcount
+         if @current_count >= @maxcount
+           return true
+         end
+       end
+
+       #Check if path belongs to site
+       unless (uri.relative? or uri.host == @site_uri.host)
+         return true
+       end
+
+       #Check if fragment identifier (e.g. #content)
+       if uri.path.length == 0 and uri.fragment.length > 0
+         return true
+       end
+
+       #Check if uri already visited in this crawl or if it is queued for crawling
+       if @visited.has_key?(uri.path) or @queue.include?(uri.path)
+         return true
+       end
+
+       #Check if uri is in a skip pattern
+       if @skip_patterns
+         for skip_pattern in @skip_patterns
+           re = Regexp.new(skip_pattern)
+           if re.match(uri.path)
+             return true
+           end
+         end
+       end
+
+       return false
+     end
+
+
+     def add_uri(uri)
+
+       if uri.class == String
+         uri = URI.parse(uri.strip)
+       end
+
+       unless skip_uri?(uri)
+         @queue.push uri.path
+         @current_count = @current_count + 1
+         @visited[uri.path] = false
+       end
+
+     end
+
+
+     def get_doc(path)
+       doc = Document.new
+       begin
+         uri = @site_uri.clone
+         uri.path = uri.path + path if path != "/"
+         doc.uri = uri
+
+         file = open(uri)
+
+         mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
+
+         if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data
+           doc.data = file.read
+         else
+           doc.data = nil
+         end
+
+         doc.headers = file.meta
+         doc.fetched_at = Time.now
+       rescue Exception
+         puts "Error: #{$!}"
+         return ""
+       end
+       return doc
+     end
+
+
+     def queue_local_links(doc)
+       return if doc.data == nil
+       Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
+       doc = Hpricot(doc.data)
+       links = doc.search("a[@href]")
+       for link in links
+         if link.attributes["href"].length > 0 then
+           begin
+             uri = URI.parse(link.attributes["href"])
+             add_uri(uri)
+           rescue
+             #skip this link
+           end
+         end
+       end
+       doc = nil
+     end
+
+
+     # Initiate crawling.
+     def crawl()
+       while (!@queue.empty?)
+         uri = @queue.shift
+         current_doc = get_doc(uri)
+         yield current_doc
+         queue_local_links(current_doc)
+         @visited[uri] = true
+       end
+     end
+
+   end
+ end
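To make the queueing rules in skip_uri? concrete, here is a small illustrative sketch with placeholder hostnames; the expected results follow directly from the checks above, where foreign hosts, already-queued paths and skip-pattern matches are all rejected.

    require 'rubygems'
    require 'simplecrawler'
    require 'uri'

    sc = SimpleCrawler::Crawler.new("http://www.example.com/")
    sc.skip_patterns = ["\\.pdf$"]

    sc.skip_uri?(URI.parse("http://www.other-host.com/page"))     #=> true  (host differs from site_uri.host)
    sc.skip_uri?(URI.parse("http://www.example.com/report.pdf"))  #=> true  (path matches a skip pattern)
    sc.skip_uri?(URI.parse("http://www.example.com/"))            #=> true  ("/" was already queued by initialize)
    sc.skip_uri?(URI.parse("http://www.example.com/about"))       #=> false (new, same-host path)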
data/tests/simplecrawler_test.rb ADDED
@@ -0,0 +1,101 @@
+ require File.dirname(__FILE__) + '/../lib/simplecrawler'
+ require 'test/unit'
+ require 'uri'
+
+ class SimpleCrawlerTest < Test::Unit::TestCase
+
+   def setup
+     @simplecrawler = SimpleCrawler::Crawler.new("http://www.example.com/")
+   end
+
+
+   def test_initialize_crawler
+     @crawler = SimpleCrawler::Crawler.new("http://www.example.com/")
+     assert @crawler.queue.length == 1
+   end
+
+
+   def test_initialize_crawler_without_uri_path
+     @crawler = SimpleCrawler::Crawler.new("http://www.example.com")
+     assert @crawler.queue.length == 1
+
+     # Default path (/) should be appended
+     assert @crawler.queue[0][-1..-1] == "/"
+   end
+
+
+
+   def test_maxcount_limit
+     @simplecrawler.maxcount = 2
+     @simplecrawler.add_uri("http://www.example.com/second/")
+     @simplecrawler.add_uri("http://www.example.com/third/")
+     assert_equal 2, @simplecrawler.queue.length
+   end
+
+   def test_maxcount_unlimited
+     @simplecrawler.add_uri("http://www.example.com/second/")
+     @simplecrawler.add_uri("http://www.example.com/third/")
+     assert @simplecrawler.queue.length == 3
+   end
+
+   def test_skip_uri
+     @simplecrawler.skip_patterns = ["\\.doc$"]
+     assert @simplecrawler.skip_uri?(URI.parse("http://www.example.com/word.doc"))
+     assert_equal false, @simplecrawler.skip_uri?(URI.parse("http://www.example.com/doc.htm"))
+   end
+
+
+   def test_addded_paths_shuld_be_distinct
+     @simplecrawler.add_uri("http://www.example.com/") # This path is already in the queue
+     assert_equal 1, @simplecrawler.queue.length
+   end
+
+   def test_add_uri
+     @simplecrawler.add_uri("http://www.example.com/new/")
+
+     # The queue should now contain the initial base url and the newly added path
+     assert_equal 2, @simplecrawler.queue.length
+   end
+
+
+   def test_add_uri_with_space
+     @simplecrawler.add_uri("http://www.example.com/new/ ")
+
+     # The queue should now contain the initial base url and the newly added path without spaces
+     assert_equal 2, @simplecrawler.queue.length
+     assert @simplecrawler.queue[1][-1..-1] != " "
+   end
+
+
+
+   def test_queue_local_link
+     doc = SimpleCrawler::Document.new
+     doc.data = "<html><head></head><body><a href=\"http://www.example.com/new/\">Test</a></body></html>"
+     @simplecrawler.queue_local_links(doc)
+     assert_equal 2, @simplecrawler.queue.length
+   end
+
+
+   def test_queue_local_fragment_identifier_skipped
+     doc = SimpleCrawler::Document.new
+     doc.data = "<html><head></head><body><a href=\"#new\">Test</a></body></html>"
+     @simplecrawler.queue_local_links(doc)
+     assert_equal 1, @simplecrawler.queue.length
+   end
+
+
+   def test_queue_local_links_for_empty_doc
+     doc = SimpleCrawler::Document.new
+     doc.data = ""
+     @simplecrawler.queue_local_links(doc)
+     assert_equal 1, @simplecrawler.queue.length
+   end
+
+
+   def test_queue_local_links_for_nil_doc
+     doc = SimpleCrawler::Document.new
+     doc.data = nil
+     @simplecrawler.queue_local_links(doc)
+     assert_equal 1, @simplecrawler.queue.length
+   end
+ end
metadata ADDED
@@ -0,0 +1,59 @@
+ --- !ruby/object:Gem::Specification
+ rubygems_version: 0.9.2
+ specification_version: 1
+ name: simplecrawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ date: 2007-08-27 00:00:00 +02:00
+ summary: A generic library for web crawling.
+ require_paths:
+ - lib
+ email: peter.krantzNODAMNSPAM@gmail.com
+ homepage: http://www.peterkrantz.com/simplecrawler/wiki/
+ rubyforge_project: simplecrawler
+ description:
+ autorequire: simplecrawler
+ default_executable:
+ bindir: bin
+ has_rdoc: true
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.8.2
+   version:
+ platform: ruby
+ signing_key:
+ cert_chain:
+ post_install_message:
+ authors:
+ - Peter Krantz
+ files:
+ - README
+ - lib/simplecrawler.rb
+ - tests/simplecrawler_test.rb
+ - examples/accessibility_report.rb
+ - examples/crawl.rb
+ - examples/list_site_links.rb
+ test_files:
+ - tests/simplecrawler_test.rb
+ rdoc_options: []
+
+ extra_rdoc_files: []
+
+ executables: []
+
+ extensions: []
+
+ requirements: []
+
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Version::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0.5"
+     version:
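According to the gemspec above, the library targets Ruby 1.8.2 or newer, sets autorequire to simplecrawler and depends on hpricot >= 0.5. A hedged sketch of activating it from client code, in RubyGems-era style and assuming the gem is already installed:

    require 'rubygems'
    gem 'simplecrawler', '>= 0.1.0'  # activate this gem version; its hpricot dependency comes along
    require 'simplecrawler'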