simplecrawler 0.1.0 → 0.1.1
- data/examples/accessibility_report.rb +11 -3
- data/examples/find_pdfs.rb +20 -0
- data/lib/document.rb +16 -0
- data/lib/simplecrawler.rb +11 -22
- data/tests/simplecrawler_test.rb +1 -0
- metadata +4 -2
data/examples/accessibility_report.rb
CHANGED
@@ -1,3 +1,8 @@
+# == Basic accessibility report - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used together with Raakt and Ruport to check basic accessibility of an entire website. For details on the error message ids generated in the report see http://www.peterkrantz.com/raakt/wiki/error-message-ids
+#
 require '../lib/simplecrawler.rb'
 require 'raakt'
 require 'ruport'
@@ -5,9 +10,9 @@ require 'ruport'
 # Set up a new crawler
 sc = SimpleCrawler::Crawler.new(ARGV[0])
 sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
-sc.maxcount =
+sc.maxcount = 100
 
-report_data = Ruport::Data::Table.new :column_names => ["Url", "Error
+report_data = Ruport::Data::Table.new :column_names => ["Url", "Error"]
 
 sc.crawl { |document|
 
@@ -15,8 +20,11 @@ sc.crawl { |document|
   raakt = Raakt::Test.new(document.data)
   result = raakt.all
   puts "#{result.length}\t#{document.uri}"
+
   if result.length > 0
-
+    for error in result
+      report_data << [document.uri, error.eid.to_s]
+    end
   end
 }
 
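The example above accumulates Raakt errors in the Ruport table, but the diffed portion never renders it. A minimal way to output the table once the crawl finishes (a sketch, not code from the gem; to_text and to_csv are standard Ruport table formatters):

# After sc.crawl returns, render the accumulated report as plain text.
puts report_data.to_text

# Or persist it for a spreadsheet:
File.open("report.csv", "w") { |f| f.write(report_data.to_csv) }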
data/examples/find_pdfs.rb
ADDED
@@ -0,0 +1,20 @@
+# == Find PDF documents - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used to find documents of a specific type on a website.
+#
+require '../lib/simplecrawler.rb'
+require 'raakt'
+require 'ruport'
+
+# Set up a new crawler
+sc = SimpleCrawler::Crawler.new(ARGV[0])
+sc.maxcount = 200 # Only crawl 200 pages
+
+sc.crawl { |document|
+
+  if document.headers["content-type"] == "application/pdf"
+    puts document.uri
+  end
+
+}
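Note that the strict equality check misses servers that append parameters to the content-type header, for example "application/pdf; charset=ISO-8859-1". A more tolerant variant, mirroring the split(";") handling simplecrawler itself applies to markup types (a hypothetical block body, not part of the gem):

sc.crawl { |document|
  # Discard any ";charset=..." parameter before comparing.
  mime_type = document.headers["content-type"].to_s.split(";")[0]
  puts document.uri if mime_type == "application/pdf"
}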
data/lib/document.rb
ADDED
@@ -0,0 +1,16 @@
+module SimpleCrawler
+  class Document
+    attr_accessor :uri, :data, :headers, :fetched_at, :http_status
+
+    def to_s
+      puts "Document"
+      puts " .uri:\t\t#{uri}"
+      puts " .fetched_at:\t#{fetched_at}"
+      puts " .headers:"
+      for header in headers
+        puts "   #{header[0]}: #{header[1]}"
+      end
+      puts " .data.length:\t#{(data.length)}"
+    end
+  end
+end
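Besides moving the class into its own file, this adds an http_status accessor that the old inline Document lacked. A short usage sketch (it assumes the crawler fills in http_status, which this diff does not show):

sc.crawl { |document|
  document.to_s  # prints uri, fetched_at, headers and data length
  puts "HTTP status: #{document.http_status}"
}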
data/lib/simplecrawler.rb
CHANGED
@@ -20,28 +20,10 @@ module SimpleCrawler
   require 'rubygems'
   require 'hpricot'
   require 'open-uri'
+  require File.dirname(__FILE__) + '/document'
 
   MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
-
-  VERSION = "0.1.0"
-
-  class Document
-    attr_accessor :uri, :data, :headers, :fetched_at
-
-    def to_s
-      puts "Document"
-      puts " .uri:\t\t#{uri}"
-      puts " .fetched_at:\t#{fetched_at}"
-      puts " .headers:"
-      for header in headers
-        puts "   #{header[0]}: #{header[1]}"
-      end
-      puts " .data.length:\t#{(data.length)}"
-    end
-  end
-
-
-
+  VERSION = "0.1.1"
 
   class Crawler
 
@@ -58,6 +40,11 @@ module SimpleCrawler
     end
 
 
+    # Override this method for your own logging needs.
+    def log(message)
+      puts message
+    end
+
     # Check if a path should be ignored because it matches a skip pattern or is already visited.
     def skip_uri?(uri)
 
@@ -119,6 +106,8 @@ module SimpleCrawler
       uri.path = uri.path + path if path != "/"
       doc.uri = uri
 
+      log("Trying #{uri}")
+
       file = open(uri)
 
       mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
@@ -132,8 +121,8 @@ module SimpleCrawler
       doc.headers = file.meta
      doc.fetched_at = Time.now
     rescue Exception
-
-      return
+      log("Error fetching [#{uri}]: #{$!}")
+      return doc
     end
     return doc
   end
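Two behavioural notes on the changes above. First, because log is an ordinary instance method, crawler output can be redirected or silenced per instance without patching the gem (the log file name below is illustrative):

sc = SimpleCrawler::Crawler.new(ARGV[0])

# Append crawl messages to a file instead of stdout.
def sc.log(message)
  File.open("crawl.log", "a") { |f| f.puts message }
end

Second, a failed fetch used to return nil, but now logs the error and returns the partially populated Document, whose data may be nil. If the crawler passes such documents on to the crawl block, callers that feed document.data to other libraries, like the Raakt example above, may want a guard (a defensive sketch, not code from the gem):

sc.crawl { |document|
  next if document.data.nil?  # fetch failed; the error was already logged
  raakt = Raakt::Test.new(document.data)
  # ...
}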
data/tests/simplecrawler_test.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
 specification_version: 1
 name: simplecrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2007-08-
+  version: 0.1.1
+date: 2007-08-30 00:00:00 +02:00
 summary: A generic library for web crawling.
 require_paths:
 - lib
@@ -30,10 +30,12 @@ authors:
 - Peter Krantz
 files:
 - README
+- lib/document.rb
 - lib/simplecrawler.rb
 - tests/simplecrawler_test.rb
 - examples/accessibility_report.rb
 - examples/crawl.rb
+- examples/find_pdfs.rb
 - examples/list_site_links.rb
 test_files:
 - tests/simplecrawler_test.rb