simplecrawler 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/accessibility_report.rb +11 -3
- data/examples/find_pdfs.rb +20 -0
- data/lib/document.rb +16 -0
- data/lib/simplecrawler.rb +11 -22
- data/tests/simplecrawler_test.rb +1 -0
- metadata +4 -2
data/examples/accessibility_report.rb CHANGED

@@ -1,3 +1,8 @@
+# == Basic accessibility report - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used together with Raakt and Ruport to check basic accessibility of an entire website. For details on the error message id:s generated in the report see http://www.peterkrantz.com/raakt/wiki/error-message-ids
+#
 require '../lib/simplecrawler.rb'
 require 'raakt'
 require 'ruport'
@@ -5,9 +10,9 @@ require 'ruport'
 # Set up a new crawler
 sc = SimpleCrawler::Crawler.new(ARGV[0])
 sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
-sc.maxcount =
+sc.maxcount = 100

-report_data = Ruport::Data::Table.new :column_names => ["Url", "Error
+report_data = Ruport::Data::Table.new :column_names => ["Url", "Error"]

 sc.crawl { |document|

@@ -15,8 +20,11 @@ sc.crawl { |document|
   raakt = Raakt::Test.new(document.data)
   result = raakt.all
   puts "#{result.length}\t#{document.uri}"
+
   if result.length > 0
-
+    for error in result
+      report_data << [document.uri, error.eid.to_s]
+    end
   end
 }

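The hunks above do not show how report_data is rendered once sc.crawl returns, so here is a hypothetical way to print the collected report. It is a sketch, not part of the package, and it assumes Ruport's table rows can be indexed by column name:

  # Hypothetical addition after the crawl block in accessibility_report.rb:
  # print one tab-separated line per recorded accessibility error.
  report_data.each do |row|
    puts "#{row["Url"]}\t#{row["Error"]}"
  end
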
data/examples/find_pdfs.rb ADDED

@@ -0,0 +1,20 @@
+# == Find PDF documents - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used to find dcuments of a specific type on a website.
+#
+require '../lib/simplecrawler.rb'
+require 'raakt'
+require 'ruport'
+
+# Set up a new crawler
+sc = SimpleCrawler::Crawler.new(ARGV[0])
+sc.maxcount = 200 #Only crawl 200 pages
+
+sc.crawl { |document|
+
+  if document.headers["content-type"] == "application/pdf"
+    puts document.uri
+  end
+
+}
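One caveat about the example above: open-uri hands back the raw Content-Type header, which can carry parameters (for instance "application/pdf;charset=ISO-8859-1"), so the exact string comparison can miss such responses. The library itself splits on ";" before comparing MIME types (see the simplecrawler.rb diff below); a sketch of the same idea applied to this example:

  # More tolerant content-type check (a sketch, assuming the header
  # may include parameters after a semicolon):
  if document.headers["content-type"].to_s.split(";")[0] == "application/pdf"
    puts document.uri
  end
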
data/lib/document.rb ADDED

@@ -0,0 +1,16 @@
+module SimpleCrawler
+  class Document
+    attr_accessor :uri, :data, :headers, :fetched_at, :http_status
+
+    def to_s
+      puts "Document"
+      puts " .uri:\t\t#{uri}"
+      puts " .fetched_at:\t#{fetched_at}"
+      puts " .headers:"
+      for header in headers
+        puts " #{header[0]}: #{header[1]}"
+      end
+      puts " .data.length:\t#{(data.length)}"
+    end
+  end
+end
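Document is a plain value container; note that its to_s prints the field summary with puts instead of returning a string, so it behaves as a print helper rather than a conventional to_s. A hypothetical illustration of the class on its own (not from the package):

  doc = SimpleCrawler::Document.new
  doc.uri = "http://www.example.com/"
  doc.headers = { "content-type" => "text/html" }
  doc.data = "<html><body>Hello</body></html>"
  doc.fetched_at = Time.now
  doc.to_s   # prints .uri, .fetched_at, each header and .data.length
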
data/lib/simplecrawler.rb CHANGED

@@ -20,28 +20,10 @@ module SimpleCrawler
   require 'rubygems'
   require 'hpricot'
   require 'open-uri'
+  require File.dirname(__FILE__) + '/document'

   MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
-
-  VERSION = "0.1.0"
-
-  class Document
-    attr_accessor :uri, :data, :headers, :fetched_at
-
-    def to_s
-      puts "Document"
-      puts " .uri:\t\t#{uri}"
-      puts " .fetched_at:\t#{fetched_at}"
-      puts " .headers:"
-      for header in headers
-        puts " #{header[0]}: #{header[1]}"
-      end
-      puts " .data.length:\t#{(data.length)}"
-    end
-  end
-
-
-
+  VERSION = "0.1.1"

   class Crawler

@@ -58,6 +40,11 @@ module SimpleCrawler
     end


+    # Override this method for your own logging needs.
+    def log(message)
+      puts message
+    end
+
     # Check if a path should be ignored because it matches a skip pattern or is already visited.
     def skip_uri?(uri)

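Because log simply puts its message, redirecting crawler output is a matter of overriding one method. A minimal sketch using Ruby's standard Logger (the subclass name and log file name are illustrative, not part of the package):

  require 'logger'

  class LoggingCrawler < SimpleCrawler::Crawler
    CRAWL_LOG = Logger.new("crawl.log")

    # Send crawl messages to a file instead of stdout.
    def log(message)
      CRAWL_LOG.info(message)
    end
  end
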
@@ -119,6 +106,8 @@ module SimpleCrawler
       uri.path = uri.path + path if path != "/"
       doc.uri = uri

+      log("Trying #{uri}")
+
       file = open(uri)

       mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
@@ -132,8 +121,8 @@ module SimpleCrawler
         doc.headers = file.meta
         doc.fetched_at = Time.now
       rescue Exception
-
-        return
+        log("Error fetching [#{uri}]: #{$!}")
+        return doc
       end
       return doc
     end
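Note the behavioral change in the last hunk: a failed fetch used to return nil, but now logs the error and returns the partially populated doc, whose uri is set while data may be nil. If such documents reach the crawl block, a guard keeps user code robust; a sketch, assuming failed fetches are passed through to the block:

  sc.crawl { |document|
    next if document.data.nil?   # skip documents whose fetch failed
    # ... process the document ...
  }

Also worth knowing: rescue Exception catches everything, including interrupts, so a Ctrl-C during a fetch is logged and swallowed rather than stopping the crawl.
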
data/tests/simplecrawler_test.rb CHANGED
metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
 specification_version: 1
 name: simplecrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2007-08-
+  version: 0.1.1
+date: 2007-08-30 00:00:00 +02:00
 summary: A generic library for web crawling.
 require_paths:
 - lib
@@ -30,10 +30,12 @@ authors:
 - Peter Krantz
 files:
 - README
+- lib/document.rb
 - lib/simplecrawler.rb
 - tests/simplecrawler_test.rb
 - examples/accessibility_report.rb
 - examples/crawl.rb
+- examples/find_pdfs.rb
 - examples/list_site_links.rb
 test_files:
 - tests/simplecrawler_test.rb