simplecrawler 0.1.0 → 0.1.1

examples/accessibility_report.rb CHANGED
@@ -1,3 +1,8 @@
+# == Basic accessibility report - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used together with Raakt and Ruport to check the basic accessibility of an entire website. For details on the error message ids generated in the report, see http://www.peterkrantz.com/raakt/wiki/error-message-ids
+#
 require '../lib/simplecrawler.rb'
 require 'raakt'
 require 'ruport'
@@ -5,9 +10,9 @@ require 'ruport'
 # Set up a new crawler
 sc = SimpleCrawler::Crawler.new(ARGV[0])
 sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
-sc.maxcount = 10
+sc.maxcount = 100
 
-report_data = Ruport::Data::Table.new :column_names => ["Url", "Error count"]
+report_data = Ruport::Data::Table.new :column_names => ["Url", "Error"]
 
 sc.crawl { |document|
 
@@ -15,8 +20,11 @@ sc.crawl { |document|
   raakt = Raakt::Test.new(document.data)
   result = raakt.all
   puts "#{result.length}\t#{document.uri}"
+
   if result.length > 0
-    report_data << [document.uri, result.length]
+    for error in result
+      report_data << [document.uri, error.eid.to_s]
+    end
   end
 }
 
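
With the report table now holding one row per Raakt error id rather than a per-page count, a natural last step is rendering it. A minimal sketch, assuming Ruport's standard table formatters (to_csv and to_html work the same way as to_text):

# After the crawl completes, render the collected rows as a text table.
puts report_data.to_text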
examples/find_pdfs.rb ADDED
@@ -0,0 +1,20 @@
+# == Find PDF documents - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used to find documents of a specific type on a website.
+#
+require '../lib/simplecrawler.rb'
+require 'raakt'
+require 'ruport'
+
+# Set up a new crawler
+sc = SimpleCrawler::Crawler.new(ARGV[0])
+sc.maxcount = 200 # Only crawl 200 pages
+
+sc.crawl { |document|
+
+  if document.headers["content-type"] == "application/pdf"
+    puts document.uri
+  end
+
+}
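
The content-type comparison above is an exact string match, so a header like "application/pdf;charset=ISO-8859-1" would slip through. A small variation under the same Document API (the WANTED_TYPES list is a hypothetical addition) that strips any parameter suffix and matches several types:

require '../lib/simplecrawler.rb'

# Hypothetical list of MIME types to report.
WANTED_TYPES = ["application/pdf", "application/msword"]

sc = SimpleCrawler::Crawler.new(ARGV[0])
sc.maxcount = 200

sc.crawl { |document|
  next unless document.headers   # skip documents that failed to fetch
  # Drop any ";charset=..." parameter before comparing.
  mime = document.headers["content-type"].to_s.split(";")[0]
  puts "#{mime}\t#{document.uri}" if WANTED_TYPES.include?(mime)
}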
lib/document.rb ADDED
@@ -0,0 +1,16 @@
+module SimpleCrawler
+  class Document
+    attr_accessor :uri, :data, :headers, :fetched_at, :http_status
+
+    def to_s
+      puts "Document"
+      puts " .uri:\t\t#{uri}"
+      puts " .fetched_at:\t#{fetched_at}"
+      puts " .headers:"
+      for header in headers
+        puts " #{header[0]}: #{header[1]}"
+      end
+      puts " .data.length:\t#{(data.length)}"
+    end
+  end
+end
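
A quick sketch of the extracted class in isolation (all field values here are hypothetical). Note that to_s writes its report with puts rather than returning a string, so it is called for its side effect:

require 'simplecrawler'

doc = SimpleCrawler::Document.new
doc.uri = "http://example.com/"                  # hypothetical values
doc.data = "<html><body>Hi</body></html>"
doc.headers = { "content-type" => "text/html" }
doc.fetched_at = Time.now

doc.to_s   # prints .uri, .fetched_at, each header and .data.length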
lib/simplecrawler.rb CHANGED
@@ -20,28 +20,10 @@ module SimpleCrawler
 require 'rubygems'
 require 'hpricot'
 require 'open-uri'
+require File.dirname(__FILE__) + '/document'
 
 MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
-
-VERSION = "0.1.0"
-
-class Document
-  attr_accessor :uri, :data, :headers, :fetched_at
-
-  def to_s
-    puts "Document"
-    puts " .uri:\t\t#{uri}"
-    puts " .fetched_at:\t#{fetched_at}"
-    puts " .headers:"
-    for header in headers
-      puts " #{header[0]}: #{header[1]}"
-    end
-    puts " .data.length:\t#{(data.length)}"
-  end
-end
-
-
-
+VERSION = "0.1.1"
 
 class Crawler
 
@@ -58,6 +40,11 @@ module SimpleCrawler
 end
 
 
+# Override this method for your own logging needs.
+def log(message)
+  puts message
+end
+
 # Check if a path should be ignored because it matches a skip pattern or is already visited.
 def skip_uri?(uri)
 
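
A minimal sketch of using this hook (the subclass name and the Logger destination are assumptions):

require 'logger'
require 'simplecrawler'

class LoggingCrawler < SimpleCrawler::Crawler
  # Route crawler messages to a log file instead of stdout.
  def log(message)
    @logger ||= Logger.new("crawl.log")
    @logger.info(message)
  end
end

sc = LoggingCrawler.new("http://example.com/")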
@@ -119,6 +106,8 @@ module SimpleCrawler
 uri.path = uri.path + path if path != "/"
 doc.uri = uri
 
+log("Trying #{uri}")
+
 file = open(uri)
 
 mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
@@ -132,8 +121,8 @@
 doc.headers = file.meta
 doc.fetched_at = Time.now
 rescue Exception
-  puts "Error: #{$!}"
-  return ""
+  log("Error fetching [#{uri}]: #{$!}")
+  return doc
 end
 return doc
 end
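
Since a failed fetch now hands back the partially populated Document (data and headers still nil) instead of an empty string, crawl blocks that touch document.data may want a guard. A small sketch:

sc.crawl { |document|
  # document.data is nil when the fetch raised and was rescued above.
  next if document.data.nil?
  puts "#{document.data.length}\t#{document.uri}"
}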
tests/simplecrawler_test.rb CHANGED
@@ -98,4 +98,5 @@ class SimpleCrawlerTest < Test::Unit::TestCase
     @simplecrawler.queue_local_links(doc)
     assert_equal 1, @simplecrawler.queue.length
   end
+
 end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
 specification_version: 1
 name: simplecrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2007-08-27 00:00:00 +02:00
+  version: 0.1.1
+date: 2007-08-30 00:00:00 +02:00
 summary: A generic library for web crawling.
 require_paths:
 - lib
@@ -30,10 +30,12 @@ authors:
 - Peter Krantz
 files:
 - README
+- lib/document.rb
 - lib/simplecrawler.rb
 - tests/simplecrawler_test.rb
 - examples/accessibility_report.rb
 - examples/crawl.rb
+- examples/find_pdfs.rb
 - examples/list_site_links.rb
 test_files:
 - tests/simplecrawler_test.rb