simplecrawler 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
examples/accessibility_report.rb CHANGED
@@ -1,3 +1,8 @@
+# == Basic accessibility report - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used together with Raakt and Ruport to check basic accessibility of an entire website. For details on the error message id:s generated in the report see http://www.peterkrantz.com/raakt/wiki/error-message-ids
+#
 require '../lib/simplecrawler.rb'
 require 'raakt'
 require 'ruport'
@@ -5,9 +10,9 @@ require 'ruport'
 # Set up a new crawler
 sc = SimpleCrawler::Crawler.new(ARGV[0])
 sc.skip_patterns = ["\\.doc$", "\\.pdf$", "\\.xls$", "\\.pdf$", "\\.zip$"]
-sc.maxcount = 10
+sc.maxcount = 100
 
-report_data = Ruport::Data::Table.new :column_names => ["Url", "Error count"]
+report_data = Ruport::Data::Table.new :column_names => ["Url", "Error"]
 
 sc.crawl { |document|
 
@@ -15,8 +20,11 @@ sc.crawl { |document|
   raakt = Raakt::Test.new(document.data)
   result = raakt.all
   puts "#{result.length}\t#{document.uri}"
+
   if result.length > 0
-    report_data << [document.uri, result.length]
+    for error in result
+      report_data << [document.uri, error.eid.to_s]
+    end
   end
 }
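The hunks above change the report from one row per page to one row per Raakt error, but they stop before the part of the example that outputs report_data. A minimal sketch of rendering the finished table, assuming the installed Ruport version provides the usual formatter shortcuts on Ruport::Data::Table (to_text here; to_csv or to_html would work the same way):

    # After sc.crawl has finished, print the collected accessibility
    # errors as a plain-text table using Ruport's text formatter.
    puts report_data.to_text
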
examples/find_pdfs.rb ADDED
@@ -0,0 +1,20 @@
+# == Find PDF documents - SimpleCrawler example
+# Author:: Peter Krantz (http://www.peterkrantz.com)
+#
+# This is an example of how SimpleCrawler can be used to find dcuments of a specific type on a website.
+#
+require '../lib/simplecrawler.rb'
+require 'raakt'
+require 'ruport'
+
+# Set up a new crawler
+sc = SimpleCrawler::Crawler.new(ARGV[0])
+sc.maxcount = 200 #Only crawl 200 pages
+
+sc.crawl { |document|
+
+  if document.headers["content-type"] == "application/pdf"
+    puts document.uri
+  end
+
+}
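A caveat on the content-type test in the new example: lib/simplecrawler.rb (see the hunks below) splits the header on ";" because servers may append parameters such as a charset, while the equality check above would miss a value like "application/pdf; charset=binary". A small, hypothetical variation of the block that tolerates such parameters:

    sc.crawl { |document|
      # Keep only the media type, dropping any ";"-separated parameters,
      # mirroring the split(";") done inside the crawler itself.
      content_type = (document.headers["content-type"] || "").split(";")[0]
      puts document.uri if content_type == "application/pdf"
    }
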
lib/document.rb ADDED
@@ -0,0 +1,16 @@
+module SimpleCrawler
+  class Document
+    attr_accessor :uri, :data, :headers, :fetched_at, :http_status
+
+    def to_s
+      puts "Document"
+      puts " .uri:\t\t#{uri}"
+      puts " .fetched_at:\t#{fetched_at}"
+      puts " .headers:"
+      for header in headers
+        puts " #{header[0]}: #{header[1]}"
+      end
+      puts " .data.length:\t#{(data.length)}"
+    end
+  end
+end
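Apart from moving into its own file, the Document class gains an http_status accessor (nothing in these hunks shows it being populated yet, so treat it as reserved). Its to_s helper prints the fetched metadata straight to stdout rather than returning a string, so a crawl block can simply call it to dump each page; a sketch:

    sc.crawl { |document|
      # Prints .uri, .fetched_at, the response headers and .data.length
      # for every document the crawler yields.
      document.to_s
    }
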
lib/simplecrawler.rb CHANGED
@@ -20,28 +20,10 @@ module SimpleCrawler
 require 'rubygems'
 require 'hpricot'
 require 'open-uri'
+require File.dirname(__FILE__) + '/document'
 
 MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
-
-VERSION = "0.1.0"
-
-class Document
-  attr_accessor :uri, :data, :headers, :fetched_at
-
-  def to_s
-    puts "Document"
-    puts " .uri:\t\t#{uri}"
-    puts " .fetched_at:\t#{fetched_at}"
-    puts " .headers:"
-    for header in headers
-      puts " #{header[0]}: #{header[1]}"
-    end
-    puts " .data.length:\t#{(data.length)}"
-  end
-end
-
-
-
+VERSION = "0.1.1"
 
 class Crawler
 
@@ -58,6 +40,11 @@ module SimpleCrawler
 end
 
 
+# Override this method for your own logging needs.
+def log(message)
+  puts message
+end
+
 # Check if a path should be ignored because it matches a skip pattern or is already visited.
 def skip_uri?(uri)
 
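The comment on the new log method invites overriding it. A minimal sketch of doing that via a subclass (the class name and Logger destination are illustrative, not part of the gem), reusing the require style from the bundled examples:

    require 'rubygems'
    require 'logger'
    require '../lib/simplecrawler.rb'

    class FileLoggingCrawler < SimpleCrawler::Crawler
      CRAWL_LOG = Logger.new("crawl.log")

      # Route crawler messages (e.g. the "Trying ..." line added below)
      # to a file instead of stdout.
      def log(message)
        CRAWL_LOG.info(message)
      end
    end

    sc = FileLoggingCrawler.new(ARGV[0])
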
@@ -119,6 +106,8 @@ module SimpleCrawler
 uri.path = uri.path + path if path != "/"
 doc.uri = uri
 
+log("Trying #{uri}")
+
 file = open(uri)
 
 mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]
@@ -132,8 +121,8 @@ module SimpleCrawler
 doc.headers = file.meta
 doc.fetched_at = Time.now
 rescue Exception
-  puts "Error: #{$!}"
-  return ""
+  log("Error fetching [#{uri}]: #{$!}")
+  return doc
 end
 return doc
 end
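Note that the rescue branch now returns the partially populated Document instead of an empty string, so a document whose fetch failed can reach the caller with data still nil. Whether such documents are ever passed to the crawl block depends on code outside this diff, but a defensive block can guard against it; a sketch under that assumption:

    sc.crawl { |document|
      # Skip documents whose fetch raised and left .data unset (an
      # assumption about 0.1.1 behaviour, not shown in this diff).
      next if document.data.nil?
      puts "#{document.uri}\t#{document.data.length} bytes"
    }
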
tests/simplecrawler_test.rb CHANGED
@@ -98,4 +98,5 @@ class SimpleCrawlerTest < Test::Unit::TestCase
 @simplecrawler.queue_local_links(doc)
 assert_equal 1, @simplecrawler.queue.length
 end
+
 end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
 specification_version: 1
 name: simplecrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2007-08-27 00:00:00 +02:00
+  version: 0.1.1
+date: 2007-08-30 00:00:00 +02:00
 summary: A generic library for web crawling.
 require_paths:
 - lib
@@ -30,10 +30,12 @@ authors:
 - Peter Krantz
 files:
 - README
+- lib/document.rb
 - lib/simplecrawler.rb
 - tests/simplecrawler_test.rb
 - examples/accessibility_report.rb
 - examples/crawl.rb
+- examples/find_pdfs.rb
 - examples/list_site_links.rb
 test_files:
 - tests/simplecrawler_test.rb