crawler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /doc/*
+ /log/*
+ /pkg/*
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.2.0
data/bin/crawler ADDED
@@ -0,0 +1,37 @@
+ #! /usr/bin/ruby
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'rubygems'
+ require 'crawler'
+ require 'optparse'
+
+ options = {}
+ logfile = ""
+ optparser = OptionParser.new do |opts|
+   opts.on("-t", "--timeout X", Integer, "Timeout limit in seconds") { |x| options[:timeout] = x }
+   opts.on("-x", "--exclude path", Array, "List of paths to be excluded") { |x| options[:exclude] = x }
+   opts.on("-l", "--log file", String, "Filename to use as a log") { |log| logfile = log }
+   opts.parse!(ARGV)
+ end
+
+ unless logfile.empty?
+   log = File.new(logfile, "w")
+ else
+   log = $stdout
+ end
+
+
+ uri_string = ARGV[0]
+ begin
+   uri = URI.parse(uri_string)
+   raise unless uri.is_a?(URI::HTTP)
+ rescue
+   puts "Error parsing URI: #{uri_string}"
+   Process.exit
+ end
+
+ crawler = Crawler::Webcrawler.new(options)
+ observer = Crawler::Observer.new(log)
+
+ crawler.add_observer(observer)
+
+ crawler.crawl(uri)
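For reference, a crawl started from this executable looks something like: crawler --timeout 60 --exclude /private/ --log crawl.log http://example.com/ (the host, path and filenames here are only placeholders, not part of the gem).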
data/lib/crawler.rb ADDED
@@ -0,0 +1,2 @@
+ require 'crawler/webcrawler'
+ require 'crawler/observer'
data/lib/crawler/observer.rb ADDED
@@ -0,0 +1,22 @@
+ module Crawler
+
+   # Observer watches a Webcrawler and outputs messages to a log object. This defaults to STDOUT but may be anything which responds to +puts+.
+   class Observer
+
+     # Log object. Must respond to +puts+.
+     attr_accessor :log
+
+     # Creates a new Observer object
+     def initialize(log=$stdout)
+       @log = log
+     end
+
+     # Called by the Observable module through Webcrawler.
+     def update(response, url)
+       @log.puts "Scanning: " + url.to_s
+       if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
+         @log.puts "#{response.code} encountered for " + url.to_s
+       end
+     end
+   end
+ end
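The log object documented above only needs to respond to puts, so anything from $stdout to an in-memory buffer works. A minimal sketch, assuming the gem is installed and using a placeholder URL:

    require 'rubygems'
    require 'stringio'
    require 'crawler'

    buffer = StringIO.new                     # collects messages instead of printing them
    observer = Crawler::Observer.new(buffer)  # an open File object would work the same way

    crawler = Crawler::Webcrawler.new
    crawler.add_observer(observer)
    crawler.crawl(URI.parse("http://example.com/"))  # placeholder host

    puts buffer.string                        # everything the observer logged during the crawl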
data/lib/crawler/webcrawler.rb ADDED
@@ -0,0 +1,74 @@
+ require 'set'
+ require 'observer'
+ require 'net/http'
+ require 'nokogiri'
+ require 'timeout'
+
+ module Crawler
+   class Webcrawler
+
+     include Observable
+
+     # Set of all URIs which have been crawled
+     attr_accessor :crawled
+     # Queue of URIs to be crawled. Array which acts as a FIFO queue.
+     attr_accessor :queue
+     # Hash of options
+     attr_accessor :options
+
+     # Accepts the following options:
+     # * timeout -- Time limit for the crawl operation, after which a Timeout::Error exception is raised.
+     def initialize(options={})
+       @crawled = Set.new
+       @queue = []
+       @options = {
+         :timeout => 1.0/0, # Infinity
+         :external => false,
+         :exclude => []
+       }.merge(options)
+
+     end
+
+     # Given a URI object, the crawler will explore every linked page recursively using the Breadth First Search algorithm.
+     def crawl(start_uri)
+       start_uri = start_uri.normalize
+       @queue << start_uri
+
+       timeout(@options[:timeout]) {
+         while(uri = @queue.shift)
+
+           Net::HTTP.start(uri.host, uri.port) do |http|
+
+             head = http.head(uri.path)
+             next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it
+
+             resp = http.get(uri.path)
+
+             changed
+             notify_observers(resp, uri)
+
+             html = Nokogiri.parse(resp.body)
+             a_tags = html.search("a")
+             @queue = @queue + a_tags.collect do |t|
+               begin
+                 next_uri = uri + t.attribute("href").to_s.strip
+               rescue
+                 nil
+               end
+             end
+             @queue = @queue.compact.uniq
+             @queue = @queue.reject {|u|
+               @crawled.include?(u) or
+               u == uri or
+               !(u.kind_of?(URI::HTTP)) or
+               (u.host != uri.host and !@options[:external]) or
+               (@options[:exclude].any? { |excl| u.path.include?(excl) })
+             }
+           end
+           @crawled << uri
+         end
+       }
+
+     end
+   end
+ end
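Tying the options above together, a crawl with a time limit and an exclusion list could be set up as follows; this is an illustrative sketch (host and paths are placeholders), not code shipped with the gem:

    require 'rubygems'
    require 'crawler'

    crawler = Crawler::Webcrawler.new(
      :timeout  => 300,            # raise Timeout::Error if the whole crawl exceeds 5 minutes
      :exclude  => ["/private/"],  # skip URIs whose path contains this substring
      :external => false           # default: never leave the starting host
    )
    crawler.add_observer(Crawler::Observer.new)  # logs to $stdout by default

    crawler.crawl(URI.parse("http://example.com/"))  # placeholder start page
    # crawler.crawled now holds a Set of every URI that was fetched,
    # and crawler.queue is empty.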
data/spec/crawler/crawler_spec.rb ADDED
@@ -0,0 +1,136 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+
+ module Crawler
+   describe Webcrawler do
+
+     before(:all) do
+       @uri_base = 'http://localhost:12000/'
+       www_root = File.join(File.dirname(__FILE__), '..', 'fixtures')
+       @server = Thread.new do
+         s = WEBrick::HTTPServer.new({:Port => 12000, :DocumentRoot => www_root, :AccessLog => []})
+         @port = s.config[:Port]
+         begin
+           s.start
+         ensure
+           s.shutdown
+         end
+       end
+     end
+
+     after(:all) do
+       @server.exit
+     end
+
+     context "before crawl" do
+       it "should have an empty crawl list" do
+         crawler = Webcrawler.new
+         crawler.crawled.should be_empty
+       end
+     end
+
+     context "during a crawl" do
+
+       before(:each) do
+         @crawler = Webcrawler.new
+         @obs = mock("observer", :update => nil, :null_object => true)
+         #@obs = Observer.new
+         @crawler.add_observer(@obs)
+       end
+
+       it "should send notifications" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update)
+         @crawler.crawl(uri)
+       end
+
+       it "should send status code and URL" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), kind_of(URI))
+         @crawler.crawl(uri)
+       end
+
+       it "should send 404 for missing URL" do
+         uri = URI.parse(@uri_base + 'doesnotexist.html')
+         @obs.should_receive(:update).with(instance_of(Net::HTTPNotFound), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should not crawl a page more than once" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html').once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html')
+         @crawler.crawl(uri)
+       end
+
+       it "should not add the current page to the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri).once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should remove nil items from the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).twice
+         @crawler.crawl(uri)
+       end
+
+       it "should convert any exceptions to nil" do
+         uri = URI.parse(@uri_base + 'messed-up.html')
+         lambda { @crawler.crawl(uri) }.should_not raise_error
+       end
+
+       it "should not crawl anything but HTTP web addresses" do
+         uri = URI.parse(@uri_base + 'non-http.html')
+         @obs.should_receive(:update).once
+         @crawler.crawl(uri)
+       end
+
+       it "should not, by default, crawl outside its original host" do
+         uri = URI.parse(@uri_base + 'external.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), URI.parse("http://example.com"))
+         @crawler.crawl(uri)
+       end
+
+       it "should only download HTML content types" do
+         uri = URI.parse(@uri_base + 'non-html.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/pdf.pdf')
+         @crawler.crawl(uri)
+       end
+
+       it "should not download anything in the excluded option" do
+         uri = URI.parse(@uri_base + 'exclusion.html')
+         @crawler.options[:exclude] = ["/excluded/"]
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/excluded/shouldnt-hit.html')
+         @crawler.crawl(uri)
+       end
+
+
+     end
+
+     context "after crawl" do
+       before(:each) do
+         @crawler = Webcrawler.new
+         @uri = URI.parse(@uri_base)
+         @crawler.crawl(@uri)
+       end
+
+       it "should have at least one item in crawled" do
+         @crawler.should have_at_least(1).crawled
+       end
+
+       it "should have put crawled links into crawled" do
+         @crawler.should have_at_least(2).crawled
+       end
+
+       it "should have the children of child pages in crawled" do
+         @crawler.crawled.should include(@uri + "/page4.html")
+       end
+
+       it "should have an empty queue" do
+         @crawler.queue.should be_empty
+       end
+
+     end
+   end
+ end
data/spec/crawler/observer_spec.rb ADDED
@@ -0,0 +1,28 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+ require 'stringio'
+
+ module Crawler
+   describe Observer do
+
+     def test_code(code, log, obs)
+       log.should_receive(:puts).with("#{code} encountered for http://example.com/")
+       resp = Net::HTTPResponse::CODE_TO_OBJ["#{code}"].new("1.1", code, "")
+       obs.update(resp, URI.parse("http://example.com/"))
+     end
+
+     it "should output a warning when an error code is reached" do
+       log = double('log', :null_object => true)
+       obs = Observer.new(log)
+       (400..416).each { |code| test_code(code, log, obs) }
+       (500..505).each { |code| test_code(code, log, obs) }
+     end
+
+     it "should not output a warning when 200 is encountered" do
+       log = double('log')
+       obs = Observer.new(log)
+       log.should_not_receive(:puts).with(/\d{3} encountered/)
+       obs.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
+     end
+
+   end
+ end
data/spec/fixtures/excluded/shouldnt-hit.html ADDED
@@ -0,0 +1,6 @@
+ <html>
+ <head>
+ </head>
+ <body>
+ you should never see this
+ </body>
data/spec/fixtures/exclusion.html ADDED
@@ -0,0 +1,7 @@
+ <html>
+ <head>
+ </head>
+ <body>
+ <a href="/excluded/shouldnt-hit.html">nope</a>
+ </body>
+ </html>
data/spec/fixtures/external.html ADDED
@@ -0,0 +1,8 @@
+ <html>
+ <head>
+ <title>External</title>
+ </head>
+ <body>
+ <a href="http://example.com/">example</a>
+ </body>
+ </html>
data/spec/fixtures/index.html ADDED
@@ -0,0 +1,9 @@
+ <html>
+ <head>
+ <title>Test</title>
+ </head>
+ <body>
+ <a href="page2.html">Page 2</a>
+ <a href="page3.html">Page 3</a>
+ </body>
+ </html>
data/spec/fixtures/messed-up.html ADDED
@@ -0,0 +1,7 @@
+ <html>
+ <head></head>
+ <body>
+ <a href=" flange.html">yes</a>
+ <a href="javascript: void(0);">javascript</a>
+ </body>
+ </html>
data/spec/fixtures/non-html.html ADDED
@@ -0,0 +1,7 @@
+ <html>
+ <head>
+ </head>
+ <body>
+ <a href="/pdf.pdf">pdf</a>
+ </body>
+ </html>
data/spec/fixtures/non-http.html ADDED
@@ -0,0 +1,14 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+ <title>non-http</title>
+ </head>
+ <body>
+ <!-- None of the following should be followed -->
+ <a href="mailto:test@example.com">mailto</a>
+ <a href="ftp://ftp.example.com">ftp</a>
+
+ </body>
+ </html>
data/spec/fixtures/page2.html ADDED
@@ -0,0 +1,9 @@
+ <html>
+ <head>
+ <title>test2</title>
+ </head>
+ <body>
+ <a href="/page4.html">Link</a>
+ <a href="/page5.html">Link</a>
+ </body>
+ </html>
data/spec/fixtures/page3.html ADDED
@@ -0,0 +1,8 @@
+ <html>
+ <head>
+ <title>test2</title>
+ </head>
+ <body>
+ <a href="/page5.html">Link</a>
+ </body>
+ </html>
data/spec/fixtures/page4.html
File without changes
data/spec/fixtures/page5.html ADDED
@@ -0,0 +1,8 @@
+ <html>
+ <head>
+ <title>something</title>
+ </head>
+ <body>
+ <a href="/">Whatever</a>
+ </body>
+ </html>
data/spec/fixtures/pdf.pdf
File without changes
data/spec/fixtures/self-reference.html ADDED
@@ -0,0 +1,13 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+ <head>
+ <title>self-reference</title>
+ </head>
+ <body>
+ <a href="self-reference.html">link</a> <!-- will be converted to nil -->
+ <a href="page5.html">link</a>
+
+ </body>
+ </html>
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,4 @@
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'spec'
+ require 'crawler'
+ require 'WEBrick'
metadata ADDED
@@ -0,0 +1,87 @@
+ --- !ruby/object:Gem::Specification
+ name: crawler
+ version: !ruby/object:Gem::Version
+   version: 0.2.0
+ platform: ruby
+ authors:
+ - Tyler Cunnion
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-01-25 00:00:00 -05:00
+ default_executable: crawler
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ description: BFS webcrawler that implements Observable
+ email: tyler.cunnion@gmail.com
+ executables:
+ - crawler
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - .gitignore
+ - VERSION
+ - bin/crawler
+ - lib/crawler.rb
+ - lib/crawler/observer.rb
+ - lib/crawler/webcrawler.rb
+ - spec/crawler/crawler_spec.rb
+ - spec/crawler/observer_spec.rb
+ - spec/fixtures/excluded/shouldnt-hit.html
+ - spec/fixtures/exclusion.html
+ - spec/fixtures/external.html
+ - spec/fixtures/index.html
+ - spec/fixtures/messed-up.html
+ - spec/fixtures/non-html.html
+ - spec/fixtures/non-http.html
+ - spec/fixtures/page2.html
+ - spec/fixtures/page3.html
+ - spec/fixtures/page4.html
+ - spec/fixtures/page5.html
+ - spec/fixtures/pdf.pdf
+ - spec/fixtures/self-reference.html
+ - spec/spec_helper.rb
+ has_rdoc: true
+ homepage: http://github.com/tylercunnion/crawler
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --charset=UTF-8
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Simple webcrawler
+ test_files:
+ - spec/crawler/crawler_spec.rb
+ - spec/crawler/observer_spec.rb
+ - spec/spec_helper.rb