crawler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/VERSION +1 -0
- data/bin/crawler +37 -0
- data/lib/crawler.rb +2 -0
- data/lib/crawler/observer.rb +22 -0
- data/lib/crawler/webcrawler.rb +74 -0
- data/spec/crawler/crawler_spec.rb +136 -0
- data/spec/crawler/observer_spec.rb +28 -0
- data/spec/fixtures/excluded/shouldnt-hit.html +6 -0
- data/spec/fixtures/exclusion.html +7 -0
- data/spec/fixtures/external.html +8 -0
- data/spec/fixtures/index.html +9 -0
- data/spec/fixtures/messed-up.html +7 -0
- data/spec/fixtures/non-html.html +7 -0
- data/spec/fixtures/non-http.html +14 -0
- data/spec/fixtures/page2.html +9 -0
- data/spec/fixtures/page3.html +8 -0
- data/spec/fixtures/page4.html +0 -0
- data/spec/fixtures/page5.html +8 -0
- data/spec/fixtures/pdf.pdf +0 -0
- data/spec/fixtures/self-reference.html +13 -0
- data/spec/spec_helper.rb +4 -0
- metadata +87 -0
data/.gitignore
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.2.0
data/bin/crawler
ADDED
@@ -0,0 +1,37 @@
+#! /usr/bin/ruby
+$LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+require 'rubygems'
+require 'crawler'
+require 'optparse'
+
+options = {}
+logfile = ""
+optparser = OptionParser.new do |opts|
+  opts.on("-t", "--timeout X", Integer, "Timeout limit in seconds") { |x| options[:timeout] = x }
+  opts.on("-x", "--exclude path", Array, "List of paths to be excluded") { |x| options[:exclude] = x }
+  opts.on("-l", "--log file", String, "Filename to use as a log") { |log| logfile = log }
+  opts.parse!(ARGV)
+end
+
+unless logfile.empty?
+  log = File.new(logfile, "w")
+else
+  log = $stdout
+end
+
+
+uri_string = ARGV[0]
+begin
+  uri = URI.parse(uri_string)
+  raise unless uri.is_a?(URI::HTTP)
+rescue
+  puts "Error parsing URI: #{uri_string}"
+  Process.exit
+end
+
+crawler = Crawler::Webcrawler.new(options)
+observer = Crawler::Observer.new(log)
+
+crawler.add_observer(observer)
+
+crawler.crawl(uri)
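
The executable above is only a thin wrapper around the library: parse options, open a log, then hand everything to Crawler::Webcrawler. A minimal sketch of the same flow in plain Ruby, assuming the gem is installed and the target URL is reachable (the URL and log filename here are placeholders):

    require 'rubygems'
    require 'crawler'
    require 'uri'

    log = File.new("crawl.log", "w")               # anything that responds to puts works
    crawler = Crawler::Webcrawler.new(:timeout => 60)
    observer = Crawler::Observer.new(log)
    crawler.add_observer(observer)

    crawler.crawl(URI.parse("http://example.com/"))
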
data/lib/crawler.rb
ADDED

data/lib/crawler/observer.rb
ADDED
@@ -0,0 +1,22 @@
+module Crawler
+
+  # Observer watches a Webcrawler and outputs messages to a log object. This defaults to STDOUT but may be anything which responds to +puts+.
+  class Observer
+
+    # Log object. Must respond to +puts+.
+    attr_accessor :log
+
+    # Creates a new Observer object
+    def initialize(log=$stdout)
+      @log = log
+    end
+
+    # Called by the Observable module through Webcrawler.
+    def update(response, url)
+      @log.puts "Scanning: " + url.to_s
+      if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
+        @log.puts "#{response.code} encountered for " + url.to_s
+      end
+    end
+  end
+end
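
Observer's log only has to respond to puts, so output can be captured in memory just as easily as written to a file or STDOUT. A small sketch using StringIO; the response objects are built by hand purely for illustration, the same way the observer spec below constructs them:

    require 'stringio'
    require 'net/http'
    require 'uri'
    require 'crawler'

    buffer = StringIO.new
    observer = Crawler::Observer.new(buffer)

    observer.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
    observer.update(Net::HTTPNotFound.new("1.1", "404", ""), URI.parse("http://example.com/missing"))

    puts buffer.string
    # Scanning: http://example.com/
    # Scanning: http://example.com/missing
    # 404 encountered for http://example.com/missing
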
data/lib/crawler/webcrawler.rb
ADDED
@@ -0,0 +1,74 @@
+require 'set'
+require 'observer'
+require 'net/http'
+require 'nokogiri'
+require 'timeout'
+
+module Crawler
+  class Webcrawler
+
+    include Observable
+
+    # Set of all URIs which have been crawled
+    attr_accessor :crawled
+    # Queue of URIs to be crawled. Array which acts as a FIFO queue.
+    attr_accessor :queue
+    # Hash of options
+    attr_accessor :options
+
+    # Accepts the following options:
+    # * timeout -- Time limit for the crawl operation, after which a Timeout::Error exception is raised.
+    def initialize(options={})
+      @crawled = Set.new
+      @queue = []
+      @options = {
+        :timeout => 1.0/0, # Infinity
+        :external => false,
+        :exclude => []
+      }.merge(options)
+
+    end
+
+    # Given a URI object, the crawler will explore every linked page recursively using the Breadth First Search algorithm.
+    def crawl(start_uri)
+      start_uri = start_uri.normalize
+      @queue << start_uri
+
+      timeout(@options[:timeout]) {
+        while(uri = @queue.shift)
+
+          Net::HTTP.start(uri.host, uri.port) do |http|
+
+            head = http.head(uri.path)
+            next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it
+
+            resp = http.get(uri.path)
+
+            changed
+            notify_observers(resp, uri)
+
+            html = Nokogiri.parse(resp.body)
+            a_tags = html.search("a")
+            @queue = @queue + a_tags.collect do |t|
+              begin
+                next_uri = uri + t.attribute("href").to_s.strip
+              rescue
+                nil
+              end
+            end
+            @queue = @queue.compact.uniq
+            @queue = @queue.reject {|u|
+              @crawled.include?(u) or
+              u == uri or
+              !(u.kind_of?(URI::HTTP)) or
+              (u.host != uri.host and !@options[:external]) or
+              (@options[:exclude].any? { |excl| u.path.include?(excl) })
+            }
+          end
+          @crawled << uri
+        end
+      }
+
+    end
+  end
+end
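
The three options recognized by the constructor are all visible in the defaults hash above: :timeout bounds the whole crawl, :external controls whether links leaving the starting host are followed, and :exclude drops any URI whose path contains one of the given substrings. A hedged sketch of a crawl using all three (the host, path prefix, and limits are illustrative):

    require 'crawler'
    require 'uri'

    crawler = Crawler::Webcrawler.new(
      :timeout  => 120,            # seconds before Timeout::Error is raised
      :external => false,          # stay on the starting host (the default)
      :exclude  => ["/private/"]   # skip URIs whose path contains this substring
    )
    crawler.add_observer(Crawler::Observer.new)

    begin
      crawler.crawl(URI.parse("http://example.com/"))
    rescue Timeout::Error
      # the time limit was hit; everything visited so far is still in crawler.crawled
    end

    crawler.crawled.each { |uri| puts uri }
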
data/spec/crawler/crawler_spec.rb
ADDED
@@ -0,0 +1,136 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+
+module Crawler
+  describe Webcrawler do
+
+    before(:all) do
+      @uri_base = 'http://localhost:12000/'
+      www_root = File.join(File.dirname(__FILE__), '..', 'fixtures')
+      @server = Thread.new do
+        s = WEBrick::HTTPServer.new({:Port => 12000, :DocumentRoot => www_root, :AccessLog => []})
+        @port = s.config[:Port]
+        begin
+          s.start
+        ensure
+          s.shutdown
+        end
+      end
+    end
+
+    after(:all) do
+      @server.exit
+    end
+
+    context "before crawl" do
+      it "should have an empty crawl list" do
+        crawler = Webcrawler.new
+        crawler.crawled.should be_empty
+      end
+    end
+
+    context "during a crawl" do
+
+      before(:each) do
+        @crawler = Webcrawler.new
+        @obs = mock("observer", :update => nil, :null_object => true)
+        #@obs = Observer.new
+        @crawler.add_observer(@obs)
+      end
+
+      it "should send notifications" do
+        uri = URI.parse(@uri_base)
+        @obs.should_receive(:update)
+        @crawler.crawl(uri)
+      end
+
+      it "should send status code and URL" do
+        uri = URI.parse(@uri_base)
+        @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), kind_of(URI))
+        @crawler.crawl(uri)
+      end
+
+      it "should send 404 for missing URL" do
+        uri = URI.parse(@uri_base + 'doesnotexist.html')
+        @obs.should_receive(:update).with(instance_of(Net::HTTPNotFound), uri)
+        @crawler.crawl(uri)
+      end
+
+      it "should not crawl a page more than once" do
+        uri = URI.parse(@uri_base)
+        @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html').once
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html')
+        @crawler.crawl(uri)
+      end
+
+      it "should not add the current page to the queue" do
+        uri = URI.parse(@uri_base + "self-reference.html")
+        @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri).once
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri)
+        @crawler.crawl(uri)
+      end
+
+      it "should remove nil items from the queue" do
+        uri = URI.parse(@uri_base + "self-reference.html")
+        @obs.should_receive(:update).twice
+        @crawler.crawl(uri)
+      end
+
+      it "should convert any exceptions to nil" do
+        uri = URI.parse(@uri_base + 'messed-up.html')
+        lambda { @crawler.crawl(uri) }.should_not raise_error
+      end
+
+      it "should not crawl anything but HTTP web addresses" do
+        uri = URI.parse(@uri_base + 'non-http.html')
+        @obs.should_receive(:update).once
+        @crawler.crawl(uri)
+      end
+
+      it "should not, by default, crawl outside its original host" do
+        uri = URI.parse(@uri_base + 'external.html')
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), URI.parse("http://example.com"))
+        @crawler.crawl(uri)
+      end
+
+      it "should only download HTML content types" do
+        uri = URI.parse(@uri_base + 'non-html.html')
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/pdf.pdf')
+        @crawler.crawl(uri)
+      end
+
+      it "should not download anything in the excluded option" do
+        uri = URI.parse(@uri_base + 'exclusion.html')
+        @crawler.options[:exclude] = ["/excluded/"]
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/excluded/shouldnt-hit.html')
+        @crawler.crawl(uri)
+      end
+
+
+    end
+
+    context "after crawl" do
+      before(:each) do
+        @crawler = Webcrawler.new
+        @uri = URI.parse(@uri_base)
+        @crawler.crawl(@uri)
+      end
+
+      it "should have at least one item in crawled" do
+        @crawler.should have_at_least(1).crawled
+      end
+
+      it "should have put crawled links into crawled" do
+        @crawler.should have_at_least(2).crawled
+      end
+
+      it "should have the children of child pages in crawled" do
+        @crawler.crawled.should include(@uri + "/page4.html")
+      end
+
+      it "should have an empty queue" do
+        @crawler.queue.should be_empty
+      end
+
+    end
+  end
+end
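
The spec above drives the crawler with a mock in place of Crawler::Observer; because notifications go through Ruby's Observable module, any object with an update(response, uri) method can be attached the same way. A sketch of a custom observer that just collects the URLs that came back with error codes (the class is illustrative, not part of the gem):

    require 'crawler'
    require 'net/http'
    require 'uri'

    class BrokenLinkCollector
      attr_reader :broken

      def initialize
        @broken = []
      end

      # Same signature Webcrawler uses in notify_observers(resp, uri).
      def update(response, uri)
        if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
          @broken << [response.code, uri.to_s]
        end
      end
    end

    collector = BrokenLinkCollector.new
    crawler = Crawler::Webcrawler.new
    crawler.add_observer(collector)
    crawler.crawl(URI.parse("http://example.com/"))

    collector.broken.each { |code, url| puts "#{code} #{url}" }
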
data/spec/crawler/observer_spec.rb
ADDED
@@ -0,0 +1,28 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+require 'stringio'
+
+module Crawler
+  describe Observer do
+
+    def test_code(code, log, obs)
+      log.should_receive(:puts).with("#{code} encountered for http://example.com/")
+      resp = Net::HTTPResponse::CODE_TO_OBJ["#{code}"].new("1.1", code, "")
+      obs.update(resp, URI.parse("http://example.com/"))
+    end
+
+    it "should output a warning when an error code is reached" do
+      log = double('log', :null_object => true)
+      obs = Observer.new(log)
+      (400..416).each { |code| test_code(code, log, obs) }
+      (500..505).each { |code| test_code(code, log, obs) }
+    end
+
+    it "should not output a warning when 200 is encountered" do
+      log = double('log')
+      obs = Observer.new(log)
+      log.should_not_receive(:puts).with(/\d{3} encountered/)
+      obs.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
+    end
+
+  end
+end

data/spec/fixtures/non-http.html
ADDED
@@ -0,0 +1,14 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+  "http://www.w3.org/TR/html4/strict.dtd">
+
+<html lang="en">
+<head>
+  <title>non-http</title>
+</head>
+<body>
+<!-- None of the following should be followed -->
+<a href="mailto:test@example.com">mailto</a>
+<a href="ftp://ftp.example.com">ftp</a>
+
+</body>
+</html>

data/spec/fixtures/page4.html
ADDED
File without changes

data/spec/fixtures/pdf.pdf
ADDED
File without changes

data/spec/fixtures/self-reference.html
ADDED
@@ -0,0 +1,13 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+  "http://www.w3.org/TR/html4/strict.dtd">
+
+<html lang="en">
+<head>
+  <title>self-reference</title>
+</head>
+<body>
+<a href="self-reference.html">link</a> <!-- will be converted to nil -->
+<a href="page5.html">link</a>
+
+</body>
+</html>
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,87 @@
+--- !ruby/object:Gem::Specification
+name: crawler
+version: !ruby/object:Gem::Version
+  version: 0.2.0
+platform: ruby
+authors:
+- Tyler Cunnion
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2010-01-25 00:00:00 -05:00
+default_executable: crawler
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description: BFS webcrawler that implements Observable
+email: tyler.cunnion@gmail.com
+executables:
+- crawler
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- .gitignore
+- VERSION
+- bin/crawler
+- lib/crawler.rb
+- lib/crawler/observer.rb
+- lib/crawler/webcrawler.rb
+- spec/crawler/crawler_spec.rb
+- spec/crawler/observer_spec.rb
+- spec/fixtures/excluded/shouldnt-hit.html
+- spec/fixtures/exclusion.html
+- spec/fixtures/external.html
+- spec/fixtures/index.html
+- spec/fixtures/messed-up.html
+- spec/fixtures/non-html.html
+- spec/fixtures/non-http.html
+- spec/fixtures/page2.html
+- spec/fixtures/page3.html
+- spec/fixtures/page4.html
+- spec/fixtures/page5.html
+- spec/fixtures/pdf.pdf
+- spec/fixtures/self-reference.html
+- spec/spec_helper.rb
+has_rdoc: true
+homepage: http://github.com/tylercunnion/crawler
+licenses: []
+
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Simple webcrawler
+test_files:
+- spec/crawler/crawler_spec.rb
+- spec/crawler/observer_spec.rb
+- spec/spec_helper.rb
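
The gemspec declares nokogiri as the only runtime dependency and installs the crawler executable from bin/. For programmatic use the entry point is the same one the executable loads; a hedged sketch of pulling this release into another project (assuming the gem is available from the registry hosting it):

    # Gemfile
    gem "crawler", "0.2.0"

    # consuming code: require the entry point, then use the classes shown above
    require 'crawler'
    crawler = Crawler::Webcrawler.new(:timeout => 30)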