crawler 0.2

@@ -0,0 +1,24 @@
+ Rakefile
+ bin/crawler
+ features/step_definitions/crawler_steps.rb
+ features/support/env.rb
+ lib/crawler.rb
+ lib/crawler/observer.rb
+ lib/crawler/webcrawler.rb
+ spec/crawler/crawler_spec.rb
+ spec/crawler/observer_spec.rb
+ spec/fixtures/excluded/shouldnt-hit.html
+ spec/fixtures/exclusion.html
+ spec/fixtures/external.html
+ spec/fixtures/index.html
+ spec/fixtures/messed-up.html
+ spec/fixtures/non-html.html
+ spec/fixtures/non-http.html
+ spec/fixtures/page2.html
+ spec/fixtures/page3.html
+ spec/fixtures/page4.html
+ spec/fixtures/page5.html
+ spec/fixtures/pdf.pdf
+ spec/fixtures/self-reference.html
+ spec/spec_helper.rb
+ Manifest
@@ -0,0 +1,10 @@
+ require 'rake'
+ require 'echoe'
+
+ Echoe.new('crawler', '0.2') do |g|
+   g.description = "Simple webcrawler"
+   g.url = "http://github.com/tylercunnion/crawler"
+   g.author = "Tyler Cunnion"
+   g.email = "tyler.cunnion+ruby@gmail.com"
+   g.ignore_pattern = ["tmp/*", "features/*", "log/*"]
+ end
@@ -0,0 +1,37 @@
+ #! /usr/bin/ruby
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'rubygems'
+ require 'crawler'
+ require 'optparse'
+
+ options = {}
+ logfile = ""
+ optparser = OptionParser.new do |opts|
+   opts.on("-t", "--timeout X", Integer, "Timeout limit in seconds") { |x| options[:timeout] = x }
+   opts.on("-x", "--exclude path", Array, "List of paths to be excluded") { |x| options[:exclude] = x }
+   opts.on("-l", "--log file", String, "Filename to use as a log") { |log| logfile = log }
+   opts.parse!(ARGV)
+ end
+
+ unless logfile.empty?
+   log = File.new(logfile, "w")
+ else
+   log = $stdout
+ end
+
+
+ uri_string = ARGV[0]
+ begin
+   uri = URI.parse(uri_string)
+   raise unless uri.is_a?(URI::HTTP)
+ rescue
+   puts "Error parsing URI: #{uri_string}"
+   Process.exit
+ end
+
+ crawler = Crawler::Webcrawler.new(options)
+ observer = Crawler::Observer.new(log)
+
+ crawler.add_observer(observer)
+
+ crawler.crawl(uri)
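
The executable above is a thin wrapper around the library. For orientation, here is a minimal sketch of the equivalent programmatic usage; it is not part of the gem, and the option values and the URL http://example.com/ are placeholders:

    require 'rubygems'
    require 'crawler'

    # Placeholder options: abort after 60 seconds, skip any path containing "/private/".
    options = { :timeout => 60, :exclude => ["/private/"] }

    crawler  = Crawler::Webcrawler.new(options)
    observer = Crawler::Observer.new($stdout)   # same default log the script uses when -l is omitted

    crawler.add_observer(observer)
    crawler.crawl(URI.parse("http://example.com/"))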
@@ -0,0 +1,32 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{crawler}
+   s.version = "0.2"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Tyler Cunnion"]
+   s.date = %q{2010-01-25}
+   s.default_executable = %q{crawler}
+   s.description = %q{Simple webcrawler}
+   s.email = %q{tyler.cunnion+ruby@gmail.com}
+   s.executables = ["crawler"]
+   s.extra_rdoc_files = ["bin/crawler", "lib/crawler.rb", "lib/crawler/observer.rb", "lib/crawler/webcrawler.rb"]
+   s.files = ["Rakefile", "bin/crawler", "features/step_definitions/crawler_steps.rb", "features/support/env.rb", "lib/crawler.rb", "lib/crawler/observer.rb", "lib/crawler/webcrawler.rb", "spec/crawler/crawler_spec.rb", "spec/crawler/observer_spec.rb", "spec/fixtures/excluded/shouldnt-hit.html", "spec/fixtures/exclusion.html", "spec/fixtures/external.html", "spec/fixtures/index.html", "spec/fixtures/messed-up.html", "spec/fixtures/non-html.html", "spec/fixtures/non-http.html", "spec/fixtures/page2.html", "spec/fixtures/page3.html", "spec/fixtures/page4.html", "spec/fixtures/page5.html", "spec/fixtures/pdf.pdf", "spec/fixtures/self-reference.html", "spec/spec_helper.rb", "Manifest", "crawler.gemspec"]
+   s.homepage = %q{http://github.com/tylercunnion/crawler}
+   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Crawler"]
+   s.require_paths = ["lib"]
+   s.rubyforge_project = %q{crawler}
+   s.rubygems_version = %q{1.3.5}
+   s.summary = %q{Simple webcrawler}
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+     else
+     end
+   else
+   end
+ end
@@ -0,0 +1,20 @@
+ Given /^the crawl has not begun$/ do
+ end
+
+ When /^I start a crawl with the URI "([^\"]*)"$/ do |arg1|
+   @obs = Crawler::Observer.new
+   @crawler = Crawler::Webcrawler.new
+
+   @crawler.add_observer(@obs)
+
+   @uri = URI.parse(arg1)
+   @crawler.crawl(@uri)
+ end
+
+ Then /^the page should be downloaded$/ do
+   @crawler.crawled.should include(@uri)
+ end
+
+ Then /^the observer should be updated$/ do
+   @obs.should_receive(:update)
+ end
@@ -0,0 +1,2 @@
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "..", "lib")
+ require 'crawler'
@@ -0,0 +1,2 @@
+ require 'crawler/webcrawler'
+ require 'crawler/observer'
@@ -0,0 +1,22 @@
+ module Crawler
+
+   # Observer watches a Webcrawler and outputs messages to a log object. This defaults to STDOUT but may be anything which responds to +puts+.
+   class Observer
+
+     # Log object. Must respond to +puts+.
+     attr_accessor :log
+
+     # Creates a new Observer object
+     def initialize(log=$stdout)
+       @log = log
+     end
+
+     # Called by the Observable module through Webcrawler.
+     def update(response, url)
+       @log.puts "Scanning: " + url.to_s
+       if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
+         @log.puts "#{response.code} encountered for " + url.to_s
+       end
+     end
+   end
+ end
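
Because the log only has to respond to +puts+, the observer can be exercised on its own. The following sketch is illustrative only (it mirrors what spec/crawler/observer_spec.rb does; the URL is a placeholder) and captures the output in a StringIO instead of STDOUT:

    require 'crawler'
    require 'stringio'

    buffer   = StringIO.new                      # any object responding to #puts works as the log
    observer = Crawler::Observer.new(buffer)

    # Feed the observer a hand-built 404 response, as the crawler would for a missing page.
    response = Net::HTTPNotFound.new("1.1", "404", "Not Found")
    observer.update(response, URI.parse("http://example.com/missing.html"))

    puts buffer.string
    # Scanning: http://example.com/missing.html
    # 404 encountered for http://example.com/missing.html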
@@ -0,0 +1,74 @@
+ require 'set'
+ require 'observer'
+ require 'net/http'
+ require 'nokogiri'
+ require 'timeout'
+
+ module Crawler
+   class Webcrawler
+
+     include Observable
+
+     # Set of all URIs which have been crawled
+     attr_accessor :crawled
+     # Queue of URIs to be crawled. Array which acts as a FIFO queue.
+     attr_accessor :queue
+     # Hash of options
+     attr_accessor :options
+
+     # Accepts the following options:
+     # * timeout -- Time limit for the crawl operation, after which a Timeout::Error exception is raised.
+     def initialize(options={})
+       @crawled = Set.new
+       @queue = []
+       @options = {
+         :timeout => 1.0/0, # Infinity
+         :external => false,
+         :exclude => []
+       }.merge(options)
+
+     end
+
+     # Given a URI object, the crawler will explore every linked page recursively using the Breadth First Search algorithm.
+     def crawl(start_uri)
+       start_uri = start_uri.normalize
+       @queue << start_uri
+
+       timeout(@options[:timeout]) {
+         while(uri = @queue.shift)
+
+           Net::HTTP.start(uri.host, uri.port) do |http|
+
+             head = http.head(uri.path)
+             next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it
+
+             resp = http.get(uri.path)
+
+             changed
+             notify_observers(resp, uri)
+
+             html = Nokogiri.parse(resp.body)
+             a_tags = html.search("a")
+             @queue = @queue + a_tags.collect do |t|
+               begin
+                 next_uri = uri + t.attribute("href").to_s.strip
+               rescue
+                 nil
+               end
+             end
+             @queue = @queue.compact.uniq
+             @queue = @queue.reject { |u|
+               @crawled.include?(u) or
+               u == uri or
+               !(u.kind_of?(URI::HTTP)) or
+               (u.host != uri.host and !@options[:external]) or
+               (@options[:exclude].any? { |excl| u.path.include?(excl) })
+             }
+           end
+           @crawled << uri
+         end
+       }
+
+     end
+   end
+ end
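
Putting the options together, here is a hedged sketch of a constrained crawl. The option keys are the ones defined in the defaults hash of #initialize; the values and the start URL http://example.com/ are illustrative only:

    require 'crawler'

    crawler = Crawler::Webcrawler.new(
      :timeout  => 120,             # raise Timeout::Error if the whole crawl takes longer than 120s
      :external => false,           # default: never leave the starting host
      :exclude  => ["/archive/"]    # skip any URI whose path contains this substring
    )

    crawler.crawl(URI.parse("http://example.com/"))

    crawler.crawled  # Set of every URI that was fetched
    crawler.queue    # empty once the crawl completes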
@@ -0,0 +1,136 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+
+ module Crawler
+   describe Webcrawler do
+
+     before(:all) do
+       @uri_base = 'http://localhost:12000/'
+       www_root = File.join(File.dirname(__FILE__), '..', 'fixtures')
+       @server = Thread.new do
+         s = WEBrick::HTTPServer.new({:Port => 12000, :DocumentRoot => www_root, :AccessLog => []})
+         @port = s.config[:Port]
+         begin
+           s.start
+         ensure
+           s.shutdown
+         end
+       end
+     end
+
+     after(:all) do
+       @server.exit
+     end
+
+     context "before crawl" do
+       it "should have an empty crawl list" do
+         crawler = Webcrawler.new
+         crawler.crawled.should be_empty
+       end
+     end
+
+     context "during a crawl" do
+
+       before(:each) do
+         @crawler = Webcrawler.new
+         @obs = mock("observer", :update => nil, :null_object => true)
+         #@obs = Observer.new
+         @crawler.add_observer(@obs)
+       end
+
+       it "should send notifications" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update)
+         @crawler.crawl(uri)
+       end
+
+       it "should send status code and URL" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), kind_of(URI))
+         @crawler.crawl(uri)
+       end
+
+       it "should send 404 for missing URL" do
+         uri = URI.parse(@uri_base + 'doesnotexist.html')
+         @obs.should_receive(:update).with(instance_of(Net::HTTPNotFound), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should not crawl a page more than once" do
+         uri = URI.parse(@uri_base)
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html').once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html')
+         @crawler.crawl(uri)
+       end
+
+       it "should not add the current page to the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri).once
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri)
+         @crawler.crawl(uri)
+       end
+
+       it "should remove nil items from the queue" do
+         uri = URI.parse(@uri_base + "self-reference.html")
+         @obs.should_receive(:update).twice
+         @crawler.crawl(uri)
+       end
+
+       it "should convert any exceptions to nil" do
+         uri = URI.parse(@uri_base + 'messed-up.html')
+         lambda { @crawler.crawl(uri) }.should_not raise_error
+       end
+
+       it "should not crawl anything but HTTP web addresses" do
+         uri = URI.parse(@uri_base + 'non-http.html')
+         @obs.should_receive(:update).once
+         @crawler.crawl(uri)
+       end
+
+       it "should not, by default, crawl outside its original host" do
+         uri = URI.parse(@uri_base + 'external.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), URI.parse("http://example.com"))
+         @crawler.crawl(uri)
+       end
+
+       it "should only download HTML content types" do
+         uri = URI.parse(@uri_base + 'non-html.html')
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/pdf.pdf')
+         @crawler.crawl(uri)
+       end
+
+       it "should not download anything in the excluded option" do
+         uri = URI.parse(@uri_base + 'exclusion.html')
+         @crawler.options[:exclude] = ["/excluded/"]
+         @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/excluded/shouldnt-hit.html')
+         @crawler.crawl(uri)
+       end
+
+
+     end
+
+     context "after crawl" do
+       before(:each) do
+         @crawler = Webcrawler.new
+         @uri = URI.parse(@uri_base)
+         @crawler.crawl(@uri)
+       end
+
+       it "should have at least one item in crawled" do
+         @crawler.should have_at_least(1).crawled
+       end
+
+       it "should have put crawled links into crawled" do
+         @crawler.should have_at_least(2).crawled
+       end
+
+       it "should have the children of child pages in crawled" do
+         @crawler.crawled.should include(@uri + "/page4.html")
+       end
+
+       it "should have an empty queue" do
+         @crawler.queue.should be_empty
+       end
+
+     end
+   end
+ end
@@ -0,0 +1,28 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+ require 'stringio'
+
+ module Crawler
+   describe Observer do
+
+     def test_code(code, log, obs)
+       log.should_receive(:puts).with("#{code} encountered for http://example.com/")
+       resp = Net::HTTPResponse::CODE_TO_OBJ["#{code}"].new("1.1", code, "")
+       obs.update(resp, URI.parse("http://example.com/"))
+     end
+
+     it "should output a warning when an error code is reached" do
+       log = double('log', :null_object => true)
+       obs = Observer.new(log)
+       (400..416).each { |code| test_code(code, log, obs) }
+       (500..505).each { |code| test_code(code, log, obs) }
+     end
+
+     it "should not output a warning when 200 is encountered" do
+       log = double('log')
+       obs = Observer.new(log)
+       log.should_not_receive(:puts).with(/\d{3} encountered/)
+       obs.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
+     end
+
+   end
+ end
@@ -0,0 +1,6 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     you should never see this
+   </body>
@@ -0,0 +1,7 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <a href="/excluded/shouldnt-hit.html">nope</a>
+   </body>
+ </html>
@@ -0,0 +1,8 @@
+ <html>
+   <head>
+     <title>External</title>
+   </head>
+   <body>
+     <a href="http://example.com/">example</a>
+   </body>
+ </html>
@@ -0,0 +1,9 @@
+ <html>
+   <head>
+     <title>Test</title>
+   </head>
+   <body>
+     <a href="page2.html">Page 2</a>
+     <a href="page3.html">Page 3</a>
+   </body>
+ </html>
@@ -0,0 +1,7 @@
+ <html>
+   <head></head>
+   <body>
+     <a href=" flange.html">yes</a>
+     <a href="javascript: void(0);">javascript</a>
+   </body>
+ </html>
@@ -0,0 +1,7 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <a href="/pdf.pdf">pdf</a>
+   </body>
+ </html>
@@ -0,0 +1,14 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+   <head>
+     <title>non-http</title>
+   </head>
+   <body>
+     <!-- None of the following should be followed -->
+     <a href="mailto:test@example.com">mailto</a>
+     <a href="ftp://ftp.example.com">ftp</a>
+
+   </body>
+ </html>
@@ -0,0 +1,9 @@
+ <html>
+   <head>
+     <title>test2</title>
+   </head>
+   <body>
+     <a href="/page4.html">Link</a>
+     <a href="/page5.html">Link</a>
+   </body>
+ </html>
@@ -0,0 +1,8 @@
+ <html>
+   <head>
+     <title>test2</title>
+   </head>
+   <body>
+     <a href="/page5.html">Link</a>
+   </body>
+ </html>
File without changes
@@ -0,0 +1,8 @@
+ <html>
+   <head>
+     <title>something</title>
+   </head>
+   <body>
+     <a href="/">Whatever</a>
+   </body>
+ </html>
File without changes
@@ -0,0 +1,13 @@
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+ <html lang="en">
+   <head>
+     <title>self-reference</title>
+   </head>
+   <body>
+     <a href="self-reference.html">link</a> <!-- will be converted to nil -->
+     <a href="page5.html">link</a>
+
+   </body>
+ </html>
@@ -0,0 +1,4 @@
+ $LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+ require 'spec'
+ require 'crawler'
+ require 'webrick'
metadata ADDED
@@ -0,0 +1,85 @@
+ --- !ruby/object:Gem::Specification
+ name: crawler
+ version: !ruby/object:Gem::Version
+   version: "0.2"
+ platform: ruby
+ authors:
+ - Tyler Cunnion
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-01-25 00:00:00 -05:00
+ default_executable:
+ dependencies: []
+
+ description: Simple webcrawler
+ email: tyler.cunnion+ruby@gmail.com
+ executables:
+ - crawler
+ extensions: []
+
+ extra_rdoc_files:
+ - bin/crawler
+ - lib/crawler.rb
+ - lib/crawler/observer.rb
+ - lib/crawler/webcrawler.rb
+ files:
+ - Rakefile
+ - bin/crawler
+ - features/step_definitions/crawler_steps.rb
+ - features/support/env.rb
+ - lib/crawler.rb
+ - lib/crawler/observer.rb
+ - lib/crawler/webcrawler.rb
+ - spec/crawler/crawler_spec.rb
+ - spec/crawler/observer_spec.rb
+ - spec/fixtures/excluded/shouldnt-hit.html
+ - spec/fixtures/exclusion.html
+ - spec/fixtures/external.html
+ - spec/fixtures/index.html
+ - spec/fixtures/messed-up.html
+ - spec/fixtures/non-html.html
+ - spec/fixtures/non-http.html
+ - spec/fixtures/page2.html
+ - spec/fixtures/page3.html
+ - spec/fixtures/page4.html
+ - spec/fixtures/page5.html
+ - spec/fixtures/pdf.pdf
+ - spec/fixtures/self-reference.html
+ - spec/spec_helper.rb
+ - Manifest
+ - crawler.gemspec
+ has_rdoc: true
+ homepage: http://github.com/tylercunnion/crawler
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --line-numbers
+ - --inline-source
+ - --title
+ - Crawler
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "1.2"
+   version:
+ requirements: []
+
+ rubyforge_project: crawler
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Simple webcrawler
+ test_files: []
+