crawler 0.2
- data/Manifest +24 -0
- data/Rakefile +10 -0
- data/bin/crawler +37 -0
- data/crawler.gemspec +32 -0
- data/features/step_definitions/crawler_steps.rb +20 -0
- data/features/support/env.rb +2 -0
- data/lib/crawler.rb +2 -0
- data/lib/crawler/observer.rb +22 -0
- data/lib/crawler/webcrawler.rb +74 -0
- data/spec/crawler/crawler_spec.rb +136 -0
- data/spec/crawler/observer_spec.rb +28 -0
- data/spec/fixtures/excluded/shouldnt-hit.html +6 -0
- data/spec/fixtures/exclusion.html +7 -0
- data/spec/fixtures/external.html +8 -0
- data/spec/fixtures/index.html +9 -0
- data/spec/fixtures/messed-up.html +7 -0
- data/spec/fixtures/non-html.html +7 -0
- data/spec/fixtures/non-http.html +14 -0
- data/spec/fixtures/page2.html +9 -0
- data/spec/fixtures/page3.html +8 -0
- data/spec/fixtures/page4.html +0 -0
- data/spec/fixtures/page5.html +8 -0
- data/spec/fixtures/pdf.pdf +0 -0
- data/spec/fixtures/self-reference.html +13 -0
- data/spec/spec_helper.rb +4 -0
- metadata +85 -0
data/Manifest
ADDED
@@ -0,0 +1,24 @@
+Rakefile
+bin/crawler
+features/step_definitions/crawler_steps.rb
+features/support/env.rb
+lib/crawler.rb
+lib/crawler/observer.rb
+lib/crawler/webcrawler.rb
+spec/crawler/crawler_spec.rb
+spec/crawler/observer_spec.rb
+spec/fixtures/excluded/shouldnt-hit.html
+spec/fixtures/exclusion.html
+spec/fixtures/external.html
+spec/fixtures/index.html
+spec/fixtures/messed-up.html
+spec/fixtures/non-html.html
+spec/fixtures/non-http.html
+spec/fixtures/page2.html
+spec/fixtures/page3.html
+spec/fixtures/page4.html
+spec/fixtures/page5.html
+spec/fixtures/pdf.pdf
+spec/fixtures/self-reference.html
+spec/spec_helper.rb
+Manifest
data/Rakefile
ADDED
@@ -0,0 +1,10 @@
+require 'rake'
+require 'echoe'
+
+Echoe.new('crawler', '0.2') do |g|
+  g.description = "Simple webcrawler"
+  g.url = "http://github.com/tylercunnion/crawler"
+  g.author = "Tyler Cunnion"
+  g.email = "tyler.cunnion+ruby@gmail.com"
+  g.ignore_pattern = ["tmp/*", "features/*", "log/*"]
+end
data/bin/crawler
ADDED
@@ -0,0 +1,37 @@
+#! /usr/bin/ruby
+$LOAD_PATH << File.join(File.dirname(__FILE__), "..", "lib")
+require 'rubygems'
+require 'crawler'
+require 'optparse'
+
+options = {}
+logfile = ""
+optparser = OptionParser.new do |opts|
+  opts.on("-t", "--timeout X", Integer, "Timeout limit in seconds") { |x| options[:timeout] = x }
+  opts.on("-x", "--exclude path", Array, "List of paths to be excluded") { |x| options[:exclude] = x }
+  opts.on("-l", "--log file", String, "Filename to use as a log") { |log| logfile = log }
+  opts.parse!(ARGV)
+end
+
+unless logfile.empty?
+  log = File.new(logfile, "w")
+else
+  log = $stdout
+end
+
+
+uri_string = ARGV[0]
+begin
+  uri = URI.parse(uri_string)
+  raise unless uri.is_a?(URI::HTTP)
+rescue
+  puts "Error parsing URI: #{uri_string}"
+  Process.exit
+end
+
+crawler = Crawler::Webcrawler.new(options)
+observer = Crawler::Observer.new(log)
+
+crawler.add_observer(observer)
+
+crawler.crawl(uri)
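For orientation, the executable above would be driven along these lines (an illustrative invocation only; the host, log filename, and excluded path are placeholders):

    crawler -t 60 -x /private/ -l crawl.log http://example.com/

This parses the flags into the options hash handed to Crawler::Webcrawler.new, sends Observer output to crawl.log instead of STDOUT, skips URIs whose path contains /private/, and lets the crawl abort with a Timeout::Error after 60 seconds.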
data/crawler.gemspec
ADDED
@@ -0,0 +1,32 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{crawler}
+  s.version = "0.2"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Tyler Cunnion"]
+  s.date = %q{2010-01-25}
+  s.default_executable = %q{crawler}
+  s.description = %q{Simple webcrawler}
+  s.email = %q{tyler.cunnion+ruby@gmail.com}
+  s.executables = ["crawler"]
+  s.extra_rdoc_files = ["bin/crawler", "lib/crawler.rb", "lib/crawler/observer.rb", "lib/crawler/webcrawler.rb"]
+  s.files = ["Rakefile", "bin/crawler", "features/step_definitions/crawler_steps.rb", "features/support/env.rb", "lib/crawler.rb", "lib/crawler/observer.rb", "lib/crawler/webcrawler.rb", "spec/crawler/crawler_spec.rb", "spec/crawler/observer_spec.rb", "spec/fixtures/excluded/shouldnt-hit.html", "spec/fixtures/exclusion.html", "spec/fixtures/external.html", "spec/fixtures/index.html", "spec/fixtures/messed-up.html", "spec/fixtures/non-html.html", "spec/fixtures/non-http.html", "spec/fixtures/page2.html", "spec/fixtures/page3.html", "spec/fixtures/page4.html", "spec/fixtures/page5.html", "spec/fixtures/pdf.pdf", "spec/fixtures/self-reference.html", "spec/spec_helper.rb", "Manifest", "crawler.gemspec"]
+  s.homepage = %q{http://github.com/tylercunnion/crawler}
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Crawler"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = %q{crawler}
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Simple webcrawler}
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end
data/features/step_definitions/crawler_steps.rb
ADDED
@@ -0,0 +1,20 @@
+Given /^the crawl has not begun$/ do
+end
+
+When /^I start a crawl with the URI "([^\"]*)"$/ do |arg1|
+  @obs = Crawler::Observer.new
+  @crawler = Crawler::Webcrawler.new
+
+  @crawler.add_observer(@obs)
+
+  @uri = URI.parse(arg1)
+  @crawler.crawl(@uri)
+end
+
+Then /^the page should be downloaded$/ do
+  @crawler.crawled.should include(@uri)
+end
+
+Then /^the observer should be updated$/ do
+  @obs.should_receive(:update)
+end
data/lib/crawler.rb
ADDED
data/lib/crawler/observer.rb
ADDED
@@ -0,0 +1,22 @@
+module Crawler
+
+  # Observer watches a Webcrawler and outputs messages to a log object. This defaults to STDOUT but may be anything which responds to +puts+.
+  class Observer
+
+    # Log object. Must respond to +puts+.
+    attr_accessor :log
+
+    # Creates a new Observer object
+    def initialize(log=$stdout)
+      @log = log
+    end
+
+    # Called by the Observable module through Webcrawler.
+    def update(response, url)
+      @log.puts "Scanning: " + url.to_s
+      if response.kind_of?(Net::HTTPClientError) or response.kind_of?(Net::HTTPServerError)
+        @log.puts "#{response.code} encountered for " + url.to_s
+      end
+    end
+  end
+end
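As a point of reference, a minimal sketch of plugging this Observer into a crawl with a file-backed log (the filename and URL are placeholders; any object that responds to puts can serve as the log):

    require 'rubygems'
    require 'crawler'

    log = File.new("crawl.log", "w")               # anything with #puts works here
    observer = Crawler::Observer.new(log)

    crawler = Crawler::Webcrawler.new
    crawler.add_observer(observer)                 # Observable calls observer.update(response, uri) per page
    crawler.crawl(URI.parse("http://example.com/"))
    log.close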
data/lib/crawler/webcrawler.rb
ADDED
@@ -0,0 +1,74 @@
+require 'set'
+require 'observer'
+require 'net/http'
+require 'nokogiri'
+require 'timeout'
+
+module Crawler
+  class Webcrawler
+
+    include Observable
+
+    # Set of all URIs which have been crawled
+    attr_accessor :crawled
+    # Queue of URIs to be crawled. Array which acts as a FIFO queue.
+    attr_accessor :queue
+    # Hash of options
+    attr_accessor :options
+
+    # Accepts the following options:
+    # * timeout -- Time limit for the crawl operation, after which a Timeout::Error exception is raised.
+    def initialize(options={})
+      @crawled = Set.new
+      @queue = []
+      @options = {
+        :timeout => 1.0/0, # Infinity
+        :external => false,
+        :exclude => []
+      }.merge(options)
+
+    end
+
+    # Given a URI object, the crawler will explore every linked page recursively using the Breadth First Search algorithm.
+    def crawl(start_uri)
+      start_uri = start_uri.normalize
+      @queue << start_uri
+
+      timeout(@options[:timeout]) {
+        while(uri = @queue.shift)
+
+          Net::HTTP.start(uri.host, uri.port) do |http|
+
+            head = http.head(uri.path)
+            next if head.content_type != "text/html" # If the page retrieved is not an HTML document, we'll choke on it anyway. Skip it
+
+            resp = http.get(uri.path)
+
+            changed
+            notify_observers(resp, uri)
+
+            html = Nokogiri.parse(resp.body)
+            a_tags = html.search("a")
+            @queue = @queue + a_tags.collect do |t|
+              begin
+                next_uri = uri + t.attribute("href").to_s.strip
+              rescue
+                nil
+              end
+            end
+            @queue = @queue.compact.uniq
+            @queue = @queue.reject {|u|
+              @crawled.include?(u) or
+              u == uri or
+              !(u.kind_of?(URI::HTTP)) or
+              (u.host != uri.host and !@options[:external]) or
+              (@options[:exclude].any? { |excl| u.path.include?(excl)})
+            }
+          end
+          @crawled << uri
+        end
+      }
+
+    end
+  end
+end
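A short sketch of using the class programmatically with the options merged in initialize above (the values and host are illustrative, not defaults):

    require 'rubygems'
    require 'crawler'

    crawler = Crawler::Webcrawler.new(
      :timeout  => 120,              # Timeout::Error is raised if the whole crawl exceeds this
      :exclude  => ["/archive/"],    # skip URIs whose path contains any of these strings
      :external => false             # stay on the starting host (the default)
    )
    crawler.add_observer(Crawler::Observer.new)    # progress messages go to STDOUT

    crawler.crawl(URI.parse("http://example.com/"))
    puts "#{crawler.crawled.size} pages crawled"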
data/spec/crawler/crawler_spec.rb
ADDED
@@ -0,0 +1,136 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+
+module Crawler
+  describe Webcrawler do
+
+    before(:all) do
+      @uri_base = 'http://localhost:12000/'
+      www_root = File.join(File.dirname(__FILE__), '..', 'fixtures')
+      @server = Thread.new do
+        s = WEBrick::HTTPServer.new({:Port => 12000, :DocumentRoot => www_root, :AccessLog => []})
+        @port = s.config[:Port]
+        begin
+          s.start
+        ensure
+          s.shutdown
+        end
+      end
+    end
+
+    after(:all) do
+      @server.exit
+    end
+
+    context "before crawl" do
+      it "should have an empty crawl list" do
+        crawler = Webcrawler.new
+        crawler.crawled.should be_empty
+      end
+    end
+
+    context "during a crawl" do
+
+      before(:each) do
+        @crawler = Webcrawler.new
+        @obs = mock("observer", :update => nil, :null_object => true)
+        #@obs = Observer.new
+        @crawler.add_observer(@obs)
+      end
+
+      it "should send notifications" do
+        uri = URI.parse(@uri_base)
+        @obs.should_receive(:update)
+        @crawler.crawl(uri)
+      end
+
+      it "should send status code and URL" do
+        uri = URI.parse(@uri_base)
+        @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), kind_of(URI))
+        @crawler.crawl(uri)
+      end
+
+      it "should send 404 for missing URL" do
+        uri = URI.parse(@uri_base + 'doesnotexist.html')
+        @obs.should_receive(:update).with(instance_of(Net::HTTPNotFound), uri)
+        @crawler.crawl(uri)
+      end
+
+      it "should not crawl a page more than once" do
+        uri = URI.parse(@uri_base)
+        @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html').once
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/page5.html')
+        @crawler.crawl(uri)
+      end
+
+      it "should not add the current page to the queue" do
+        uri = URI.parse(@uri_base + "self-reference.html")
+        @obs.should_receive(:update).with(kind_of(Net::HTTPResponse), uri).once
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri)
+        @crawler.crawl(uri)
+      end
+
+      it "should remove nil items from the queue" do
+        uri = URI.parse(@uri_base + "self-reference.html")
+        @obs.should_receive(:update).twice
+        @crawler.crawl(uri)
+      end
+
+      it "should convert any exceptions to nil" do
+        uri = URI.parse(@uri_base + 'messed-up.html')
+        lambda { @crawler.crawl(uri) }.should_not raise_error
+      end
+
+      it "should not crawl anything but HTTP web addresses" do
+        uri = URI.parse(@uri_base + 'non-http.html')
+        @obs.should_receive(:update).once
+        @crawler.crawl(uri)
+      end
+
+      it "should not, by default, crawl outside its original host" do
+        uri = URI.parse(@uri_base + 'external.html')
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), URI.parse("http://example.com"))
+        @crawler.crawl(uri)
+      end
+
+      it "should only download HTML content types" do
+        uri = URI.parse(@uri_base + 'non-html.html')
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/pdf.pdf')
+        @crawler.crawl(uri)
+      end
+
+      it "should not download anything in the excluded option" do
+        uri = URI.parse(@uri_base + 'exclusion.html')
+        @crawler.options[:exclude] = ["/excluded/"]
+        @obs.should_not_receive(:update).with(kind_of(Net::HTTPResponse), uri + '/excluded/shouldnt-hit.html')
+        @crawler.crawl(uri)
+      end
+
+
+    end
+
+    context "after crawl" do
+      before(:each) do
+        @crawler = Webcrawler.new
+        @uri = URI.parse(@uri_base)
+        @crawler.crawl(@uri)
+      end
+
+      it "should have at least one item in crawled" do
+        @crawler.should have_at_least(1).crawled
+      end
+
+      it "should have put crawled links into crawled" do
+        @crawler.should have_at_least(2).crawled
+      end
+
+      it "should have the children of child pages in crawled" do
+        @crawler.crawled.should include(@uri + "/page4.html")
+      end
+
+      it "should have an empty queue" do
+        @crawler.queue.should be_empty
+      end
+
+    end
+  end
+end
data/spec/crawler/observer_spec.rb
ADDED
@@ -0,0 +1,28 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper.rb")
+require 'stringio'
+
+module Crawler
+  describe Observer do
+
+    def test_code(code, log, obs)
+      log.should_receive(:puts).with("#{code} encountered for http://example.com/")
+      resp = Net::HTTPResponse::CODE_TO_OBJ["#{code}"].new("1.1", code, "")
+      obs.update(resp, URI.parse("http://example.com/"))
+    end
+
+    it "should output a warning when an error code is reached" do
+      log = double('log', :null_object => true)
+      obs = Observer.new(log)
+      (400..416).each { |code| test_code(code, log, obs) }
+      (500..505).each { |code| test_code(code, log, obs) }
+    end
+
+    it "should not output a warning when 200 is encountered" do
+      log = double('log')
+      obs = Observer.new(log)
+      log.should_not_receive(:puts).with(/\d{3} encountered/)
+      obs.update(Net::HTTPOK.new("1.1", "200", ""), URI.parse("http://example.com/"))
+    end
+
+  end
+end
data/spec/fixtures/non-http.html
ADDED
@@ -0,0 +1,14 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html lang="en">
+<head>
+  <title>non-http</title>
+</head>
+<body>
+<!-- None of the following should be followed -->
+<a href="mailto:test@example.com">mailto</a>
+<a href="ftp://ftp.example.com">ftp</a>
+
+</body>
+</html>
data/spec/fixtures/page4.html
File without changes
data/spec/fixtures/pdf.pdf
File without changes
data/spec/fixtures/self-reference.html
ADDED
@@ -0,0 +1,13 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html lang="en">
+<head>
+  <title>self-reference</title>
+</head>
+<body>
+<a href="self-reference.html">link</a> <!-- will be converted to nil -->
+<a href="page5.html">link</a>
+
+</body>
+</html>
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,85 @@
+--- !ruby/object:Gem::Specification
+name: crawler
+version: !ruby/object:Gem::Version
+  version: "0.2"
+platform: ruby
+authors:
+- Tyler Cunnion
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2010-01-25 00:00:00 -05:00
+default_executable:
+dependencies: []
+
+description: Simple webcrawler
+email: tyler.cunnion+ruby@gmail.com
+executables:
+- crawler
+extensions: []
+
+extra_rdoc_files:
+- bin/crawler
+- lib/crawler.rb
+- lib/crawler/observer.rb
+- lib/crawler/webcrawler.rb
+files:
+- Rakefile
+- bin/crawler
+- features/step_definitions/crawler_steps.rb
+- features/support/env.rb
+- lib/crawler.rb
+- lib/crawler/observer.rb
+- lib/crawler/webcrawler.rb
+- spec/crawler/crawler_spec.rb
+- spec/crawler/observer_spec.rb
+- spec/fixtures/excluded/shouldnt-hit.html
+- spec/fixtures/exclusion.html
+- spec/fixtures/external.html
+- spec/fixtures/index.html
+- spec/fixtures/messed-up.html
+- spec/fixtures/non-html.html
+- spec/fixtures/non-http.html
+- spec/fixtures/page2.html
+- spec/fixtures/page3.html
+- spec/fixtures/page4.html
+- spec/fixtures/page5.html
+- spec/fixtures/pdf.pdf
+- spec/fixtures/self-reference.html
+- spec/spec_helper.rb
+- Manifest
+- crawler.gemspec
+has_rdoc: true
+homepage: http://github.com/tylercunnion/crawler
+licenses: []
+
+post_install_message:
+rdoc_options:
+- --line-numbers
+- --inline-source
+- --title
+- Crawler
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "1.2"
+  version:
+requirements: []
+
+rubyforge_project: crawler
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Simple webcrawler
+test_files: []
+