rcrawl 0.3.5 → 0.4.5
- data/README +2 -2
- data/Rakefile +1 -1
- data/lib/rcrawl/crawler.rb +153 -2
- data/lib/rcrawl/version.rb +5 -0
- data/lib/rcrawl.rb +2 -124
- metadata +4 -5
- data/lib/rcrawl/process/html.rb +0 -61
data/README
CHANGED
@@ -32,7 +32,7 @@ The structure of the crawling process was inspired by the specs of the Mercator

 # Returns a hash where the key is a url and the value is
 # the raw html from that url
-crawler.
+crawler.raw_html


 # Returns a hash where the key is a URL and the value is
@@ -48,4 +48,4 @@ Copyright © 2006 Digital Duckies, LLC, under MIT License

 Developed for http://digitalduckies.net

-News, code, and documentation at http://digitalduckies.net
+News, code, and documentation at http://blog.digitalduckies.net
data/Rakefile
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -1,4 +1,155 @@
 module Rcrawl
+
   class Crawler
-
-
+
+    attr_accessor :links_to_visit, :site
+    attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
+                :errors
+    # Initializes various variables when a new Crawler object is instantiated
+    def initialize(site)
+      puts "Rcrawl Version #{VERSION} initializing..."
+      @links_to_visit = Array.new
+      @visited_links = Array.new
+      @external_links = Array.new
+      @raw_html = Hash.new
+      @rules = RobotRules.new("rcrawl/#{VERSION}")
+      @sites = Hash.new
+      @errors = Hash.new
+      @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+      @links_to_visit << site
+      puts "Ready to crawl #{site}"
+    end
+
+    # Coordinates the whole crawling process
+    def crawl
+      until @links_to_visit.empty? do
+        begin
+          # Get link
+          url_server
+          next unless robot_safe? @url
+          # Parse robots.txt, then download document if robot_safe
+          fetch_http(@url)
+          # Store raw HTML in variable to read/reread as needed
+          # Then call any processing modules you need for the current document
+          ris(@document)
+        rescue
+          puts ""
+          puts "I died on #{@url}"
+          $stderr.puts $!
+          @errors[@url] = $!
+          next
+        ensure
+          # Stuff you want to make sure gets printed out
+          puts " done!"
+        end
+      end
+
+      puts "Visited #{@visited_links.size} links."
+    end
+
+    # Authoritative list of URLs to be processed by Rcrawl
+    def url_server
+      unless @links_to_visit.empty?
+        @url = @links_to_visit.pop
+      end
+    end
+
+    # Download the document
+    def fetch_http(url)
+      # Make sure robots.txt has been parsed for this site first,
+      # if not, parse robots.txt then grab document.
+      uri = URI.parse(url)
+      print "Visiting: #{url}"
+      @document = uri.read
+      @visited_links << url
+    end
+
+    # Rewind Input Stream, for storing and reading of raw HTML
+    def ris(document)
+      print "."
+      # Store raw HTML into local variable
+      # Based on MIME type, invoke the proper processing modules
+      case document.content_type
+      when "text/html"
+        link_extractor(document)
+        process_html(document)
+      else
+        print "... not HTML, skipping..."
+      end
+    end
+
+    # HTML processing module for extracting links
+    def link_extractor(document)
+      print "."
+      # Parse all links from HTML into an array
+      # Set up the scrAPI (http://labnotes.org)
+      links = Scraper.define do
+        array :urls
+        process "a[href]", :urls => "@href"
+        result :urls
+      end
+
+      urls = links.scrape(document)
+
+      urls.each { |url|
+        uri = URI.parse(url)
+
+        # Derelativeize links if necessary
+        if uri.relative?
+          url = @site.merge(url).to_s
+          uri = URI.parse(url)
+        end
+
+        # Check domain, if in same domain, keep link, else trash it
+        if uri.host != @site.host
+          @external_links << url
+          @external_links.uniq!
+          next
+        end
+
+        # Find out if we've seen this link already
+        if (@visited_links.include? url) || (@links_to_visit.include? url)
+          next
+        end
+
+        @links_to_visit << url
+      }
+    end
+
+    # HTML processing module for raw HTML storage
+    def process_html(document)
+
+      # Add link and raw HTML to a hash as key/value
+      # for later storage in database
+      unless @raw_html.has_value?(document)
+        print "."
+        @raw_html[@document.base_uri] = document
+      end
+
+    end
+
+    # robots.txt parsing
+    def robot_safe?(url)
+      uri = URI.parse(url)
+      location = "#{uri.host}:#{uri.port}"
+
+      return true unless %w{http https}.include?(uri.scheme)
+
+      unless @sites.include? location
+        @sites[location] = true
+
+        robot_url = "http://#{location}/robots.txt"
+        begin
+          robot_file = open(robot_url) { |page| page.read }
+        rescue
+          return true
+        end
+        @rules.parse(robot_url, robot_file)
+      end
+
+      @rules.allowed? url
+    end
+
+  end # class Crawler
+
+end # module Rcrawl
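For context, the relocated class is driven roughly as follows. This is a minimal sketch based only on the API visible in the diff above (the target URL is just a placeholder):

require 'rcrawl'

# Point the crawler at a site; the URL here is an example.
crawler = Rcrawl::Crawler.new("http://digitalduckies.net")

# Crawl the whole site, one page at a time.
crawler.crawl

# The attr_readers added in 0.4.5 replace the old hand-written
# accessor methods (visited_links, external_links, dump, errors):
crawler.raw_html        # { url => raw HTML } for every page fetched
crawler.visited_links   # array of URLs visited during the crawl
crawler.external_links  # array of off-site URLs encountered
crawler.errors          # { url => error message } for failed fetches

Note that raw_html supersedes the old dump method, which is why the README example changed from crawler.dump-style access to crawler.raw_html.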
data/lib/rcrawl.rb
CHANGED
@@ -4,128 +4,6 @@ require 'rubygems'
 require 'open-uri'
 require 'scrapi'
 require 'rcrawl/robot_rules'
-require 'rcrawl/
+require 'rcrawl/crawler'
+require 'rcrawl/version'

-module Rcrawl
-
-  # Crawler will retrieve an entire website, one page at a time,
-  # parsing the page using whatever modules you pass it to.
-  class Crawler
-
-    # Initializes various variables when a new Rcrawl object is instantiated
-    def initialize(site)
-      @links_to_visit = Array.new
-      @visited_links = Array.new
-      @external_links = Array.new
-      @raw_html = Hash.new
-      @rules = RobotRules.new("Rcrawl")
-      @sites = Hash.new
-      @site = URI.parse(site)
-      @links_to_visit << site
-      @errors = Hash.new
-      puts "Site is #{site}"
-    end
-
-    # Coordinates the whole crawling process
-    def crawl
-      until @links_to_visit.empty? do
-        begin
-          # Get link
-          url_server
-          next unless robot_safe? @url
-          # Parse robots.txt, then download document if robot_safe
-          fetch_http(@url)
-          # Store raw HTML in variable to read/reread as needed
-          # Then call any processing modules you need for the current document
-          ris(@document)
-        rescue
-          puts ""
-          puts "I died on #{@url}"
-          $stderr.puts $!
-          @errors[@url] = $!
-          next
-        ensure
-          # Stuff you want to make sure gets printed out
-          puts " done!"
-        end
-      end
-
-      puts "Visited #{@visited_links.size} links."
-    end
-
-    # Authoritative list of URLs to be processed by Rcrawl
-    def url_server
-      unless @links_to_visit.empty?
-        @url = @links_to_visit.pop
-      end
-    end
-
-    # Download the document
-    def fetch_http(url)
-      # Make sure robots.txt has been parsed for this site first,
-      # if not, parse robots.txt then grab document.
-      uri = URI.parse(url)
-      print "Visiting: #{url}"
-      @document = uri.read
-      @visited_links << url
-    end
-
-    # Rewind Input Stream, for storing and reading of raw HTML
-    def ris(document)
-      print "."
-      # Store raw HTML into local variable
-      # Based on MIME type, invoke the proper processing modules
-      case document.content_type
-      when "text/html"
-        Rcrawl::Process::HTML.link_extractor(document)
-        Rcrawl::Process::HTML.process_html(document)
-      else
-        print "... not HTML, skipping..."
-      end
-    end
-
-    # robots.txt parsing
-    def robot_safe?(url)
-      uri = URI.parse(url)
-      location = "#{uri.host}:#{uri.port}"
-
-      return true unless %w{http https}.include?(uri.scheme)
-
-      unless @sites.include? location
-        @sites[location] = true
-
-        robot_url = "http://#{location}/robots.txt"
-        begin
-          robot_file = open(robot_url) { |page| page.read }
-        rescue
-          return true
-        end
-        @rules.parse(robot_url, robot_file)
-      end
-
-      @rules.allowed? url
-    end
-
-    # Returns array of links visited during crawl
-    def visited_links
-      return @visited_links
-    end
-
-    # Returns array of external links
-    def external_links
-      return @external_links
-    end
-
-    # Returns a hash where {key => URL, value => HTML} from all pages crawled
-    def dump
-      return @raw_html
-    end
-
-    # Returns a hash where {key => URL, value => "Error message"} from any
-    # errors encountered during the crawl
-    def errors
-      return @errors
-    end
-
-  end
-end
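The newly required data/lib/rcrawl/version.rb (+5 -0) is not shown in this diff. Given that crawler.rb interpolates a bare VERSION constant inside module Rcrawl and the gem metadata says 0.4.5, its contents are presumably along these lines (a guess at the file, not its actual text):

module Rcrawl

  # Hypothetical reconstruction; the real file may differ in layout.
  VERSION = "0.4.5"

end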
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rcrawl
 version: !ruby/object:Gem::Version
-  version: 0.
-date: 2006-09-
+  version: 0.4.5
+date: 2006-09-26 00:00:00 -05:00
 summary: A web crawler written in ruby
 require_paths:
 - lib
@@ -31,10 +31,9 @@ authors:
 files:
 - lib/rcrawl.rb
 - lib/rcrawl
-- lib/rcrawl/crawler.rb
 - lib/rcrawl/robot_rules.rb
-- lib/rcrawl/
-- lib/rcrawl/
+- lib/rcrawl/crawler.rb
+- lib/rcrawl/version.rb
 - README
 - MIT-LICENSE
 - Rakefile
data/lib/rcrawl/process/html.rb
DELETED
@@ -1,61 +0,0 @@
-module Rcrawl
-
-  module Process
-
-    module HTML
-
-      # HTML processing module for extracting links
-      def HTML.link_extractor(document)
-        print "."
-        # Parse all links from HTML into an array
-        # Set up the scrAPI (http://labnotes.org)
-        links = Scraper.define do
-          array :urls
-          process "a[href]", :urls => "@href"
-          result :urls
-        end
-
-        urls = links.scrape(document)
-
-        urls.each { |url|
-          uri = URI.parse(url)
-
-          # Derelativeize links if necessary
-          if uri.relative?
-            url = @site.merge(url).to_s
-            uri = URI.parse(url)
-          end
-
-          # Check domain, if in same domain, keep link, else trash it
-          if uri.host != @site.host
-            @external_links << url
-            @external_links.uniq!
-            next
-          end
-
-          # Find out if we've seen this link already
-          if (@visited_links.include? url) || (@links_to_visit.include? url)
-            next
-          end
-
-          @links_to_visit << url
-        }
-      end
-
-      # HTML processing module for raw HTML storage
-      def HTML.process_html(document)
-
-        # Add link and raw HTML to a hash as key/value
-        # for later storage in database
-        unless @raw_html.has_value?(document)
-          print "."
-          @raw_html[document.base_uri] = document
-        end
-
-      end
-
-    end
-
-  end
-
-end