rcrawl 0.3.5 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +2 -2
- data/Rakefile +1 -1
- data/lib/rcrawl/crawler.rb +153 -2
- data/lib/rcrawl/version.rb +5 -0
- data/lib/rcrawl.rb +2 -124
- metadata +4 -5
- data/lib/rcrawl/process/html.rb +0 -61
data/README
CHANGED
@@ -32,7 +32,7 @@ The structure of the crawling process was inspired by the specs of the Mercator
 
 # Returns a hash where the key is a url and the value is
 # the raw html from that url
-crawler.
+crawler.raw_html
 
 
 # Returns a hash where the key is a URL and the value is
@@ -48,4 +48,4 @@ Copyright © 2006 Digital Duckies, LLC, under MIT License
 
 Developed for http://digitalduckies.net
 
-News, code, and documentation at http://digitalduckies.net
+News, code, and documentation at http://blog.digitalduckies.net
data/Rakefile
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -1,4 +1,155 @@
 module Rcrawl
+
   class Crawler
-
-
+
+    attr_accessor :links_to_visit, :site
+    attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
+                :errors
+    # Initializes various variables when a new Crawler object is instantiated
+    def initialize(site)
+      puts "Rcrawl Version #{VERSION} initializing..."
+      @links_to_visit = Array.new
+      @visited_links = Array.new
+      @external_links = Array.new
+      @raw_html = Hash.new
+      @rules = RobotRules.new("rcrawl/#{VERSION}")
+      @sites = Hash.new
+      @errors = Hash.new
+      @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+      @links_to_visit << site
+      puts "Ready to crawl #{site}"
+    end
+
+    # Coordinates the whole crawling process
+    def crawl
+      until @links_to_visit.empty? do
+        begin
+          # Get link
+          url_server
+          next unless robot_safe? @url
+          # Parse robots.txt, then download document if robot_safe
+          fetch_http(@url)
+          # Store raw HTML in variable to read/reread as needed
+          # Then call any processing modules you need for the current document
+          ris(@document)
+        rescue
+          puts ""
+          puts "I died on #{@url}"
+          $stderr.puts $!
+          @errors[@url] = $!
+          next
+        ensure
+          # Stuff you want to make sure gets printed out
+          puts " done!"
+        end
+      end
+
+      puts "Visited #{@visited_links.size} links."
+    end
+
+    # Authoritative list of URLs to be processed by Rcrawl
+    def url_server
+      unless @links_to_visit.empty?
+        @url = @links_to_visit.pop
+      end
+    end
+
+    # Download the document
+    def fetch_http(url)
+      # Make sure robots.txt has been parsed for this site first,
+      # if not, parse robots.txt then grab document.
+      uri = URI.parse(url)
+      print "Visiting: #{url}"
+      @document = uri.read
+      @visited_links << url
+    end
+
+    # Rewind Input Stream, for storing and reading of raw HTML
+    def ris(document)
+      print "."
+      # Store raw HTML into local variable
+      # Based on MIME type, invoke the proper processing modules
+      case document.content_type
+      when "text/html"
+        link_extractor(document)
+        process_html(document)
+      else
+        print "... not HTML, skipping..."
+      end
+    end
+
+    # HTML processing module for extracting links
+    def link_extractor(document)
+      print "."
+      # Parse all links from HTML into an array
+      # Set up the scrAPI (http://labnotes.org)
+      links = Scraper.define do
+        array :urls
+        process "a[href]", :urls => "@href"
+        result :urls
+      end
+
+      urls = links.scrape(document)
+
+      urls.each { |url|
+        uri = URI.parse(url)
+
+        # Derelativeize links if necessary
+        if uri.relative?
+          url = @site.merge(url).to_s
+          uri = URI.parse(url)
+        end
+
+        # Check domain, if in same domain, keep link, else trash it
+        if uri.host != @site.host
+          @external_links << url
+          @external_links.uniq!
+          next
+        end
+
+        # Find out if we've seen this link already
+        if (@visited_links.include? url) || (@links_to_visit.include? url)
+          next
+        end
+
+        @links_to_visit << url
+      }
+    end
+
+    # HTML processing module for raw HTML storage
+    def process_html(document)
+
+      # Add link and raw HTML to a hash as key/value
+      # for later storage in database
+      unless @raw_html.has_value?(document)
+        print "."
+        @raw_html[@document.base_uri] = document
+      end
+
+    end
+
+    # robots.txt parsing
+    def robot_safe?(url)
+      uri = URI.parse(url)
+      location = "#{uri.host}:#{uri.port}"
+
+      return true unless %w{http https}.include?(uri.scheme)
+
+      unless @sites.include? location
+        @sites[location] = true
+
+        robot_url = "http://#{location}/robots.txt"
+        begin
+          robot_file = open(robot_url) { |page| page.read }
+        rescue
+          return true
+        end
+        @rules.parse(robot_url, robot_file)
+      end
+
+      @rules.allowed? url
+    end
+
+  end # class Crawler
+
+end # module Rcrawl
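For orientation, a minimal usage sketch of the Crawler class added above, assuming a hypothetical target URL; the accessors shown are the ones exposed by the new attr_accessor/attr_reader declarations:

  require 'rcrawl'

  # Hypothetical site; any crawlable URL works.
  crawler = Rcrawl::Crawler.new("http://www.example.com/")
  crawler.crawl             # runs until @links_to_visit is empty

  crawler.visited_links     # Array of URLs fetched during the crawl
  crawler.external_links    # Array of off-site URLs that were set aside
  crawler.raw_html          # Hash of { base URI => raw HTML } per fetched page
  crawler.errors            # Hash of { URL => error message } for failed fetches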
data/lib/rcrawl.rb
CHANGED
@@ -4,128 +4,6 @@ require 'rubygems'
 require 'open-uri'
 require 'scrapi'
 require 'rcrawl/robot_rules'
-require 'rcrawl/
+require 'rcrawl/crawler'
+require 'rcrawl/version'
 
-module Rcrawl
-
-  # Crawler will retrieve an entire website, one page at a time,
-  # parsing the page using whatever modules you pass it to.
-  class Crawler
-
-    # Initializes various variables when a new Rcrawl object is instantiated
-    def initialize(site)
-      @links_to_visit = Array.new
-      @visited_links = Array.new
-      @external_links = Array.new
-      @raw_html = Hash.new
-      @rules = RobotRules.new("Rcrawl")
-      @sites = Hash.new
-      @site = URI.parse(site)
-      @links_to_visit << site
-      @errors = Hash.new
-      puts "Site is #{site}"
-    end
-
-    # Coordinates the whole crawling process
-    def crawl
-      until @links_to_visit.empty? do
-        begin
-          # Get link
-          url_server
-          next unless robot_safe? @url
-          # Parse robots.txt, then download document if robot_safe
-          fetch_http(@url)
-          # Store raw HTML in variable to read/reread as needed
-          # Then call any processing modules you need for the current document
-          ris(@document)
-        rescue
-          puts ""
-          puts "I died on #{@url}"
-          $stderr.puts $!
-          @errors[@url] = $!
-          next
-        ensure
-          # Stuff you want to make sure gets printed out
-          puts " done!"
-        end
-      end
-
-      puts "Visited #{@visited_links.size} links."
-    end
-
-    # Authoritative list of URLs to be processed by Rcrawl
-    def url_server
-      unless @links_to_visit.empty?
-        @url = @links_to_visit.pop
-      end
-    end
-
-    # Download the document
-    def fetch_http(url)
-      # Make sure robots.txt has been parsed for this site first,
-      # if not, parse robots.txt then grab document.
-      uri = URI.parse(url)
-      print "Visiting: #{url}"
-      @document = uri.read
-      @visited_links << url
-    end
-
-    # Rewind Input Stream, for storing and reading of raw HTML
-    def ris(document)
-      print "."
-      # Store raw HTML into local variable
-      # Based on MIME type, invoke the proper processing modules
-      case document.content_type
-      when "text/html"
-        Rcrawl::Process::HTML.link_extractor(document)
-        Rcrawl::Process::HTML.process_html(document)
-      else
-        print "... not HTML, skipping..."
-      end
-    end
-
-    # robots.txt parsing
-    def robot_safe?(url)
-      uri = URI.parse(url)
-      location = "#{uri.host}:#{uri.port}"
-
-      return true unless %w{http https}.include?(uri.scheme)
-
-      unless @sites.include? location
-        @sites[location] = true
-
-        robot_url = "http://#{location}/robots.txt"
-        begin
-          robot_file = open(robot_url) { |page| page.read }
-        rescue
-          return true
-        end
-        @rules.parse(robot_url, robot_file)
-      end
-
-      @rules.allowed? url
-    end
-
-    # Returns array of links visited during crawl
-    def visited_links
-      return @visited_links
-    end
-
-    # Returns array of external links
-    def external_links
-      return @external_links
-    end
-
-    # Returns a hash where {key => URL, value => HTML} from all pages crawled
-    def dump
-      return @raw_html
-    end
-
-    # Returns a hash where {key => URL, value => "Error message"} from any
-    # errors encountered during the crawl
-    def errors
-      return @errors
-    end
-
-  end
-end
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rcrawl
 version: !ruby/object:Gem::Version
-  version: 0.
-date: 2006-09-
+  version: 0.4.5
+date: 2006-09-26 00:00:00 -05:00
 summary: A web crawler written in ruby
 require_paths:
 - lib
@@ -31,10 +31,9 @@ authors:
 files:
 - lib/rcrawl.rb
 - lib/rcrawl
-- lib/rcrawl/crawler.rb
 - lib/rcrawl/robot_rules.rb
-- lib/rcrawl/
-- lib/rcrawl/
+- lib/rcrawl/crawler.rb
+- lib/rcrawl/version.rb
 - README
 - MIT-LICENSE
 - Rakefile
data/lib/rcrawl/process/html.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
module Rcrawl
|
2
|
-
|
3
|
-
module Process
|
4
|
-
|
5
|
-
module HTML
|
6
|
-
|
7
|
-
# HTML processing module for extracting links
|
8
|
-
def HTML.link_extractor(document)
|
9
|
-
print "."
|
10
|
-
# Parse all links from HTML into an array
|
11
|
-
# Set up the scrAPI (http://labnotes.org)
|
12
|
-
links = Scraper.define do
|
13
|
-
array :urls
|
14
|
-
process "a[href]", :urls => "@href"
|
15
|
-
result :urls
|
16
|
-
end
|
17
|
-
|
18
|
-
urls = links.scrape(document)
|
19
|
-
|
20
|
-
urls.each { |url|
|
21
|
-
uri = URI.parse(url)
|
22
|
-
|
23
|
-
# Derelativeize links if necessary
|
24
|
-
if uri.relative?
|
25
|
-
url = @site.merge(url).to_s
|
26
|
-
uri = URI.parse(url)
|
27
|
-
end
|
28
|
-
|
29
|
-
# Check domain, if in same domain, keep link, else trash it
|
30
|
-
if uri.host != @site.host
|
31
|
-
@external_links << url
|
32
|
-
@external_links.uniq!
|
33
|
-
next
|
34
|
-
end
|
35
|
-
|
36
|
-
# Find out if we've seen this link already
|
37
|
-
if (@visited_links.include? url) || (@links_to_visit.include? url)
|
38
|
-
next
|
39
|
-
end
|
40
|
-
|
41
|
-
@links_to_visit << url
|
42
|
-
}
|
43
|
-
end
|
44
|
-
|
45
|
-
# HTML processing module for raw HTML storage
|
46
|
-
def HTML.process_html(document)
|
47
|
-
|
48
|
-
# Add link and raw HTML to a hash as key/value
|
49
|
-
# for later storage in database
|
50
|
-
unless @raw_html.has_value?(document)
|
51
|
-
print "."
|
52
|
-
@raw_html[document.base_uri] = document
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
|
59
|
-
end
|
60
|
-
|
61
|
-
end
|
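The module functions deleted here are not dropped from the gem; as the crawler.rb diff above shows, they reappear as instance methods on Rcrawl::Crawler, so the dispatch inside ris changes roughly as follows (a sketch for comparison, not part of the diff):

  # 0.3.5: module functions on Rcrawl::Process::HTML
  Rcrawl::Process::HTML.link_extractor(document)
  Rcrawl::Process::HTML.process_html(document)

  # 0.4.5: instance methods called from within Crawler#ris
  link_extractor(document)
  process_html(document)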