rcrawl 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +22 -0
- data/Rakefile +39 -0
- data/TODO +1 -0
- data/lib/rcrawl.rb +177 -0
- data/lib/robot_rules.rb +81 -0
- metadata +61 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2006 Shawn Hansen

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README
ADDED
@@ -0,0 +1,22 @@
Rcrawl is a web crawler written entirely in Ruby.
Right now it is limited to crawling within the domain of the URL it is given.
I decided to roll my own crawler in Ruby after finding only snippets of crawler
code scattered across various web sites and newsgroups.

The structure of the crawling process was inspired by the specs of the Mercator crawler (http://www.cindoc.csic.es/cybermetrics/pdf/68.pdf).

== Examples
  bot = Rcrawl.new(url)  # Instantiates a new Rcrawl object

  bot.crawl              # Actually crawls the website

== After the bot is done crawling
  bot.visited_links      # Returns an array of visited links

  bot.dump               # Returns a hash where each key is a URL and the value is
                         # the raw HTML from that URL

  bot.errors             # Returns a hash where each key is a URL and the value is
                         # the error message from stderr

  bot.external_links     # Returns an array of external links

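The calls shown above are the crawler's whole public interface. As a minimal end-to-end sketch putting them together (the start URL is only a placeholder, and it assumes the gem and its scrapi dependency are installed):

require 'rubygems'
require 'rcrawl'

# Placeholder URL; use a site you are permitted to crawl.
bot = Rcrawl.new("http://www.example.com/")

# Crawls every robot-safe page on the starting domain.
bot.crawl

puts "Visited #{bot.visited_links.size} pages"
puts "Found #{bot.external_links.size} external links"

# Pages are keyed by URL; errors are keyed by the URL that failed.
bot.dump.each   { |url, html|  puts "#{url}: #{html.length} bytes" }
bot.errors.each { |url, error| puts "#{url} failed: #{error}" }
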
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
require 'rubygems'
Gem::manage_gems
require 'rake'
require 'rake/rdoctask'
require 'rake/gempackagetask'

desc "Generate documentation"
Rake::RDocTask.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = "rdoc"
  rdoc.title = "Crawler"
  rdoc.options << "--line-numbers"
  rdoc.options << "--inline-source"
  rdoc.rdoc_files.include("README")
  rdoc.rdoc_files.include("lib/**/*.rb")
end

spec = Gem::Specification.new do |s|
  s.name = "rcrawl"
  s.version = "0.2.5"
  s.author = "Shawn Hansen"
  s.email = "shawn.hansen@gmail.com"
  s.homepage = "http://blog.denomi.net"
  s.platform = Gem::Platform::RUBY
  s.summary = "A web crawler written in ruby"
  s.files = FileList["{test,lib}/**/*", "README", "MIT-LICENSE", "Rakefile", "TODO"].to_a
  s.require_path = "lib"
  s.autorequire = "rcrawl.rb"
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "MIT-LICENSE", "TODO"]
  s.add_dependency("scrapi", ">=1.2.0")
  s.rubyforge_project = "rcrawl"
end

gem = Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end

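With this Rakefile, "rake rdoc" should build the documentation described in the Rake::RDocTask block, and "rake package" (provided by Rake::GemPackageTask) should build the .gem plus the .tgz and .zip archives requested via need_tar and need_zip. Note that Gem::manage_gems and the rake/rdoctask and rake/gempackagetask libraries belong to the RubyGems and Rake versions of that era (circa 2006) and are no longer present in current releases.
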
data/lib/rcrawl.rb
ADDED
@@ -0,0 +1,177 @@
#!/usr/bin/env ruby
# rcrawl/0.2.0

require 'rubygems'
require 'open-uri'
require 'scrapi'
require 'robot_rules'

# Rcrawl will retrieve an entire website, one page at a time,
# parsing the page using whatever modules you pass it to.
class Rcrawl

  # Initializes various variables when a new Rcrawl object is instantiated
  def initialize(site)
    @links_to_visit = Array.new
    @visited_links = Array.new
    @external_links = Array.new
    @raw_html = Hash.new
    @rules = RobotRules.new("Rcrawl")
    @sites = Hash.new
    @site = URI.parse(site)
    @links_to_visit << site
    @errors = Hash.new
    puts "Site is #{site}"
  end

  # Coordinates the whole crawling process
  def crawl
    until @links_to_visit.empty? do
      begin
        # Get link
        url_server
        next unless robot_safe? @url
        # Parse robots.txt, then download document if robot_safe
        fetch_http(@url)
        # Store raw HTML in variable to read/reread as needed
        # Then call any processing modules you need for the current document
        ris(@document)
      rescue
        puts ""
        puts "I died on #{@url}"
        $stderr.puts $!
        @errors[@url] = $!
        next
      ensure
        # Stuff you want to make sure gets printed out
        puts " done!"
      end
    end

    puts "Visited #{@visited_links.size} links."
  end

  # Authoritative list of URLs to be processed by Rcrawl
  def url_server
    unless @links_to_visit.empty?
      @url = @links_to_visit.pop
    end
  end

  # Download the document
  def fetch_http(url)
    # Make sure robots.txt has been parsed for this site first,
    # if not, parse robots.txt then grab document.
    uri = URI.parse(url)
    print "Visiting: #{url}"
    @document = uri.read
    @visited_links << url
  end

  # Rewind Input Stream, for storing and reading of raw HTML
  def ris(document)
    # Store raw HTML into local variable
    # Based on MIME type, invoke the proper processing modules
    if document.content_type == "text/html"
      print "."
      link_extractor(document) # If HTML
      process_html(document)   # If HTML
    else
      print "... not HTML, skipping..."
    end
  end

  # HTML processing module for extracting links
  def link_extractor(document)
    print "."

    # Parse all links from HTML into an array
    # Set up the scrAPI (http://labnotes.org)
    links = Scraper.define do
      array :urls
      process "a[href]", :urls => "@href"
      result :urls
    end

    urls = links.scrape(document)

    urls.each { |url|
      uri = URI.parse(url)

      # Derelativeize links if necessary
      if uri.relative?
        url = @site.merge(url).to_s
        uri = URI.parse(url)
      end

      # Check domain, if in same domain, keep link, else trash it
      if uri.host != @site.host
        @external_links << url
        @external_links.uniq!
        next
      end

      # Find out if we've seen this link already
      if (@visited_links.include? url) || (@links_to_visit.include? url)
        next
      end

      @links_to_visit << url
    }

  end

  # HTML processing module for raw HTML storage
  def process_html(document)
    # Add link and raw HTML to a hash as key/value
    # for later storage in database
    unless @raw_html.has_value?(document)
      print "."
      @raw_html[document.base_uri] = document
    end
  end

  # robots.txt parsing
  def robot_safe?(url)
    uri = URI.parse(url)
    location = "#{uri.host}:#{uri.port}"

    return true unless %w{http https}.include?(uri.scheme)

    unless @sites.include? location
      @sites[location] = true

      robot_url = "http://#{location}/robots.txt"
      begin
        robot_file = open(robot_url) { |page| page.read }
      rescue
        return true
      end
      @rules.parse(robot_url, robot_file)
    end

    @rules.allowed? url
  end

  # Returns array of links visited during crawl
  def visited_links
    return @visited_links
  end

  # Returns array of external links
  def external_links
    return @external_links
  end

  # Returns a hash where {key => URL, value => HTML} from all pages crawled
  def dump
    return @raw_html
  end

  # Returns a hash where {key => URL, value => "Error message"} from any
  # errors encountered during the crawl
  def errors
    return @errors
  end

end

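The link_extractor method above is the heart of the crawl loop: scrAPI collects the href of every anchor, relative links are resolved against the starting site, and off-domain links go to @external_links instead of the crawl frontier. A small standalone sketch of that same selector, assuming scrapi 1.2.x will scrape a raw HTML string (the HTML and host names below are made up for illustration):

require 'rubygems'
require 'scrapi'
require 'uri'

# The same scraper definition used by Rcrawl#link_extractor.
links = Scraper.define do
  array :urls
  process "a[href]", :urls => "@href"
  result :urls
end

html = <<-HTML
  <html><body>
    <a href="/about">About</a>
    <a href="http://other.example.org/">Elsewhere</a>
  </body></html>
HTML

site = URI.parse("http://www.example.com/")
links.scrape(html).each do |url|
  uri = URI.parse(url)
  url = site.merge(url).to_s if uri.relative?  # derelativize, as in link_extractor
  kind = URI.parse(url).host == site.host ? "internal" : "external"
  puts "#{kind}: #{url}"
end
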
data/lib/robot_rules.rb
ADDED
@@ -0,0 +1,81 @@
#!/usr/bin/env ruby

# robot_rules.rb
#
# Created by James Edward Gray II on 2006-01-31.
# Copyright 2006 Gray Productions. All rights reserved.
# Included with rcrawl by permission from James Edward Gray II

require "uri"

# Based on Perl's WWW::RobotRules module, by Gisle Aas.
class RobotRules
  def initialize( user_agent )
    @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
                                                   "").downcase
    @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
  end

  def parse( text_uri, robots_data )
    uri = URI.parse(text_uri)
    location = "#{uri.host}:#{uri.port}"
    @rules.delete(location)

    rules = robots_data.split(/[\015\012]+/).
                        map { |rule| rule.sub(/\s*#.*$/, "") }
    anon_rules = Array.new
    my_rules = Array.new
    current = anon_rules
    rules.each do |rule|
      case rule
      when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
        break unless my_rules.empty?

        current = if $1 == "*"
                    anon_rules
                  elsif $1.downcase.index(@user_agent)
                    my_rules
                  else
                    nil
                  end
      when /^\s*Disallow\s*:\s*(.*?)\s*$/i
        next if current.nil?

        if $1.empty?
          current << nil
        else
          disallow = URI.parse($1)

          next unless disallow.scheme.nil? or disallow.scheme ==
                      uri.scheme
          next unless disallow.port.nil? or disallow.port == uri.port
          next unless disallow.host.nil? or
                      disallow.host.downcase == uri.host.downcase

          disallow = disallow.path
          disallow = "/" if disallow.empty?
          disallow = "/#{disallow}" unless disallow[0] == ?/

          current << disallow
        end
      end
    end

    @rules[location] = if my_rules.empty?
                         anon_rules.compact
                       else
                         my_rules.compact
                       end
  end

  def allowed?( text_uri )
    uri = URI.parse(text_uri)
    location = "#{uri.host}:#{uri.port}"
    path = uri.path

    return true unless %w{http https}.include?(uri.scheme)

    not @rules[location].any? { |rule| path.index(rule) == 0 }
  end
end

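RobotRules also works standalone: parse is handed the text of a host's robots.txt, and allowed? then answers per URL for that host. A short sketch with an inline robots.txt (the host and rules are invented for illustration):

require 'robot_rules'

rules = RobotRules.new("Rcrawl/0.2.5")  # only the "rcrawl" token is kept and downcased

robots_txt = <<-ROBOTS
User-Agent: *
Disallow: /private
Disallow: /tmp
ROBOTS

rules.parse("http://www.example.com/robots.txt", robots_txt)

rules.allowed?("http://www.example.com/index.html")  # => true
rules.allowed?("http://www.example.com/private/a")   # => false
rules.allowed?("ftp://www.example.com/whatever")     # => true (non-HTTP URLs are always allowed)
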
metadata
ADDED
@@ -0,0 +1,61 @@
--- !ruby/object:Gem::Specification
rubygems_version: 0.9.0
specification_version: 1
name: rcrawl
version: !ruby/object:Gem::Version
  version: 0.2.5
date: 2006-09-20 00:00:00 -05:00
summary: A web crawler written in ruby
require_paths:
- lib
email: shawn.hansen@gmail.com
homepage: http://blog.denomi.net
rubyforge_project: rcrawl
description:
autorequire: rcrawl.rb
default_executable:
bindir: bin
has_rdoc: true
required_ruby_version: !ruby/object:Gem::Version::Requirement
  requirements:
  - - ">"
    - !ruby/object:Gem::Version
      version: 0.0.0
  version:
platform: ruby
signing_key:
cert_chain:
post_install_message:
authors:
- Shawn Hansen
files:
- lib/rcrawl.rb
- lib/robot_rules.rb
- README
- MIT-LICENSE
- Rakefile
- TODO
test_files: []

rdoc_options: []

extra_rdoc_files:
- README
- MIT-LICENSE
- TODO
executables: []

extensions: []

requirements: []

dependencies:
- !ruby/object:Gem::Dependency
  name: scrapi
  version_requirement:
  version_requirements: !ruby/object:Gem::Version::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.2.0
    version: