rcrawl 0.2.5

Files changed (7)
  1. data/MIT-LICENSE +20 -0
  2. data/README +22 -0
  3. data/Rakefile +39 -0
  4. data/TODO +1 -0
  5. data/lib/rcrawl.rb +177 -0
  6. data/lib/robot_rules.rb +81 -0
  7. metadata +61 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2006 Shawn Hansen
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
+ Rcrawl is a web crawler written entirely in Ruby.
+ Its main limitation right now is that it stays on the original domain it is given.
+ I decided to roll my own crawler in Ruby after finding only scattered snippets of
+ crawler code for Ruby on various web sites and newsgroups.
+
+ The structure of the crawling process was inspired by the specification of the Mercator crawler (http://www.cindoc.csic.es/cybermetrics/pdf/68.pdf).
+
+ == Examples
+   bot = Rcrawl.new(url)   # Instantiates a new Rcrawl object
+
+   bot.crawl               # Actually crawls the website
+
+ == After the bot is done crawling
+   bot.visited_links       # Returns an array of visited links
+
+   bot.dump                # Returns a hash where the key is a URL and the value is
+                           # the raw HTML from that URL
+
+   bot.errors              # Returns a hash where the key is a URL and the value is
+                           # the error message from stderr
+
+   bot.external_links      # Returns an array of external links
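
Taken together, the README's examples amount to the short end-to-end session sketched below. The target URL and the printed output are purely illustrative; only the Rcrawl methods documented above are assumed.

    require 'rubygems'
    require 'rcrawl'

    # Crawl a (hypothetical) site, staying on its domain.
    bot = Rcrawl.new("http://www.example.com/")
    bot.crawl

    puts "Visited #{bot.visited_links.size} pages"
    puts "Found #{bot.external_links.size} external links"

    # Raw HTML keyed by URL, e.g. for saving to a database later.
    bot.dump.each do |url, html|
      puts "#{url} => #{html.length} bytes"
    end

    # Pages that raised an error during the crawl.
    bot.errors.each { |url, error| puts "FAILED #{url}: #{error}" }
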
data/Rakefile ADDED
@@ -0,0 +1,39 @@
+ require 'rubygems'
+ Gem::manage_gems
+ require 'rake'
+ require 'rake/rdoctask'
+ require 'rake/gempackagetask'
+
+
+
+ desc "Generate documentation"
+ Rake::RDocTask.new(:rdoc) do |rdoc|
+   rdoc.rdoc_dir = "rdoc"
+   rdoc.title = "Crawler"
+   rdoc.options << "--line-numbers"
+   rdoc.options << "--inline-source"
+   rdoc.rdoc_files.include("README")
+   rdoc.rdoc_files.include("lib/**/*.rb")
+ end
+
+ spec = Gem::Specification.new do |s|
+   s.name = "rcrawl"
+   s.version = "0.2.5"
+   s.author = "Shawn Hansen"
+   s.email = "shawn.hansen@gmail.com"
+   s.homepage = "http://blog.denomi.net"
+   s.platform = Gem::Platform::RUBY
+   s.summary = "A web crawler written in ruby"
+   s.files = FileList["{test,lib}/**/*", "README", "MIT-LICENSE", "Rakefile", "TODO"].to_a
+   s.require_path = "lib"
+   s.autorequire = "rcrawl.rb"
+   s.has_rdoc = true
+   s.extra_rdoc_files = ["README", "MIT-LICENSE", "TODO"]
+   s.add_dependency("scrapi", ">=1.2.0")
+   s.rubyforge_project = "rcrawl"
+ end
+
+ gem = Rake::GemPackageTask.new(spec) do |pkg|
+   pkg.need_tar = true
+   pkg.need_zip = true
+ end
data/TODO ADDED
@@ -0,0 +1 @@
+ Lots! TODO will be updated soon.
data/lib/rcrawl.rb ADDED
@@ -0,0 +1,177 @@
+ #!/usr/bin/env ruby
+ # rcrawl/0.2.0
+
+ require 'rubygems'
+ require 'open-uri'
+ require 'scrapi'
+ require 'robot_rules'
+
+ # Rcrawl will retrieve an entire website, one page at a time,
+ # parsing each page with whatever processing modules you plug in.
+ class Rcrawl
+
+   # Initializes various variables when a new Rcrawl object is instantiated
+   def initialize(site)
+     @links_to_visit = Array.new
+     @visited_links = Array.new
+     @external_links = Array.new
+     @raw_html = Hash.new
+     @rules = RobotRules.new("Rcrawl")
+     @sites = Hash.new
+     @site = URI.parse(site)
+     @links_to_visit << site
+     @errors = Hash.new
+     puts "Site is #{site}"
+   end
+
+   # Coordinates the whole crawling process
+   def crawl
+     until @links_to_visit.empty? do
+       begin
+         # Get the next link from the URL server
+         url_server
+         next unless robot_safe? @url
+         # Download the document (robots.txt has already been checked above)
+         fetch_http(@url)
+         # Store the raw HTML so it can be read/reread as needed,
+         # then call any processing modules needed for the current document
+         ris(@document)
+       rescue
+         puts ""
+         puts "I died on #{@url}"
+         $stderr.puts $!
+         @errors[@url] = $!
+         next
+       ensure
+         # Stuff you want to make sure gets printed out
+         puts " done!"
+       end
+     end
+
+     puts "Visited #{@visited_links.size} links."
+   end
+
+   # Pops the next URL off the authoritative list of URLs to be processed
+   def url_server
+     unless @links_to_visit.empty?
+       @url = @links_to_visit.pop
+     end
+   end
+
+   # Download the document
+   def fetch_http(url)
+     # robots.txt for this site has already been parsed and checked
+     # by robot_safe? before this method is called.
+     uri = URI.parse(url)
+     print "Visiting: #{url}"
+     @document = uri.read
+     @visited_links << url
+   end
+
+   # Rewind Input Stream, for storing and reading of raw HTML
+   def ris(document)
+     # Store raw HTML into local variable
+     # Based on MIME type, invoke the proper processing modules
+     if document.content_type == "text/html"
+       print "."
+       link_extractor(document) # If HTML
+       process_html(document)   # If HTML
+     else
+       print "... not HTML, skipping..."
+     end
+   end
+
+   # HTML processing module for extracting links
+   def link_extractor(document)
+     print "."
+
+     # Parse all links from HTML into an array
+     # Set up the scrAPI scraper (http://labnotes.org)
+     links = Scraper.define do
+       array :urls
+       process "a[href]", :urls => "@href"
+       result :urls
+     end
+
+     urls = links.scrape(document)
+
+     urls.each { |url|
+       uri = URI.parse(url)
+
+       # Resolve relative links against the site URI if necessary
+       if uri.relative?
+         url = @site.merge(url).to_s
+         uri = URI.parse(url)
+       end
+
+       # Check the domain: keep the link if it is in the same domain, else record it as external
+       if uri.host != @site.host
+         @external_links << url
+         @external_links.uniq!
+         next
+       end
+
+       # Find out if we've seen this link already
+       if (@visited_links.include? url) || (@links_to_visit.include? url)
+         next
+       end
+
+       @links_to_visit << url
+     }
+
+   end
+
+   # HTML processing module for raw HTML storage
+   def process_html(document)
+     # Add link and raw HTML to a hash as key/value
+     # for later storage in a database
+     unless @raw_html.has_value?(document)
+       print "."
+       @raw_html[document.base_uri] = document
+     end
+   end
+
+   # robots.txt parsing
+   def robot_safe?(url)
+     uri = URI.parse(url)
+     location = "#{uri.host}:#{uri.port}"
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     unless @sites.include? location
+       @sites[location] = true
+
+       robot_url = "http://#{location}/robots.txt"
+       begin
+         robot_file = open(robot_url) { |page| page.read }
+       rescue
+         return true
+       end
+       @rules.parse(robot_url, robot_file)
+     end
+
+     @rules.allowed? url
+   end
+
+   # Returns array of links visited during crawl
+   def visited_links
+     return @visited_links
+   end
+
+   # Returns array of external links
+   def external_links
+     return @external_links
+   end
+
+   # Returns a hash where {key => URL, value => HTML} from all pages crawled
+   def dump
+     return @raw_html
+   end
+
+   # Returns a hash where {key => URL, value => "Error message"} from any
+   # errors encountered during the crawl
+   def errors
+     return @errors
+   end
+
+ end
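
The scrAPI scraper defined inside link_extractor above can be exercised on its own. The sketch below mirrors that method's logic against a made-up HTML snippet and a hypothetical site URI, showing how relative links are resolved and internal links separated from external ones.

    require 'rubygems'
    require 'scrapi'
    require 'uri'

    # Same scraper as Rcrawl#link_extractor: collect every href from anchor tags.
    links = Scraper.define do
      array :urls
      process "a[href]", :urls => "@href"
      result :urls
    end

    html = '<html><body>' +
           '<a href="/about">About</a>' +
           '<a href="http://other.example.org/">Elsewhere</a>' +
           '</body></html>'

    site = URI.parse("http://www.example.com/")

    links.scrape(html).each do |url|
      uri = URI.parse(url)
      # Resolve relative links against the site URI, as link_extractor does.
      uri = site.merge(url) if uri.relative?
      kind = (uri.host == site.host) ? "internal" : "external"
      puts "#{url} -> #{uri} (#{kind})"
    end
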
data/lib/robot_rules.rb ADDED
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+
+ # robot_rules.rb
+ #
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+ # Included with rcrawl by permission from James Edward Gray II
+
+ require "uri"
+
+ # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+ class RobotRules
+   def initialize( user_agent )
+     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                    "").downcase
+     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+   end
+
+   def parse( text_uri, robots_data )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     @rules.delete(location)
+
+     rules = robots_data.split(/[\015\012]+/).
+                         map { |rule| rule.sub(/\s*#.*$/, "") }
+     anon_rules = Array.new
+     my_rules = Array.new
+     current = anon_rules
+     rules.each do |rule|
+       case rule
+       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+         break unless my_rules.empty?
+
+         current = if $1 == "*"
+           anon_rules
+         elsif $1.downcase.index(@user_agent)
+           my_rules
+         else
+           nil
+         end
+       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+         next if current.nil?
+
+         if $1.empty?
+           current << nil
+         else
+           disallow = URI.parse($1)
+
+           next unless disallow.scheme.nil? or disallow.scheme ==
+                       uri.scheme
+           next unless disallow.port.nil? or disallow.port == uri.port
+           next unless disallow.host.nil? or
+                       disallow.host.downcase == uri.host.downcase
+
+           disallow = disallow.path
+           disallow = "/" if disallow.empty?
+           disallow = "/#{disallow}" unless disallow[0] == ?/
+
+           current << disallow
+         end
+       end
+     end
+
+     @rules[location] = if my_rules.empty?
+       anon_rules.compact
+     else
+       my_rules.compact
+     end
+   end
+
+   def allowed?( text_uri )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     path = uri.path
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     not @rules[location].any? { |rule| path.index(rule) == 0 }
+   end
+ end
+
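
RobotRules is driven by Rcrawl#robot_safe?, which fetches /robots.txt once per host, feeds it to parse, and then asks allowed? for each URL. The sketch below does the same by hand; the host and robots.txt content are invented for illustration.

    require 'rubygems'
    require 'robot_rules'

    rules = RobotRules.new("Rcrawl")

    # A made-up robots.txt, as robot_safe? would fetch it from the site.
    robots_txt = <<-ROBOTS
    User-Agent: *
    Disallow: /private
    ROBOTS

    rules.parse("http://www.example.com/robots.txt", robots_txt)

    puts rules.allowed?("http://www.example.com/index.html")    # => true
    puts rules.allowed?("http://www.example.com/private/page")  # => false
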
metadata ADDED
@@ -0,0 +1,61 @@
+ --- !ruby/object:Gem::Specification
+ rubygems_version: 0.9.0
+ specification_version: 1
+ name: rcrawl
+ version: !ruby/object:Gem::Version
+   version: 0.2.5
+ date: 2006-09-20 00:00:00 -05:00
+ summary: A web crawler written in ruby
+ require_paths:
+ - lib
+ email: shawn.hansen@gmail.com
+ homepage: http://blog.denomi.net
+ rubyforge_project: rcrawl
+ description:
+ autorequire: rcrawl.rb
+ default_executable:
+ bindir: bin
+ has_rdoc: true
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
+   requirements:
+   - - ">"
+     - !ruby/object:Gem::Version
+       version: 0.0.0
+   version:
+ platform: ruby
+ signing_key:
+ cert_chain:
+ post_install_message:
+ authors:
+ - Shawn Hansen
+ files:
+ - lib/rcrawl.rb
+ - lib/robot_rules.rb
+ - README
+ - MIT-LICENSE
+ - Rakefile
+ - TODO
+ test_files: []
+
+ rdoc_options: []
+
+ extra_rdoc_files:
+ - README
+ - MIT-LICENSE
+ - TODO
+ executables: []
+
+ extensions: []
+
+ requirements: []
+
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: scrapi
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Version::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.0
+     version: