rcrawl 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7)
  1. data/MIT-LICENSE +20 -0
  2. data/README +22 -0
  3. data/Rakefile +39 -0
  4. data/TODO +1 -0
  5. data/lib/rcrawl.rb +177 -0
  6. data/lib/robot_rules.rb +81 -0
  7. metadata +61 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2006 Shawn Hansen
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
+ Rcrawl is a web crawler written entirely in Ruby.
+ It is currently limited to staying on the domain of the original URL it is given.
+ I decided to roll my own crawler in Ruby after finding only snippets of
+ crawler code for Ruby on various web sites and newsgroups.
+
+ The structure of the crawling process was inspired by the specs of the Mercator crawler (http://www.cindoc.csic.es/cybermetrics/pdf/68.pdf).
+
+ == Examples
+ bot = Rcrawl.new(url) # Instantiates a new Rcrawl object
+
+ bot.crawl # Crawls the website
+
+ == After the bot is done crawling
+ bot.visited_links # Returns an array of visited links
+
+ bot.dump # Returns a hash where the key is a URL and the value is
+ # the raw HTML from that URL
+
+ bot.errors # Returns a hash where the key is a URL and the value is
+ # the error message recorded for that URL
+
+ bot.external_links # Returns an array of external links
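Taken together, the calls documented in the README amount to a short end-to-end script along the following lines. This is a minimal sketch; the URL is a placeholder, and the trailing comments simply restate the accessors listed above.

  require 'rubygems'
  require 'rcrawl'

  bot = Rcrawl.new("http://www.example.com/") # placeholder URL; the crawl stays on this domain
  bot.crawl

  bot.visited_links   # => array of URLs that were fetched
  bot.external_links  # => array of off-domain URLs that were recorded but not followed
  bot.dump            # => { url => raw_html } for each crawled page
  bot.errors          # => { url => error_message } for pages that failed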
data/Rakefile ADDED
@@ -0,0 +1,39 @@
+ require 'rubygems'
+ Gem::manage_gems
+ require 'rake'
+ require 'rake/rdoctask'
+ require 'rake/gempackagetask'
+
+
+
+ desc "Generate documentation"
+ Rake::RDocTask.new(:rdoc) do |rdoc|
+   rdoc.rdoc_dir = "rdoc"
+   rdoc.title = "Crawler"
+   rdoc.options << "--line-numbers"
+   rdoc.options << "--inline-source"
+   rdoc.rdoc_files.include("README")
+   rdoc.rdoc_files.include("lib/**/*.rb")
+ end
+
+ spec = Gem::Specification.new do |s|
+   s.name = "rcrawl"
+   s.version = "0.2.5"
+   s.author = "Shawn Hansen"
+   s.email = "shawn.hansen@gmail.com"
+   s.homepage = "http://blog.denomi.net"
+   s.platform = Gem::Platform::RUBY
+   s.summary = "A web crawler written in ruby"
+   s.files = FileList["{test,lib}/**/*", "README", "MIT-LICENSE", "Rakefile", "TODO"].to_a
+   s.require_path = "lib"
+   s.autorequire = "rcrawl.rb"
+   s.has_rdoc = true
+   s.extra_rdoc_files = ["README", "MIT-LICENSE", "TODO"]
+   s.add_dependency("scrapi", ">=1.2.0")
+   s.rubyforge_project = "rcrawl"
+ end
+
+ gem = Rake::GemPackageTask.new(spec) do |pkg|
+   pkg.need_tar = true
+   pkg.need_zip = true
+ end
data/TODO ADDED
@@ -0,0 +1 @@
+ Lots! TODO will be updated soon.
data/lib/rcrawl.rb ADDED
@@ -0,0 +1,177 @@
+ #!/usr/bin/env ruby
+ # rcrawl/0.2.5
+
+ require 'rubygems'
+ require 'open-uri'
+ require 'scrapi'
+ require 'robot_rules'
+
+ # Rcrawl will retrieve an entire website, one page at a time,
+ # parsing each page with whatever processing modules you pass to it.
+ class Rcrawl
+
+   # Initializes various variables when a new Rcrawl object is instantiated
+   def initialize(site)
+     @links_to_visit = Array.new
+     @visited_links = Array.new
+     @external_links = Array.new
+     @raw_html = Hash.new
+     @rules = RobotRules.new("Rcrawl")
+     @sites = Hash.new
+     @site = URI.parse(site)
+     @links_to_visit << site
+     @errors = Hash.new
+     puts "Site is #{site}"
+   end
+
+   # Coordinates the whole crawling process
+   def crawl
+     until @links_to_visit.empty? do
+       begin
+         # Get link
+         url_server
+         next unless robot_safe? @url
+         # Parse robots.txt, then download document if robot_safe
+         fetch_http(@url)
+         # Store raw HTML in variable to read/reread as needed
+         # Then call any processing modules you need for the current document
+         ris(@document)
+       rescue
+         puts ""
+         puts "I died on #{@url}"
+         $stderr.puts $!
+         @errors[@url] = $!
+         next
+       ensure
+         # Stuff you want to make sure gets printed out
+         puts " done!"
+       end
+     end
+
+     puts "Visited #{@visited_links.size} links."
+   end
+
+   # Authoritative list of URLs to be processed by Rcrawl
+   def url_server
+     unless @links_to_visit.empty?
+       @url = @links_to_visit.pop
+     end
+   end
+
+   # Download the document
+   def fetch_http(url)
+     # Make sure robots.txt has been parsed for this site first,
+     # if not, parse robots.txt then grab document.
+     uri = URI.parse(url)
+     print "Visiting: #{url}"
+     @document = uri.read
+     @visited_links << url
+   end
+
+   # Rewind Input Stream, for storing and reading of raw HTML
+   def ris(document)
+     # Store raw HTML into local variable
+     # Based on MIME type, invoke the proper processing modules
+     if document.content_type == "text/html"
+       print "."
+       link_extractor(document) # If HTML
+       process_html(document) # If HTML
+     else
+       print "... not HTML, skipping..."
+     end
+   end
+
+   # HTML processing module for extracting links
+   def link_extractor(document)
+     print "."
+
+     # Parse all links from HTML into an array
+     # Set up the scrAPI (http://labnotes.org)
+     links = Scraper.define do
+       array :urls
+       process "a[href]", :urls => "@href"
+       result :urls
+     end
+
+     urls = links.scrape(document)
+
+     urls.each { |url|
+       uri = URI.parse(url)
+
+       # Make relative links absolute if necessary
+       if uri.relative?
+         url = @site.merge(url).to_s
+         uri = URI.parse(url)
+       end
+
+       # Check the domain; keep the link if it is on the same host, otherwise record it as external
+       if uri.host != @site.host
+         @external_links << url
+         @external_links.uniq!
+         next
+       end
+
+       # Find out if we've seen this link already
+       if (@visited_links.include? url) || (@links_to_visit.include? url)
+         next
+       end
+
+       @links_to_visit << url
+     }
+
+   end
+
+   # HTML processing module for raw HTML storage
+   def process_html(document)
+     # Add link and raw HTML to a hash as key/value
+     # for later storage in database
+     unless @raw_html.has_value?(document)
+       print "."
+       @raw_html[document.base_uri] = document
+     end
+   end
+
+   # robots.txt parsing
+   def robot_safe?(url)
+     uri = URI.parse(url)
+     location = "#{uri.host}:#{uri.port}"
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     unless @sites.include? location
+       @sites[location] = true
+
+       robot_url = "http://#{location}/robots.txt"
+       begin
+         robot_file = open(robot_url) { |page| page.read }
+       rescue
+         return true
+       end
+       @rules.parse(robot_url, robot_file)
+     end
+
+     @rules.allowed? url
+   end
+
+   # Returns array of links visited during crawl
+   def visited_links
+     return @visited_links
+   end
+
+   # Returns array of external links
+   def external_links
+     return @external_links
+   end
+
+   # Returns a hash where {key => URL, value => HTML} from all pages crawled
+   def dump
+     return @raw_html
+   end
+
+   # Returns a hash where {key => URL, value => "Error message"} from any
+   # errors encountered during the crawl
+   def errors
+     return @errors
+   end
+
+ end
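The comment in crawl about calling "any processing modules you need" suggests adding per-page processing by overriding the hooks above. A hypothetical sketch of that idea, reusing the same scrAPI extractor pattern as link_extractor; the ImageCrawler class, the img[src] scraper, and the URL are illustrative assumptions, not part of the gem:

  require 'rubygems'
  require 'rcrawl'
  require 'scrapi'

  class ImageCrawler < Rcrawl
    def initialize(site)
      super
      @image_urls = Hash.new
    end

    attr_reader :image_urls

    # ris() calls process_html for every text/html document it downloads.
    def process_html(document)
      super # keep the built-in raw-HTML store
      images = Scraper.define do
        array :srcs
        process "img[src]", :srcs => "@src"
        result :srcs
      end
      @image_urls[document.base_uri] = images.scrape(document) || []
    end
  end

  bot = ImageCrawler.new("http://www.example.com/") # placeholder URL
  bot.crawl
  bot.image_urls.each { |url, srcs| puts "#{url}: #{srcs.size} images" }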
data/lib/robot_rules.rb ADDED
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+
+ # robot_rules.rb
+ #
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+ # Included with rcrawl by permission from James Edward Gray II
+
+ require "uri"
+
+ # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+ class RobotRules
+   def initialize( user_agent )
+     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                    "").downcase
+     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+   end
+
+   def parse( text_uri, robots_data )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     @rules.delete(location)
+
+     rules = robots_data.split(/[\015\012]+/).
+                         map { |rule| rule.sub(/\s*#.*$/, "") }
+     anon_rules = Array.new
+     my_rules = Array.new
+     current = anon_rules
+     rules.each do |rule|
+       case rule
+       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+         break unless my_rules.empty?
+
+         current = if $1 == "*"
+                     anon_rules
+                   elsif $1.downcase.index(@user_agent)
+                     my_rules
+                   else
+                     nil
+                   end
+       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+         next if current.nil?
+
+         if $1.empty?
+           current << nil
+         else
+           disallow = URI.parse($1)
+
+           next unless disallow.scheme.nil? or disallow.scheme ==
+                       uri.scheme
+           next unless disallow.port.nil? or disallow.port == uri.port
+           next unless disallow.host.nil? or
+                       disallow.host.downcase == uri.host.downcase
+
+           disallow = disallow.path
+           disallow = "/" if disallow.empty?
+           disallow = "/#{disallow}" unless disallow[0] == ?/
+
+           current << disallow
+         end
+       end
+     end
+
+     @rules[location] = if my_rules.empty?
+                          anon_rules.compact
+                        else
+                          my_rules.compact
+                        end
+   end
+
+   def allowed?( text_uri )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     path = uri.path
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     not @rules[location].any? { |rule| path.index(rule) == 0 }
+   end
+ end
+
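For illustration, RobotRules can also be exercised on its own. A minimal sketch, using parse and allowed? as defined above; the robots.txt body is made up and www.example.com is a placeholder host:

  require 'robot_rules'

  rules = RobotRules.new("Rcrawl")
  robots_txt = <<-ROBOTS
  User-Agent: *
  Disallow: /private
  ROBOTS

  rules.parse("http://www.example.com/robots.txt", robots_txt)
  rules.allowed?("http://www.example.com/index.html")     # => true
  rules.allowed?("http://www.example.com/private/a.html") # => false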
metadata ADDED
@@ -0,0 +1,61 @@
+ --- !ruby/object:Gem::Specification
+ rubygems_version: 0.9.0
+ specification_version: 1
+ name: rcrawl
+ version: !ruby/object:Gem::Version
+   version: 0.2.5
+ date: 2006-09-20 00:00:00 -05:00
+ summary: A web crawler written in ruby
+ require_paths:
+ - lib
+ email: shawn.hansen@gmail.com
+ homepage: http://blog.denomi.net
+ rubyforge_project: rcrawl
+ description:
+ autorequire: rcrawl.rb
+ default_executable:
+ bindir: bin
+ has_rdoc: true
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
+   requirements:
+   - - ">"
+     - !ruby/object:Gem::Version
+       version: 0.0.0
+   version:
+ platform: ruby
+ signing_key:
+ cert_chain:
+ post_install_message:
+ authors:
+ - Shawn Hansen
+ files:
+ - lib/rcrawl.rb
+ - lib/robot_rules.rb
+ - README
+ - MIT-LICENSE
+ - Rakefile
+ - TODO
+ test_files: []
+
+ rdoc_options: []
+
+ extra_rdoc_files:
+ - README
+ - MIT-LICENSE
+ - TODO
+ executables: []
+
+ extensions: []
+
+ requirements: []
+
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: scrapi
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Version::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.0
+     version: