rcrawl 0.3.5 → 0.4.5

data/README CHANGED
@@ -32,7 +32,7 @@ The structure of the crawling process was inspired by the specs of the Mercator
 
  # Returns a hash where the key is a url and the value is
  # the raw html from that url
- crawler.dump
+ crawler.raw_html
 
 
  # Returns a hash where the key is a URL and the value is
@@ -48,4 +48,4 @@ Copyright © 2006 Digital Duckies, LLC, under MIT License
 
  Developed for http://digitalduckies.net
 
- News, code, and documentation at http://digitalduckies.net
+ News, code, and documentation at http://blog.digitalduckies.net
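
For context, a minimal usage sketch of the renamed reader (not part of the diff; the target URL is only an illustration):

    require 'rcrawl'

    crawler = Rcrawl::Crawler.new("http://digitalduckies.net")
    crawler.crawl

    # raw_html replaces the old dump method: a hash of URL => raw HTML
    crawler.raw_html.each do |url, html|
      puts "#{url}: #{html.length} bytes"
    end

    # errors maps each URL that failed to the error raised while fetching it
    crawler.errors.each { |url, error| puts "#{url} failed: #{error}" }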
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ end
 
  spec = Gem::Specification.new do |s|
    s.name = "rcrawl"
-   s.version = "0.3.5"
+   s.version = "0.4.5"
    s.author = "Digital Duckies"
    s.email = "rcrawl@digitalduckies.net"
    s.homepage = "http://digitalduckies.net"
data/lib/rcrawl/crawler.rb CHANGED
@@ -1,4 +1,155 @@
  module Rcrawl
+
    class Crawler
-   end
- end
+
+     attr_accessor :links_to_visit, :site
+     attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
+                 :errors
+     # Initializes various variables when a new Crawler object is instantiated
+     def initialize(site)
+       puts "Rcrawl Version #{VERSION} initializing..."
+       @links_to_visit = Array.new
+       @visited_links = Array.new
+       @external_links = Array.new
+       @raw_html = Hash.new
+       @rules = RobotRules.new("rcrawl/#{VERSION}")
+       @sites = Hash.new
+       @errors = Hash.new
+       @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+       @links_to_visit << site
+       puts "Ready to crawl #{site}"
+     end
+
+     # Coordinates the whole crawling process
+     def crawl
+       until @links_to_visit.empty? do
+         begin
+           # Get link
+           url_server
+           next unless robot_safe? @url
+           # Parse robots.txt, then download document if robot_safe
+           fetch_http(@url)
+           # Store raw HTML in variable to read/reread as needed
+           # Then call any processing modules you need for the current document
+           ris(@document)
+         rescue
+           puts ""
+           puts "I died on #{@url}"
+           $stderr.puts $!
+           @errors[@url] = $!
+           next
+         ensure
+           # Stuff you want to make sure gets printed out
+           puts " done!"
+         end
+       end
+
+       puts "Visited #{@visited_links.size} links."
+     end
+
+     # Authoritative list of URLs to be processed by Rcrawl
+     def url_server
+       unless @links_to_visit.empty?
+         @url = @links_to_visit.pop
+       end
+     end
+
+     # Download the document
+     def fetch_http(url)
+       # Make sure robots.txt has been parsed for this site first,
+       # if not, parse robots.txt then grab document.
+       uri = URI.parse(url)
+       print "Visiting: #{url}"
+       @document = uri.read
+       @visited_links << url
+     end
+
+     # Rewind Input Stream, for storing and reading of raw HTML
+     def ris(document)
+       print "."
+       # Store raw HTML into local variable
+       # Based on MIME type, invoke the proper processing modules
+       case document.content_type
+       when "text/html"
+         link_extractor(document)
+         process_html(document)
+       else
+         print "... not HTML, skipping..."
+       end
+     end
+
+     # HTML processing module for extracting links
+     def link_extractor(document)
+       print "."
+       # Parse all links from HTML into an array
+       # Set up the scrAPI (http://labnotes.org)
+       links = Scraper.define do
+         array :urls
+         process "a[href]", :urls => "@href"
+         result :urls
+       end
+
+       urls = links.scrape(document)
+
+       urls.each { |url|
+         uri = URI.parse(url)
+
+         # Derelativeize links if necessary
+         if uri.relative?
+           url = @site.merge(url).to_s
+           uri = URI.parse(url)
+         end
+
+         # Check domain, if in same domain, keep link, else trash it
+         if uri.host != @site.host
+           @external_links << url
+           @external_links.uniq!
+           next
+         end
+
+         # Find out if we've seen this link already
+         if (@visited_links.include? url) || (@links_to_visit.include? url)
+           next
+         end
+
+         @links_to_visit << url
+       }
+     end
+
+     # HTML processing module for raw HTML storage
+     def process_html(document)
+
+       # Add link and raw HTML to a hash as key/value
+       # for later storage in database
+       unless @raw_html.has_value?(document)
+         print "."
+         @raw_html[@document.base_uri] = document
+       end
+
+     end
+
+     # robots.txt parsing
+     def robot_safe?(url)
+       uri = URI.parse(url)
+       location = "#{uri.host}:#{uri.port}"
+
+       return true unless %w{http https}.include?(uri.scheme)
+
+       unless @sites.include? location
+         @sites[location] = true
+
+         robot_url = "http://#{location}/robots.txt"
+         begin
+           robot_file = open(robot_url) { |page| page.read }
+         rescue
+           return true
+         end
+         @rules.parse(robot_url, robot_file)
+       end
+
+       @rules.allowed? url
+     end
+
+   end # class Crawler
+
+ end # module Rcrawl
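
Because link_extractor and process_html are now instance methods on Rcrawl::Crawler rather than module functions in Rcrawl::Process::HTML, per-crawl processing can be customized by subclassing. A hypothetical sketch (TitleCrawler and the title regex are illustrative, not part of the gem):

    require 'rcrawl'

    # Hypothetical subclass: extend the storage hook to also report page titles.
    class TitleCrawler < Rcrawl::Crawler
      def process_html(document)
        super
        # document is the raw page string returned by open-uri's read
        title = document[/<title>(.*?)<\/title>/im, 1]
        print " [#{title.strip}]" if title
      end
    end

    TitleCrawler.new("http://digitalduckies.net").crawl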
data/lib/rcrawl/version.rb ADDED
@@ -0,0 +1,5 @@
+ module Rcrawl
+   class Crawler
+     VERSION = "0.4.5"
+   end
+ end
data/lib/rcrawl.rb CHANGED
@@ -4,128 +4,6 @@ require 'rubygems'
  require 'open-uri'
  require 'scrapi'
  require 'rcrawl/robot_rules'
- require 'rcrawl/process/html'
+ require 'rcrawl/crawler'
+ require 'rcrawl/version'
 
- module Rcrawl
-
-   # Crawler will retrieve an entire website, one page at a time,
-   # parsing the page using whatever modules you pass it to.
-   class Crawler
-
-     # Initializes various variables when a new Rcrawl object is instantiated
-     def initialize(site)
-       @links_to_visit = Array.new
-       @visited_links = Array.new
-       @external_links = Array.new
-       @raw_html = Hash.new
-       @rules = RobotRules.new("Rcrawl")
-       @sites = Hash.new
-       @site = URI.parse(site)
-       @links_to_visit << site
-       @errors = Hash.new
-       puts "Site is #{site}"
-     end
-
-     # Coordinates the whole crawling process
-     def crawl
-       until @links_to_visit.empty? do
-         begin
-           # Get link
-           url_server
-           next unless robot_safe? @url
-           # Parse robots.txt, then download document if robot_safe
-           fetch_http(@url)
-           # Store raw HTML in variable to read/reread as needed
-           # Then call any processing modules you need for the current document
-           ris(@document)
-         rescue
-           puts ""
-           puts "I died on #{@url}"
-           $stderr.puts $!
-           @errors[@url] = $!
-           next
-         ensure
-           # Stuff you want to make sure gets printed out
-           puts " done!"
-         end
-       end
-
-       puts "Visited #{@visited_links.size} links."
-     end
-
-     # Authoritative list of URLs to be processed by Rcrawl
-     def url_server
-       unless @links_to_visit.empty?
-         @url = @links_to_visit.pop
-       end
-     end
-
-     # Download the document
-     def fetch_http(url)
-       # Make sure robots.txt has been parsed for this site first,
-       # if not, parse robots.txt then grab document.
-       uri = URI.parse(url)
-       print "Visiting: #{url}"
-       @document = uri.read
-       @visited_links << url
-     end
-
-     # Rewind Input Stream, for storing and reading of raw HTML
-     def ris(document)
-       print "."
-       # Store raw HTML into local variable
-       # Based on MIME type, invoke the proper processing modules
-       case document.content_type
-       when "text/html"
-         Rcrawl::Process::HTML.link_extractor(document)
-         Rcrawl::Process::HTML.process_html(document)
-       else
-         print "... not HTML, skipping..."
-       end
-     end
-
-     # robots.txt parsing
-     def robot_safe?(url)
-       uri = URI.parse(url)
-       location = "#{uri.host}:#{uri.port}"
-
-       return true unless %w{http https}.include?(uri.scheme)
-
-       unless @sites.include? location
-         @sites[location] = true
-
-         robot_url = "http://#{location}/robots.txt"
-         begin
-           robot_file = open(robot_url) { |page| page.read }
-         rescue
-           return true
-         end
-         @rules.parse(robot_url, robot_file)
-       end
-
-       @rules.allowed? url
-     end
-
-     # Returns array of links visited during crawl
-     def visited_links
-       return @visited_links
-     end
-
-     # Returns array of external links
-     def external_links
-       return @external_links
-     end
-
-     # Returns a hash where {key => URL, value => HTML} from all pages crawled
-     def dump
-       return @raw_html
-     end
-
-     # Returns a hash where {key => URL, value => "Error message"} from any
-     # errors encountered during the crawl
-     def errors
-       return @errors
-     end
-
-   end
- end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
  specification_version: 1
  name: rcrawl
  version: !ruby/object:Gem::Version
-   version: 0.3.5
- date: 2006-09-23 00:00:00 -05:00
+   version: 0.4.5
+ date: 2006-09-26 00:00:00 -05:00
  summary: A web crawler written in ruby
  require_paths:
  - lib
@@ -31,10 +31,9 @@ authors:
  files:
  - lib/rcrawl.rb
  - lib/rcrawl
- - lib/rcrawl/crawler.rb
  - lib/rcrawl/robot_rules.rb
- - lib/rcrawl/process
- - lib/rcrawl/process/html.rb
+ - lib/rcrawl/crawler.rb
+ - lib/rcrawl/version.rb
  - README
  - MIT-LICENSE
  - Rakefile
data/lib/rcrawl/process/html.rb DELETED
@@ -1,61 +0,0 @@
- module Rcrawl
-
-   module Process
-
-     module HTML
-
-       # HTML processing module for extracting links
-       def HTML.link_extractor(document)
-         print "."
-         # Parse all links from HTML into an array
-         # Set up the scrAPI (http://labnotes.org)
-         links = Scraper.define do
-           array :urls
-           process "a[href]", :urls => "@href"
-           result :urls
-         end
-
-         urls = links.scrape(document)
-
-         urls.each { |url|
-           uri = URI.parse(url)
-
-           # Derelativeize links if necessary
-           if uri.relative?
-             url = @site.merge(url).to_s
-             uri = URI.parse(url)
-           end
-
-           # Check domain, if in same domain, keep link, else trash it
-           if uri.host != @site.host
-             @external_links << url
-             @external_links.uniq!
-             next
-           end
-
-           # Find out if we've seen this link already
-           if (@visited_links.include? url) || (@links_to_visit.include? url)
-             next
-           end
-
-           @links_to_visit << url
-         }
-       end
-
-       # HTML processing module for raw HTML storage
-       def HTML.process_html(document)
-
-         # Add link and raw HTML to a hash as key/value
-         # for later storage in database
-         unless @raw_html.has_value?(document)
-           print "."
-           @raw_html[document.base_uri] = document
-         end
-
-       end
-
-     end
-
-   end
-
- end