rcrawl 0.3.5 → 0.4.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -32,7 +32,7 @@ The structure of the crawling process was inspired by the specs of the Mercator

  # Returns a hash where the key is a url and the value is
  # the raw html from that url
- crawler.dump
+ crawler.raw_html


  # Returns a hash where the key is a URL and the value is
@@ -48,4 +48,4 @@ Copyright © 2006 Digital Duckies, LLC, under MIT License

  Developed for http://digitalduckies.net

- News, code, and documentation at http://digitalduckies.net
+ News, code, and documentation at http://blog.digitalduckies.net
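
The README change above documents the renaming of the raw-HTML accessor from crawler.dump to crawler.raw_html. A minimal usage sketch of the 0.4.5 API follows; the target URL is illustrative, and it assumes the gem and its scrapi dependency are installed.

require 'rubygems'
require 'rcrawl'

# Crawl a site, then read back the downloaded pages keyed by URL.
crawler = Rcrawl::Crawler.new("http://digitalduckies.net")
crawler.crawl

# In 0.3.5 this hash was returned by crawler.dump; 0.4.5 exposes it as raw_html.
crawler.raw_html.each do |url, html|
  puts "#{url} (#{html.length} bytes)"
end
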
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ end

 spec = Gem::Specification.new do |s|
   s.name = "rcrawl"
-  s.version = "0.3.5"
+  s.version = "0.4.5"
   s.author = "Digital Duckies"
   s.email = "rcrawl@digitalduckies.net"
   s.homepage = "http://digitalduckies.net"
data/lib/rcrawl/crawler.rb CHANGED
@@ -1,4 +1,155 @@
 module Rcrawl
+
   class Crawler
-  end
-end
+
+    attr_accessor :links_to_visit, :site
+    attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
+                :errors
+    # Initializes various variables when a new Crawler object is instantiated
+    def initialize(site)
+      puts "Rcrawl Version #{VERSION} initializing..."
+      @links_to_visit = Array.new
+      @visited_links = Array.new
+      @external_links = Array.new
+      @raw_html = Hash.new
+      @rules = RobotRules.new("rcrawl/#{VERSION}")
+      @sites = Hash.new
+      @errors = Hash.new
+      @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+      @links_to_visit << site
+      puts "Ready to crawl #{site}"
+    end
+
+    # Coordinates the whole crawling process
+    def crawl
+      until @links_to_visit.empty? do
+        begin
+          # Get link
+          url_server
+          next unless robot_safe? @url
+          # Parse robots.txt, then download document if robot_safe
+          fetch_http(@url)
+          # Store raw HTML in variable to read/reread as needed
+          # Then call any processing modules you need for the current document
+          ris(@document)
+        rescue
+          puts ""
+          puts "I died on #{@url}"
+          $stderr.puts $!
+          @errors[@url] = $!
+          next
+        ensure
+          # Stuff you want to make sure gets printed out
+          puts " done!"
+        end
+      end
+
+      puts "Visited #{@visited_links.size} links."
+    end
+
+    # Authoritative list of URLs to be processed by Rcrawl
+    def url_server
+      unless @links_to_visit.empty?
+        @url = @links_to_visit.pop
+      end
+    end
+
+    # Download the document
+    def fetch_http(url)
+      # Make sure robots.txt has been parsed for this site first,
+      # if not, parse robots.txt then grab document.
+      uri = URI.parse(url)
+      print "Visiting: #{url}"
+      @document = uri.read
+      @visited_links << url
+    end
+
+    # Rewind Input Stream, for storing and reading of raw HTML
+    def ris(document)
+      print "."
+      # Store raw HTML into local variable
+      # Based on MIME type, invoke the proper processing modules
+      case document.content_type
+      when "text/html"
+        link_extractor(document)
+        process_html(document)
+      else
+        print "... not HTML, skipping..."
+      end
+    end
+
+    # HTML processing module for extracting links
+    def link_extractor(document)
+      print "."
+      # Parse all links from HTML into an array
+      # Set up the scrAPI (http://labnotes.org)
+      links = Scraper.define do
+        array :urls
+        process "a[href]", :urls => "@href"
+        result :urls
+      end
+
+      urls = links.scrape(document)
+
+      urls.each { |url|
+        uri = URI.parse(url)
+
+        # Derelativeize links if necessary
+        if uri.relative?
+          url = @site.merge(url).to_s
+          uri = URI.parse(url)
+        end
+
+        # Check domain, if in same domain, keep link, else trash it
+        if uri.host != @site.host
+          @external_links << url
+          @external_links.uniq!
+          next
+        end
+
+        # Find out if we've seen this link already
+        if (@visited_links.include? url) || (@links_to_visit.include? url)
+          next
+        end
+
+        @links_to_visit << url
+      }
+    end
+
+    # HTML processing module for raw HTML storage
+    def process_html(document)
+
+      # Add link and raw HTML to a hash as key/value
+      # for later storage in database
+      unless @raw_html.has_value?(document)
+        print "."
+        @raw_html[@document.base_uri] = document
+      end
+
+    end
+
+    # robots.txt parsing
+    def robot_safe?(url)
+      uri = URI.parse(url)
+      location = "#{uri.host}:#{uri.port}"
+
+      return true unless %w{http https}.include?(uri.scheme)
+
+      unless @sites.include? location
+        @sites[location] = true
+
+        robot_url = "http://#{location}/robots.txt"
+        begin
+          robot_file = open(robot_url) { |page| page.read }
+        rescue
+          return true
+        end
+        @rules.parse(robot_url, robot_file)
+      end
+
+      @rules.allowed? url
+    end
+
+  end # class Crawler
+
+end # module Rcrawl
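
In the rewritten Crawler above, the old accessor methods (visited_links, external_links, errors, and dump) are replaced by attr_reader declarations, so a finished crawl can be inspected directly on the object. A short sketch under that assumption; the URL is a placeholder.

require 'rubygems'
require 'rcrawl'

crawler = Rcrawl::Crawler.new("http://digitalduckies.net")
crawler.crawl

# visited_links and external_links are Arrays; errors maps a URL to the
# exception raised while fetching or processing it.
puts "Visited: #{crawler.visited_links.size}"
puts "External links skipped: #{crawler.external_links.size}"
crawler.errors.each { |url, error| puts "#{url} failed: #{error}" }
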
data/lib/rcrawl/version.rb ADDED
@@ -0,0 +1,5 @@
+module Rcrawl
+  class Crawler
+    VERSION = "0.4.5"
+  end
+end
data/lib/rcrawl.rb CHANGED
@@ -4,128 +4,6 @@ require 'rubygems'
 require 'open-uri'
 require 'scrapi'
 require 'rcrawl/robot_rules'
-require 'rcrawl/process/html'
+require 'rcrawl/crawler'
+require 'rcrawl/version'

-module Rcrawl
-
-  # Crawler will retrieve an entire website, one page at a time,
-  # parsing the page using whatever modules you pass it to.
-  class Crawler
-
-    # Initializes various variables when a new Rcrawl object is instantiated
-    def initialize(site)
-      @links_to_visit = Array.new
-      @visited_links = Array.new
-      @external_links = Array.new
-      @raw_html = Hash.new
-      @rules = RobotRules.new("Rcrawl")
-      @sites = Hash.new
-      @site = URI.parse(site)
-      @links_to_visit << site
-      @errors = Hash.new
-      puts "Site is #{site}"
-    end
-
-    # Coordinates the whole crawling process
-    def crawl
-      until @links_to_visit.empty? do
-        begin
-          # Get link
-          url_server
-          next unless robot_safe? @url
-          # Parse robots.txt, then download document if robot_safe
-          fetch_http(@url)
-          # Store raw HTML in variable to read/reread as needed
-          # Then call any processing modules you need for the current document
-          ris(@document)
-        rescue
-          puts ""
-          puts "I died on #{@url}"
-          $stderr.puts $!
-          @errors[@url] = $!
-          next
-        ensure
-          # Stuff you want to make sure gets printed out
-          puts " done!"
-        end
-      end
-
-      puts "Visited #{@visited_links.size} links."
-    end
-
-    # Authoritative list of URLs to be processed by Rcrawl
-    def url_server
-      unless @links_to_visit.empty?
-        @url = @links_to_visit.pop
-      end
-    end
-
-    # Download the document
-    def fetch_http(url)
-      # Make sure robots.txt has been parsed for this site first,
-      # if not, parse robots.txt then grab document.
-      uri = URI.parse(url)
-      print "Visiting: #{url}"
-      @document = uri.read
-      @visited_links << url
-    end
-
-    # Rewind Input Stream, for storing and reading of raw HTML
-    def ris(document)
-      print "."
-      # Store raw HTML into local variable
-      # Based on MIME type, invoke the proper processing modules
-      case document.content_type
-      when "text/html"
-        Rcrawl::Process::HTML.link_extractor(document)
-        Rcrawl::Process::HTML.process_html(document)
-      else
-        print "... not HTML, skipping..."
-      end
-    end
-
-    # robots.txt parsing
-    def robot_safe?(url)
-      uri = URI.parse(url)
-      location = "#{uri.host}:#{uri.port}"
-
-      return true unless %w{http https}.include?(uri.scheme)
-
-      unless @sites.include? location
-        @sites[location] = true
-
-        robot_url = "http://#{location}/robots.txt"
-        begin
-          robot_file = open(robot_url) { |page| page.read }
-        rescue
-          return true
-        end
-        @rules.parse(robot_url, robot_file)
-      end
-
-      @rules.allowed? url
-    end
-
-    # Returns array of links visited during crawl
-    def visited_links
-      return @visited_links
-    end
-
-    # Returns array of external links
-    def external_links
-      return @external_links
-    end
-
-    # Returns a hash where {key => URL, value => HTML} from all pages crawled
-    def dump
-      return @raw_html
-    end
-
-    # Returns a hash where {key => URL, value => "Error message"} from any
-    # errors encountered during the crawl
-    def errors
-      return @errors
-    end
-
-  end
-end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rcrawl
 version: !ruby/object:Gem::Version
-  version: 0.3.5
-date: 2006-09-23 00:00:00 -05:00
+  version: 0.4.5
+date: 2006-09-26 00:00:00 -05:00
 summary: A web crawler written in ruby
 require_paths:
 - lib
@@ -31,10 +31,9 @@ authors:
 files:
 - lib/rcrawl.rb
 - lib/rcrawl
-- lib/rcrawl/crawler.rb
 - lib/rcrawl/robot_rules.rb
-- lib/rcrawl/process
-- lib/rcrawl/process/html.rb
+- lib/rcrawl/crawler.rb
+- lib/rcrawl/version.rb
 - README
 - MIT-LICENSE
 - Rakefile
data/lib/rcrawl/process/html.rb DELETED
@@ -1,61 +0,0 @@
-module Rcrawl
-
-  module Process
-
-    module HTML
-
-      # HTML processing module for extracting links
-      def HTML.link_extractor(document)
-        print "."
-        # Parse all links from HTML into an array
-        # Set up the scrAPI (http://labnotes.org)
-        links = Scraper.define do
-          array :urls
-          process "a[href]", :urls => "@href"
-          result :urls
-        end
-
-        urls = links.scrape(document)
-
-        urls.each { |url|
-          uri = URI.parse(url)
-
-          # Derelativeize links if necessary
-          if uri.relative?
-            url = @site.merge(url).to_s
-            uri = URI.parse(url)
-          end
-
-          # Check domain, if in same domain, keep link, else trash it
-          if uri.host != @site.host
-            @external_links << url
-            @external_links.uniq!
-            next
-          end
-
-          # Find out if we've seen this link already
-          if (@visited_links.include? url) || (@links_to_visit.include? url)
-            next
-          end
-
-          @links_to_visit << url
-        }
-      end
-
-      # HTML processing module for raw HTML storage
-      def HTML.process_html(document)
-
-        # Add link and raw HTML to a hash as key/value
-        # for later storage in database
-        unless @raw_html.has_value?(document)
-          print "."
-          @raw_html[document.base_uri] = document
-        end
-
-      end
-
-    end
-
-  end
-
-end