rcrawl 0.4.6 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +3 -0
- data/Rakefile +1 -1
- data/TODO +7 -1
- data/lib/rcrawl/crawler.rb +5 -4
- metadata +2 -2
data/README
CHANGED
@@ -43,6 +43,9 @@ The structure of the crawling process was inspired by the specs of the Mercator
|
|
43
43
|
# Returns an array of external links
|
44
44
|
crawler.external_links
|
45
45
|
|
46
|
+
# Set user agent
|
47
|
+
crawler.user_agent = "Your fancy crawler name here"
|
48
|
+
|
46
49
|
== License
|
47
50
|
Copyright © 2006 Digital Duckies, LLC, under MIT License
|
48
51
|
|
data/Rakefile
CHANGED
data/TODO
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Rcrawl
|
2
2
|
|
3
3
|
class Crawler
|
4
|
-
|
5
|
-
attr_accessor :links_to_visit, :site
|
4
|
+
|
5
|
+
attr_accessor :links_to_visit, :site, :user_agent
|
6
6
|
attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
|
7
7
|
:errors
|
8
8
|
# Initializes various variables when a new Crawler object is instantiated
|
@@ -12,7 +12,8 @@ module Rcrawl
|
|
12
12
|
@visited_links = Array.new
|
13
13
|
@external_links = Array.new
|
14
14
|
@raw_html = Hash.new
|
15
|
-
@rules = RobotRules.new(
|
15
|
+
@rules = RobotRules.new('Rcrawl')
|
16
|
+
@user_agent = "Rcrawl/#{VERSION} (http://rubyforge.org/projects/rcrawl/)"
|
16
17
|
@sites = Hash.new
|
17
18
|
@errors = Hash.new
|
18
19
|
@site = URI.parse(site) || raise("You didn't give me a site to crawl")
|
@@ -64,7 +65,7 @@ module Rcrawl
|
|
64
65
|
# if not, parse robots.txt then grab document.
|
65
66
|
uri = URI.parse(url)
|
66
67
|
print "Visiting: #{url}"
|
67
|
-
@document = uri.read
|
68
|
+
@document = uri.read("User-Agent" => @user_agent, "Referer" => url)
|
68
69
|
@visited_links << url
|
69
70
|
end
|
70
71
|
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rcrawl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.6
|
7
|
-
date: 2006-09-
|
6
|
+
version: 0.4.7
|
7
|
+
date: 2006-09-27 00:00:00 -05:00
|
8
8
|
summary: A web crawler written in ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|