rcrawl 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README +3 -0
- data/Rakefile +1 -1
- data/TODO +7 -1
- data/lib/rcrawl/crawler.rb +5 -4
- metadata +2 -2
data/README
CHANGED
@@ -43,6 +43,9 @@ The structure of the crawling process was inspired by the specs of the Mercator
|
|
43
43
|
# Returns an array of external links
|
44
44
|
crawler.external_links
|
45
45
|
|
46
|
+
# Set user agent
|
47
|
+
crawler.user_agent = "Your fancy crawler name here"
|
48
|
+
|
46
49
|
== License
|
47
50
|
Copyright © 2006 Digital Duckies, LLC, under MIT License
|
48
51
|
|
data/Rakefile
CHANGED
data/TODO
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Rcrawl
|
2
2
|
|
3
3
|
class Crawler
|
4
|
-
|
5
|
-
attr_accessor :links_to_visit, :site
|
4
|
+
|
5
|
+
attr_accessor :links_to_visit, :site, :user_agent
|
6
6
|
attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
|
7
7
|
:errors
|
8
8
|
# Initializes various variables when a new Crawler object is instantiated
|
@@ -12,7 +12,8 @@ module Rcrawl
|
|
12
12
|
@visited_links = Array.new
|
13
13
|
@external_links = Array.new
|
14
14
|
@raw_html = Hash.new
|
15
|
-
@rules = RobotRules.new(
|
15
|
+
@rules = RobotRules.new('Rcrawl')
|
16
|
+
@user_agent = "Rcrawl/#{VERSION} (http://rubyforge.org/projects/rcrawl/)"
|
16
17
|
@sites = Hash.new
|
17
18
|
@errors = Hash.new
|
18
19
|
@site = URI.parse(site) || raise("You didn't give me a site to crawl")
|
@@ -64,7 +65,7 @@ module Rcrawl
|
|
64
65
|
# if not, parse robots.txt then grab document.
|
65
66
|
uri = URI.parse(url)
|
66
67
|
print "Visiting: #{url}"
|
67
|
-
@document = uri.read
|
68
|
+
@document = uri.read("User-Agent" => @user_agent, "Referer" => url)
|
68
69
|
@visited_links << url
|
69
70
|
end
|
70
71
|
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rcrawl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.6
|
7
|
-
date: 2006-09-
|
6
|
+
version: 0.4.7
|
7
|
+
date: 2006-09-27 00:00:00 -05:00
|
8
8
|
summary: A web crawler written in ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|