cobweb 0.0.57 → 0.0.58
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +3 -1
- data/lib/cobweb.rb +23 -2
- data/lib/cobweb_crawler.rb +1 -0
- data/lib/cobweb_links.rb +9 -0
- data/lib/cobweb_version.rb +1 -1
- data/lib/robots.rb +71 -3
- data/lib/server.rb +0 -1
- data/spec/cobweb/cobweb_crawler_spec.rb +4 -4
- data/spec/cobweb/cobweb_job_spec.rb +0 -10
- data/spec/cobweb/robots_spec.rb +70 -0
- data/spec/samples/robots.txt +294 -0
- data/spec/samples/sample_site/robots.txt +13 -0
- data/spec/spec_helper.rb +25 -0
- metadata +25 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.
|
2
|
+
h1. Cobweb v0.0.58
|
3
3
|
!https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
|
4
4
|
|
5
5
|
h2. Intro
|
@@ -41,6 +41,8 @@ h3. Data Returned
|
|
41
41
|
* :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
|
42
42
|
* :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
|
43
43
|
* :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
|
44
|
+
* :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
|
45
|
+
* :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
44
46
|
|
45
47
|
The source for the links can be overridden; contact me for the syntax (I don't have time to put it into this documentation, but will as soon as I have time!)
|
46
48
|
|
data/lib/cobweb.rb
CHANGED
@@ -43,6 +43,9 @@ class Cobweb
|
|
43
43
|
default_redis_options_to Hash.new
|
44
44
|
default_internal_urls_to []
|
45
45
|
default_first_page_redirect_internal_to true
|
46
|
+
default_text_mime_types_to ["text/*", "application/xhtml+xml"]
|
47
|
+
default_obey_robots_to false
|
48
|
+
default_user_agent_to "cobweb"
|
46
49
|
|
47
50
|
end
|
48
51
|
|
@@ -177,7 +180,7 @@ class Cobweb
|
|
177
180
|
content[:character_set] = charset
|
178
181
|
end
|
179
182
|
content[:length] = response.content_length
|
180
|
-
if
|
183
|
+
if text_content?(content[:mime_type])
|
181
184
|
if response["Content-Encoding"]=="gzip"
|
182
185
|
content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
|
183
186
|
else
|
@@ -384,5 +387,23 @@ class Cobweb
|
|
384
387
|
content
|
385
388
|
end
|
386
389
|
|
387
|
-
end
|
390
|
+
end
|
391
|
+
|
392
|
+
private
|
393
|
+
# checks if the mime_type is textual
|
394
|
+
def text_content?(content_type)
|
395
|
+
@options[:text_mime_types].each do |mime_type|
|
396
|
+
return true if content_type.match(escape_pattern_for_regex(mime_type))
|
397
|
+
end
|
398
|
+
false
|
399
|
+
end
|
400
|
+
|
401
|
+
# escapes characters with meaning in regular expressions and adds wildcard expression
|
402
|
+
def escape_pattern_for_regex(pattern)
|
403
|
+
pattern = pattern.gsub(".", "\\.")
|
404
|
+
pattern = pattern.gsub("?", "\\?")
|
405
|
+
pattern = pattern.gsub("*", ".*?")
|
406
|
+
pattern
|
407
|
+
end
|
408
|
+
|
388
409
|
end
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -64,6 +64,7 @@ class CobwebCrawler
|
|
64
64
|
@redis.incr "crawl-counter"
|
65
65
|
|
66
66
|
internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
|
67
|
+
ap internal_links
|
67
68
|
|
68
69
|
# select the link if it's internal (eliminate external before expensive lookups in queued and crawled)
|
69
70
|
cobweb_links = CobwebLinks.new(@options)
|
data/lib/cobweb_links.rb
CHANGED
@@ -17,6 +17,15 @@ class CobwebLinks
|
|
17
17
|
|
18
18
|
end
|
19
19
|
|
20
|
+
def allowed?(link)
|
21
|
+
if @options[:obey_robots]
|
22
|
+
robot = Robots.new(:url => link, :user_agent => @options[:user_agent])
|
23
|
+
return robot.allowed?(link)
|
24
|
+
else
|
25
|
+
return true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
20
29
|
# Returns true if the link is matched to an internal_url and not matched to an external_url
|
21
30
|
def internal?(link)
|
22
31
|
if @options[:debug]
|
data/lib/cobweb_version.rb
CHANGED
data/lib/robots.rb
CHANGED
@@ -2,10 +2,78 @@
|
|
2
2
|
class Robots
|
3
3
|
|
4
4
|
# Processes the robots.txt file
|
5
|
-
def initialize(
|
5
|
+
def initialize(options)
|
6
|
+
@options = options
|
7
|
+
raise "options should be a hash" unless options.kind_of? Hash
|
8
|
+
raise ":url is required" unless @options.has_key? :url
|
9
|
+
@options[:file] = "robots.txt" unless @options.has_key? :file
|
10
|
+
@options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
|
11
|
+
|
12
|
+
uri = URI.parse(@options[:url])
|
13
|
+
content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
|
14
|
+
if content[:mime_type][0..4] == "text/"
|
15
|
+
@raw_data = parse_data(content[:body])
|
16
|
+
|
17
|
+
if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
|
18
|
+
@params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
|
19
|
+
else
|
20
|
+
raise "Wildcard user-agent is not present" unless @raw_data.has_key? :*
|
21
|
+
@params = @raw_data[:*]
|
22
|
+
end
|
23
|
+
else
|
24
|
+
raise "Invalid mime type: #{content[:content_type]}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def allowed?(url)
|
6
29
|
uri = URI.parse(url)
|
7
|
-
[
|
8
|
-
|
30
|
+
@params[:allow].each do |pattern|
|
31
|
+
return true if uri.path.match(escape_pattern_for_regex(pattern))
|
32
|
+
end
|
33
|
+
@params[:disallow].each do |pattern|
|
34
|
+
return false if uri.path.match(escape_pattern_for_regex(pattern))
|
35
|
+
end
|
36
|
+
true
|
37
|
+
end
|
38
|
+
|
39
|
+
def user_agent_settings
|
40
|
+
@params
|
41
|
+
end
|
42
|
+
|
43
|
+
def contents
|
44
|
+
@raw_data
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
# escapes characters with meaning in regular expressions and adds wildcard expression
|
49
|
+
def escape_pattern_for_regex(pattern)
|
50
|
+
pattern = pattern.gsub(".", "\\.")
|
51
|
+
pattern = pattern.gsub("?", "\\?")
|
52
|
+
pattern = pattern.gsub("*", ".*?")
|
53
|
+
pattern
|
54
|
+
end
|
55
|
+
|
56
|
+
def parse_data(data)
|
57
|
+
user_agents = {}
|
58
|
+
lines = data.split("\n")
|
59
|
+
lines.map!{|line| line.strip}
|
60
|
+
lines.reject!{|line| line == "" || line[0] == "#"}
|
61
|
+
current_user_agent = nil
|
9
62
|
|
63
|
+
lines.each do |line|
|
64
|
+
if line[0..10].downcase == "user-agent:"
|
65
|
+
current_user_agent = line.split(":")[1..-1].join.downcase.strip.to_sym
|
66
|
+
user_agents[current_user_agent] = {:allow => [], :disallow => []}
|
67
|
+
else
|
68
|
+
if current_user_agent
|
69
|
+
values = line.split(":")
|
70
|
+
unless values[1..-1].join.strip == ""
|
71
|
+
user_agents[current_user_agent][values[0].downcase.strip.to_sym] = [] unless user_agents[current_user_agent].has_key? values[0].downcase.to_sym
|
72
|
+
user_agents[current_user_agent][values[0].downcase.strip.to_sym] << values[1..-1].join.strip
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
user_agents
|
10
78
|
end
|
11
79
|
end
|
data/lib/server.rb
CHANGED
@@ -4,7 +4,7 @@ describe CobwebCrawler do
|
|
4
4
|
|
5
5
|
before(:each) do
|
6
6
|
|
7
|
-
@base_url = "http://
|
7
|
+
@base_url = "http://localhost:3532/"
|
8
8
|
|
9
9
|
@default_headers = {"Cache-Control" => "private, max-age=0",
|
10
10
|
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
@@ -27,9 +27,9 @@ describe CobwebCrawler do
|
|
27
27
|
|
28
28
|
# temporary tests to run crawler - proper specs to follow.. honest
|
29
29
|
|
30
|
-
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug =>
|
30
|
+
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
|
31
31
|
|
32
|
-
statistics = crawler.crawl(
|
32
|
+
statistics = crawler.crawl(@base_url)
|
33
33
|
|
34
34
|
statistics.should_not be_nil
|
35
35
|
statistics.should be_an_instance_of Hash
|
@@ -42,7 +42,7 @@ describe CobwebCrawler do
|
|
42
42
|
|
43
43
|
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
|
44
44
|
|
45
|
-
statistics = crawler.crawl(
|
45
|
+
statistics = crawler.crawl(@base_url) do |content, statistics|
|
46
46
|
content[:url].should_not be_nil
|
47
47
|
statistics[:average_length].should_not be_nil
|
48
48
|
end
|
@@ -12,15 +12,6 @@ describe Cobweb, :local_only => true do
|
|
12
12
|
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
13
|
puts "Workers Started."
|
14
14
|
|
15
|
-
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
|
16
|
-
@thin = nil
|
17
|
-
Thread.new do
|
18
|
-
@thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
|
19
|
-
end
|
20
|
-
|
21
|
-
# WAIT FOR START TO COMPLETE
|
22
|
-
sleep 1
|
23
|
-
|
24
15
|
end
|
25
16
|
|
26
17
|
before(:each) do
|
@@ -147,7 +138,6 @@ describe Cobweb, :local_only => true do
|
|
147
138
|
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
148
139
|
command = "kill #{(@all_processes - @existing_processes).join(" ")}"
|
149
140
|
IO.popen(command)
|
150
|
-
#@thin.stop!
|
151
141
|
end
|
152
142
|
|
153
143
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe Robots do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@cobweb = Cobweb.new :quiet => true, :cache => nil
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "default user-agent" do
|
10
|
+
before(:each) do
|
11
|
+
@options = {:url => "http://localhost/"}
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should parse a valid robots.txt" do
|
15
|
+
lambda {Robots.new(@options)}.should_not raise_error
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should allow urls marked as allow" do
|
19
|
+
robot = Robots.new(@options)
|
20
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
|
21
|
+
end
|
22
|
+
it "should disallow urls specified as disallow" do
|
23
|
+
robot = Robots.new(@options)
|
24
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_false
|
25
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
|
26
|
+
end
|
27
|
+
it "should allow urls not listed" do
|
28
|
+
robot = Robots.new(@options)
|
29
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "google user-agent" do
|
35
|
+
before(:each) do
|
36
|
+
@options = {:url => "http://localhost/", :user_agent => "google"}
|
37
|
+
end
|
38
|
+
it "should parse a valid robots.txt" do
|
39
|
+
lambda {Robots.new(@options)}.should_not raise_error
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should disallow all urls" do
|
43
|
+
robot = Robots.new(@options)
|
44
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_false
|
45
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_false
|
46
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
|
47
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_false
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
describe "cybermapper user-agent" do
|
53
|
+
before(:each) do
|
54
|
+
@options = {:url => "http://localhost/", :user_agent => "cybermapper"}
|
55
|
+
end
|
56
|
+
it "should parse a valid robots.txt" do
|
57
|
+
lambda {Robots.new(@options)}.should_not raise_error
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should disallow all urls" do
|
61
|
+
robot = Robots.new(@options)
|
62
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
|
63
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_true
|
64
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_true
|
65
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
@@ -0,0 +1,294 @@
|
|
1
|
+
# robots.txt for http://www.example.com/
|
2
|
+
|
3
|
+
User-agent: cybermapper
|
4
|
+
Disallow:
|
5
|
+
|
6
|
+
User-agent: google
|
7
|
+
Disallow: /
|
8
|
+
|
9
|
+
User-agent: *
|
10
|
+
Disallow: /search
|
11
|
+
Disallow: /sdch
|
12
|
+
Disallow: /groups
|
13
|
+
Disallow: /images
|
14
|
+
Disallow: /catalogs
|
15
|
+
Allow: /catalogs/about
|
16
|
+
Allow: /catalogs/p?
|
17
|
+
Disallow: /catalogues
|
18
|
+
Disallow: /news
|
19
|
+
Allow: /news/directory
|
20
|
+
Disallow: /nwshp
|
21
|
+
Disallow: /setnewsprefs?
|
22
|
+
Disallow: /index.html?
|
23
|
+
Disallow: /?
|
24
|
+
Allow: /?hl=
|
25
|
+
Disallow: /?hl=*&
|
26
|
+
Disallow: /addurl/image?
|
27
|
+
Disallow: /pagead/
|
28
|
+
Disallow: /relpage/
|
29
|
+
Disallow: /relcontent
|
30
|
+
Disallow: /imgres
|
31
|
+
Disallow: /imglanding
|
32
|
+
Disallow: /sbd
|
33
|
+
Disallow: /keyword/
|
34
|
+
Disallow: /u/
|
35
|
+
Disallow: /univ/
|
36
|
+
Disallow: /cobrand
|
37
|
+
Disallow: /custom
|
38
|
+
Disallow: /advanced_group_search
|
39
|
+
Disallow: /googlesite
|
40
|
+
Disallow: /preferences
|
41
|
+
Disallow: /setprefs
|
42
|
+
Disallow: /swr
|
43
|
+
Disallow: /url
|
44
|
+
Disallow: /default
|
45
|
+
Disallow: /m?
|
46
|
+
Disallow: /m/?
|
47
|
+
Disallow: /m/blogs?
|
48
|
+
Disallow: /m/directions?
|
49
|
+
Disallow: /m/ig
|
50
|
+
Disallow: /m/images?
|
51
|
+
Disallow: /m/imgres?
|
52
|
+
Disallow: /m/local?
|
53
|
+
Disallow: /m/movies?
|
54
|
+
Disallow: /m/news?
|
55
|
+
Disallow: /m/news/i?
|
56
|
+
Disallow: /m/place?
|
57
|
+
Disallow: /m/products?
|
58
|
+
Disallow: /m/products/
|
59
|
+
Disallow: /m/setnewsprefs?
|
60
|
+
Disallow: /m/search?
|
61
|
+
Disallow: /m/swmloptin?
|
62
|
+
Disallow: /m/trends
|
63
|
+
Disallow: /m/video?
|
64
|
+
Disallow: /wml?
|
65
|
+
Disallow: /wml/?
|
66
|
+
Disallow: /wml/search?
|
67
|
+
Disallow: /xhtml?
|
68
|
+
Disallow: /xhtml/?
|
69
|
+
Disallow: /xhtml/search?
|
70
|
+
Disallow: /xml?
|
71
|
+
Disallow: /imode?
|
72
|
+
Disallow: /imode/?
|
73
|
+
Disallow: /imode/search?
|
74
|
+
Disallow: /jsky?
|
75
|
+
Disallow: /jsky/?
|
76
|
+
Disallow: /jsky/search?
|
77
|
+
Disallow: /pda?
|
78
|
+
Disallow: /pda/?
|
79
|
+
Disallow: /pda/search?
|
80
|
+
Disallow: /sprint_xhtml
|
81
|
+
Disallow: /sprint_wml
|
82
|
+
Disallow: /pqa
|
83
|
+
Disallow: /palm
|
84
|
+
Disallow: /gwt/
|
85
|
+
Disallow: /purchases
|
86
|
+
Disallow: /hws
|
87
|
+
Disallow: /bsd?
|
88
|
+
Disallow: /linux?
|
89
|
+
Disallow: /mac?
|
90
|
+
Disallow: /microsoft?
|
91
|
+
Disallow: /unclesam?
|
92
|
+
Disallow: /answers/search?q=
|
93
|
+
Disallow: /local?
|
94
|
+
Disallow: /local_url
|
95
|
+
Disallow: /shihui?
|
96
|
+
Disallow: /shihui/
|
97
|
+
Disallow: /froogle?
|
98
|
+
Disallow: /products?
|
99
|
+
Disallow: /products/
|
100
|
+
Disallow: /froogle_
|
101
|
+
Disallow: /product_
|
102
|
+
Disallow: /products_
|
103
|
+
Disallow: /products;
|
104
|
+
Disallow: /print
|
105
|
+
Disallow: /books/
|
106
|
+
Disallow: /bkshp?*q=*
|
107
|
+
Disallow: /books?*q=*
|
108
|
+
Disallow: /books?*output=*
|
109
|
+
Disallow: /books?*pg=*
|
110
|
+
Disallow: /books?*jtp=*
|
111
|
+
Disallow: /books?*jscmd=*
|
112
|
+
Disallow: /books?*buy=*
|
113
|
+
Disallow: /books?*zoom=*
|
114
|
+
Allow: /books?*q=related:*
|
115
|
+
Allow: /books?*q=editions:*
|
116
|
+
Allow: /books?*q=subject:*
|
117
|
+
Allow: /books/about
|
118
|
+
Allow: /booksrightsholders
|
119
|
+
Allow: /books?*zoom=1*
|
120
|
+
Allow: /books?*zoom=5*
|
121
|
+
Disallow: /ebooks/
|
122
|
+
Disallow: /ebooks?*q=*
|
123
|
+
Disallow: /ebooks?*output=*
|
124
|
+
Disallow: /ebooks?*pg=*
|
125
|
+
Disallow: /ebooks?*jscmd=*
|
126
|
+
Disallow: /ebooks?*buy=*
|
127
|
+
Disallow: /ebooks?*zoom=*
|
128
|
+
Allow: /ebooks?*q=related:*
|
129
|
+
Allow: /ebooks?*q=editions:*
|
130
|
+
Allow: /ebooks?*q=subject:*
|
131
|
+
Allow: /ebooks?*zoom=1*
|
132
|
+
Allow: /ebooks?*zoom=5*
|
133
|
+
Disallow: /patents?
|
134
|
+
Allow: /patents?id=
|
135
|
+
Allow: /patents?vid=
|
136
|
+
Disallow: /scholar
|
137
|
+
Disallow: /citations?
|
138
|
+
Allow: /citations?user=
|
139
|
+
Allow: /citations?view_op=new_profile
|
140
|
+
Allow: /citations?view_op=top_venues
|
141
|
+
Disallow: /complete
|
142
|
+
Disallow: /s?
|
143
|
+
Disallow: /sponsoredlinks
|
144
|
+
Disallow: /videosearch?
|
145
|
+
Disallow: /videopreview?
|
146
|
+
Disallow: /videoprograminfo?
|
147
|
+
Disallow: /maps?
|
148
|
+
Disallow: /mapstt?
|
149
|
+
Disallow: /mapslt?
|
150
|
+
Disallow: /maps/stk/
|
151
|
+
Disallow: /maps/br?
|
152
|
+
Disallow: /mapabcpoi?
|
153
|
+
Disallow: /maphp?
|
154
|
+
Disallow: /mapprint?
|
155
|
+
Disallow: /maps/api/js/StaticMapService.GetMapImage?
|
156
|
+
Disallow: /maps/api/staticmap?
|
157
|
+
Disallow: /mld?
|
158
|
+
Disallow: /staticmap?
|
159
|
+
Disallow: /places/
|
160
|
+
Allow: /places/$
|
161
|
+
Disallow: /maps/place
|
162
|
+
Disallow: /help/maps/streetview/partners/welcome/
|
163
|
+
Disallow: /lochp?
|
164
|
+
Disallow: /center
|
165
|
+
Disallow: /ie?
|
166
|
+
Disallow: /sms/demo?
|
167
|
+
Disallow: /katrina?
|
168
|
+
Disallow: /blogsearch?
|
169
|
+
Disallow: /blogsearch/
|
170
|
+
Disallow: /blogsearch_feeds
|
171
|
+
Disallow: /advanced_blog_search
|
172
|
+
Disallow: /reader/
|
173
|
+
Allow: /reader/play
|
174
|
+
Disallow: /uds/
|
175
|
+
Disallow: /chart?
|
176
|
+
Disallow: /transit?
|
177
|
+
Disallow: /mbd?
|
178
|
+
Disallow: /extern_js/
|
179
|
+
Disallow: /calendar/feeds/
|
180
|
+
Disallow: /calendar/ical/
|
181
|
+
Disallow: /cl2/feeds/
|
182
|
+
Disallow: /cl2/ical/
|
183
|
+
Disallow: /coop/directory
|
184
|
+
Disallow: /coop/manage
|
185
|
+
Disallow: /trends?
|
186
|
+
Disallow: /trends/music?
|
187
|
+
Disallow: /trends/hottrends?
|
188
|
+
Disallow: /trends/viz?
|
189
|
+
Disallow: /notebook/search?
|
190
|
+
Disallow: /musica
|
191
|
+
Disallow: /musicad
|
192
|
+
Disallow: /musicas
|
193
|
+
Disallow: /musicl
|
194
|
+
Disallow: /musics
|
195
|
+
Disallow: /musicsearch
|
196
|
+
Disallow: /musicsp
|
197
|
+
Disallow: /musiclp
|
198
|
+
Disallow: /browsersync
|
199
|
+
Disallow: /call
|
200
|
+
Disallow: /archivesearch?
|
201
|
+
Disallow: /archivesearch/url
|
202
|
+
Disallow: /archivesearch/advanced_search
|
203
|
+
Disallow: /base/reportbadoffer
|
204
|
+
Disallow: /urchin_test/
|
205
|
+
Disallow: /movies?
|
206
|
+
Disallow: /codesearch?
|
207
|
+
Disallow: /codesearch/feeds/search?
|
208
|
+
Disallow: /wapsearch?
|
209
|
+
Disallow: /safebrowsing
|
210
|
+
Allow: /safebrowsing/diagnostic
|
211
|
+
Allow: /safebrowsing/report_badware/
|
212
|
+
Allow: /safebrowsing/report_error/
|
213
|
+
Allow: /safebrowsing/report_phish/
|
214
|
+
Disallow: /reviews/search?
|
215
|
+
Disallow: /orkut/albums
|
216
|
+
Allow: /jsapi
|
217
|
+
Disallow: /views?
|
218
|
+
Disallow: /c/
|
219
|
+
Disallow: /cbk
|
220
|
+
Allow: /cbk?output=tile&cb_client=maps_sv
|
221
|
+
Disallow: /recharge/dashboard/car
|
222
|
+
Disallow: /recharge/dashboard/static/
|
223
|
+
Disallow: /translate_a/
|
224
|
+
Disallow: /translate_c
|
225
|
+
Disallow: /translate_f
|
226
|
+
Disallow: /translate_static/
|
227
|
+
Disallow: /translate_suggestion
|
228
|
+
Disallow: /profiles/me
|
229
|
+
Allow: /profiles
|
230
|
+
Disallow: /s2/profiles/me
|
231
|
+
Allow: /s2/profiles
|
232
|
+
Allow: /s2/photos
|
233
|
+
Allow: /s2/static
|
234
|
+
Disallow: /s2
|
235
|
+
Disallow: /transconsole/portal/
|
236
|
+
Disallow: /gcc/
|
237
|
+
Disallow: /aclk
|
238
|
+
Disallow: /cse?
|
239
|
+
Disallow: /cse/home
|
240
|
+
Disallow: /cse/panel
|
241
|
+
Disallow: /cse/manage
|
242
|
+
Disallow: /tbproxy/
|
243
|
+
Disallow: /imesync/
|
244
|
+
Disallow: /shenghuo/search?
|
245
|
+
Disallow: /support/forum/search?
|
246
|
+
Disallow: /reviews/polls/
|
247
|
+
Disallow: /hosted/images/
|
248
|
+
Disallow: /ppob/?
|
249
|
+
Disallow: /ppob?
|
250
|
+
Disallow: /ig/add?
|
251
|
+
Disallow: /adwordsresellers
|
252
|
+
Disallow: /accounts/o8
|
253
|
+
Allow: /accounts/o8/id
|
254
|
+
Disallow: /topicsearch?q=
|
255
|
+
Disallow: /xfx7/
|
256
|
+
Disallow: /squared/api
|
257
|
+
Disallow: /squared/search
|
258
|
+
Disallow: /squared/table
|
259
|
+
Disallow: /toolkit/
|
260
|
+
Allow: /toolkit/*.html
|
261
|
+
Disallow: /globalmarketfinder/
|
262
|
+
Allow: /globalmarketfinder/*.html
|
263
|
+
Disallow: /qnasearch?
|
264
|
+
Disallow: /app/updates
|
265
|
+
Disallow: /sidewiki/entry/
|
266
|
+
Disallow: /quality_form?
|
267
|
+
Disallow: /labs/popgadget/search
|
268
|
+
Disallow: /buzz/post
|
269
|
+
Disallow: /compressiontest/
|
270
|
+
Disallow: /analytics/reporting/
|
271
|
+
Disallow: /analytics/admin/
|
272
|
+
Disallow: /analytics/web/
|
273
|
+
Disallow: /analytics/feeds/
|
274
|
+
Disallow: /analytics/settings/
|
275
|
+
Disallow: /alerts/
|
276
|
+
Disallow: /ads/preferences/
|
277
|
+
Allow: /ads/preferences/html/
|
278
|
+
Allow: /ads/preferences/plugin
|
279
|
+
Disallow: /settings/ads/onweb/
|
280
|
+
Disallow: /phone/compare/?
|
281
|
+
Allow: /alerts/manage
|
282
|
+
Disallow: /travel/clk
|
283
|
+
Disallow: /hotelfinder/rpc
|
284
|
+
Disallow: /flights/rpc
|
285
|
+
Disallow: /commercesearch/services/
|
286
|
+
Disallow: /evaluation/
|
287
|
+
Disallow: /webstore/search
|
288
|
+
Disallow: /chrome/browser/mobile/tour
|
289
|
+
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
|
290
|
+
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
|
291
|
+
Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
|
292
|
+
Sitemap: http://www.google.com/sitemaps_webmasters.xml
|
293
|
+
Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
|
294
|
+
Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
|
data/spec/spec_helper.rb
CHANGED
@@ -13,6 +13,17 @@ RSpec.configure do |config|
|
|
13
13
|
config.filter_run_excluding :local_only => true
|
14
14
|
end
|
15
15
|
|
16
|
+
config.before(:all) {
|
17
|
+
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
|
18
|
+
@thin = nil
|
19
|
+
Thread.new do
|
20
|
+
@thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
|
21
|
+
end
|
22
|
+
|
23
|
+
# WAIT FOR START TO COMPLETE
|
24
|
+
sleep 1
|
25
|
+
}
|
26
|
+
|
16
27
|
config.before(:each) {
|
17
28
|
|
18
29
|
#redis_mock = double("redis")
|
@@ -40,10 +51,12 @@ RSpec.configure do |config|
|
|
40
51
|
|
41
52
|
@mock_http_client = mock(Net::HTTP)
|
42
53
|
@mock_http_request = mock(Net::HTTPRequest)
|
54
|
+
@mock_http_robot_request = mock(Net::HTTPRequest)
|
43
55
|
@mock_http_redirect_request = mock(Net::HTTPRequest)
|
44
56
|
@mock_http_redirect_request2 = mock(Net::HTTPRequest)
|
45
57
|
|
46
58
|
@mock_http_response = mock(Net::HTTPResponse)
|
59
|
+
@mock_http_robot_response = mock(Net::HTTPResponse)
|
47
60
|
@mock_http_redirect_response = mock(Net::HTTPRedirection)
|
48
61
|
@mock_http_redirect_response2 = mock(Net::HTTPRedirection)
|
49
62
|
@mock_http_get = mock(Net::HTTP::Get)
|
@@ -51,11 +64,13 @@ RSpec.configure do |config|
|
|
51
64
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
52
65
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
53
66
|
Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
|
67
|
+
Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
|
54
68
|
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
|
55
69
|
|
56
70
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
57
71
|
|
58
72
|
@mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
|
73
|
+
@mock_http_client.stub!(:request).with(@mock_http_robot_request).and_return(@mock_http_robot_response)
|
59
74
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
60
75
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
61
76
|
@mock_http_client.stub!(:read_timeout=).and_return(nil)
|
@@ -64,6 +79,16 @@ RSpec.configure do |config|
|
|
64
79
|
@mock_http_client.stub!(:address).and_return("www.baseurl.com")
|
65
80
|
@mock_http_client.stub!(:port).and_return("80 ")
|
66
81
|
|
82
|
+
@mock_http_robot_response.stub!(:code).and_return(200)
|
83
|
+
@mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
|
84
|
+
@mock_http_robot_response.stub!(:content_type).and_return("text/plain")
|
85
|
+
@mock_http_robot_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
86
|
+
@mock_http_robot_response.stub!(:[]).with("location").and_return(@default_headers["location"])
|
87
|
+
@mock_http_robot_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
88
|
+
@mock_http_robot_response.stub!(:content_length).and_return(1024)
|
89
|
+
@mock_http_robot_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
90
|
+
@mock_http_robot_response.stub!(:to_hash).and_return(@default_headers)
|
91
|
+
|
67
92
|
@mock_http_response.stub!(:code).and_return(200)
|
68
93
|
@mock_http_response.stub!(:content_type).and_return("text/html")
|
69
94
|
@mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.58
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-06-
|
12
|
+
date: 2012-06-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70328776801460 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70328776801460
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70328776799760 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70328776799760
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70328776798960 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70328776798960
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70328776797840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70328776797840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70328776796300 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70328776796300
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70328776811560 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70328776811560
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70328776810940 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70328776810940
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70328776810380 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70328776810380
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70328776809840 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70328776809840
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70328776809160 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70328776809160
|
124
124
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
125
125
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
126
126
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -136,6 +136,8 @@ files:
|
|
136
136
|
- spec/cobweb/cobweb_links_spec.rb
|
137
137
|
- spec/cobweb/cobweb_spec.rb
|
138
138
|
- spec/cobweb/content_link_parser_spec.rb
|
139
|
+
- spec/cobweb/robots_spec.rb
|
140
|
+
- spec/samples/robots.txt
|
139
141
|
- spec/samples/sample_html_links.html
|
140
142
|
- spec/samples/sample_server.rb
|
141
143
|
- spec/samples/sample_site/boxgrid.html
|
@@ -293,6 +295,7 @@ files:
|
|
293
295
|
- spec/samples/sample_site/js/superfish.js
|
294
296
|
- spec/samples/sample_site/js/supersubs.js
|
295
297
|
- spec/samples/sample_site/more.html
|
298
|
+
- spec/samples/sample_site/robots.txt
|
296
299
|
- spec/samples/sample_site/tables.html
|
297
300
|
- spec/samples/sample_site/text/museosans-webfont.eot
|
298
301
|
- spec/samples/sample_site/text/museosans-webfont.svg
|