cobweb 0.0.57 → 0.0.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +3 -1
- data/lib/cobweb.rb +23 -2
- data/lib/cobweb_crawler.rb +1 -0
- data/lib/cobweb_links.rb +9 -0
- data/lib/cobweb_version.rb +1 -1
- data/lib/robots.rb +71 -3
- data/lib/server.rb +0 -1
- data/spec/cobweb/cobweb_crawler_spec.rb +4 -4
- data/spec/cobweb/cobweb_job_spec.rb +0 -10
- data/spec/cobweb/robots_spec.rb +70 -0
- data/spec/samples/robots.txt +294 -0
- data/spec/samples/sample_site/robots.txt +13 -0
- data/spec/spec_helper.rb +25 -0
- metadata +25 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.57
|
2
|
+
h1. Cobweb v0.0.58
|
3
3
|
!https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
|
4
4
|
|
5
5
|
h2. Intro
|
@@ -41,6 +41,8 @@ h3. Data Returned
|
|
41
41
|
* :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
|
42
42
|
* :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
|
43
43
|
* :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
|
44
|
+
* :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
|
45
|
+
* :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
44
46
|
|
45
47
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as I have time!)
|
46
48
|
|
data/lib/cobweb.rb
CHANGED
@@ -43,6 +43,9 @@ class Cobweb
|
|
43
43
|
default_redis_options_to Hash.new
|
44
44
|
default_internal_urls_to []
|
45
45
|
default_first_page_redirect_internal_to true
|
46
|
+
default_text_mime_types_to ["text/*", "application/xhtml+xml"]
|
47
|
+
default_obey_robots_to false
|
48
|
+
default_user_agent_to "cobweb"
|
46
49
|
|
47
50
|
end
|
48
51
|
|
@@ -177,7 +180,7 @@ class Cobweb
|
|
177
180
|
content[:character_set] = charset
|
178
181
|
end
|
179
182
|
content[:length] = response.content_length
|
180
|
-
if
|
183
|
+
if text_content?(content[:mime_type])
|
181
184
|
if response["Content-Encoding"]=="gzip"
|
182
185
|
content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
|
183
186
|
else
|
@@ -384,5 +387,23 @@ class Cobweb
|
|
384
387
|
content
|
385
388
|
end
|
386
389
|
|
387
|
-
end
|
390
|
+
end
|
391
|
+
|
392
|
+
private
|
393
|
+
# checks if the mime_type is textual
|
394
|
+
def text_content?(content_type)
|
395
|
+
@options[:text_mime_types].each do |mime_type|
|
396
|
+
return true if content_type.match(escape_pattern_for_regex(mime_type))
|
397
|
+
end
|
398
|
+
false
|
399
|
+
end
|
400
|
+
|
401
|
+
# escapes characters with meaning in regular expressions and adds wildcard expression
|
402
|
+
def escape_pattern_for_regex(pattern)
|
403
|
+
pattern = pattern.gsub(".", "\\.")
|
404
|
+
pattern = pattern.gsub("?", "\\?")
|
405
|
+
pattern = pattern.gsub("*", ".*?")
|
406
|
+
pattern
|
407
|
+
end
|
408
|
+
|
388
409
|
end
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -64,6 +64,7 @@ class CobwebCrawler
|
|
64
64
|
@redis.incr "crawl-counter"
|
65
65
|
|
66
66
|
internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
|
67
|
+
ap internal_links
|
67
68
|
|
68
69
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
69
70
|
cobweb_links = CobwebLinks.new(@options)
|
data/lib/cobweb_links.rb
CHANGED
@@ -17,6 +17,15 @@ class CobwebLinks
|
|
17
17
|
|
18
18
|
end
|
19
19
|
|
20
|
+
def allowed?(link)
|
21
|
+
if @options[:obey_robots]
|
22
|
+
robot = Robots.new(:url => link, :user_agent => @options[:user_agent])
|
23
|
+
return robot.allowed?(link)
|
24
|
+
else
|
25
|
+
return true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
20
29
|
# Returns true if the link is matched to an internal_url and not matched to an external_url
|
21
30
|
def internal?(link)
|
22
31
|
if @options[:debug]
|
data/lib/cobweb_version.rb
CHANGED
data/lib/robots.rb
CHANGED
@@ -2,10 +2,78 @@
|
|
2
2
|
class Robots
|
3
3
|
|
4
4
|
# Processes the robots.txt file
|
5
|
-
def initialize(
|
5
|
+
def initialize(options)
|
6
|
+
@options = options
|
7
|
+
raise "options should be a hash" unless options.kind_of? Hash
|
8
|
+
raise ":url is required" unless @options.has_key? :url
|
9
|
+
@options[:file] = "robots.txt" unless @options.has_key? :file
|
10
|
+
@options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
|
11
|
+
|
12
|
+
uri = URI.parse(@options[:url])
|
13
|
+
content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
|
14
|
+
if content[:mime_type][0..4] == "text/"
|
15
|
+
@raw_data = parse_data(content[:body])
|
16
|
+
|
17
|
+
if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
|
18
|
+
@params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
|
19
|
+
else
|
20
|
+
raise "Wildcard user-agent is not present" unless @raw_data.has_key? :*
|
21
|
+
@params = @raw_data[:*]
|
22
|
+
end
|
23
|
+
else
|
24
|
+
raise "Invalid mime type: #{content[:content_type]}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def allowed?(url)
|
6
29
|
uri = URI.parse(url)
|
7
|
-
[
|
8
|
-
|
30
|
+
@params[:allow].each do |pattern|
|
31
|
+
return true if uri.path.match(escape_pattern_for_regex(pattern))
|
32
|
+
end
|
33
|
+
@params[:disallow].each do |pattern|
|
34
|
+
return false if uri.path.match(escape_pattern_for_regex(pattern))
|
35
|
+
end
|
36
|
+
true
|
37
|
+
end
|
38
|
+
|
39
|
+
def user_agent_settings
|
40
|
+
@params
|
41
|
+
end
|
42
|
+
|
43
|
+
def contents
|
44
|
+
@raw_data
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
# escapes characters with meaning in regular expressions and adds wildcard expression
|
49
|
+
def escape_pattern_for_regex(pattern)
|
50
|
+
pattern = pattern.gsub(".", "\\.")
|
51
|
+
pattern = pattern.gsub("?", "\\?")
|
52
|
+
pattern = pattern.gsub("*", ".*?")
|
53
|
+
pattern
|
54
|
+
end
|
55
|
+
|
56
|
+
def parse_data(data)
|
57
|
+
user_agents = {}
|
58
|
+
lines = data.split("\n")
|
59
|
+
lines.map!{|line| line.strip}
|
60
|
+
lines.reject!{|line| line == "" || line[0] == "#"}
|
61
|
+
current_user_agent = nil
|
9
62
|
|
63
|
+
lines.each do |line|
|
64
|
+
if line[0..10].downcase == "user-agent:"
|
65
|
+
current_user_agent = line.split(":")[1..-1].join.downcase.strip.to_sym
|
66
|
+
user_agents[current_user_agent] = {:allow => [], :disallow => []}
|
67
|
+
else
|
68
|
+
if current_user_agent
|
69
|
+
values = line.split(":")
|
70
|
+
unless values[1..-1].join.strip == ""
|
71
|
+
user_agents[current_user_agent][values[0].downcase.strip.to_sym] = [] unless user_agents[current_user_agent].has_key? values[0].downcase.to_sym
|
72
|
+
user_agents[current_user_agent][values[0].downcase.strip.to_sym] << values[1..-1].join.strip
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
user_agents
|
10
78
|
end
|
11
79
|
end
|
data/lib/server.rb
CHANGED
@@ -4,7 +4,7 @@ describe CobwebCrawler do
|
|
4
4
|
|
5
5
|
before(:each) do
|
6
6
|
|
7
|
-
@base_url = "http://
|
7
|
+
@base_url = "http://localhost:3532/"
|
8
8
|
|
9
9
|
@default_headers = {"Cache-Control" => "private, max-age=0",
|
10
10
|
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
@@ -27,9 +27,9 @@ describe CobwebCrawler do
|
|
27
27
|
|
28
28
|
# temporary tests to run crawler - proper specs to follow.. honest
|
29
29
|
|
30
|
-
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug =>
|
30
|
+
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
|
31
31
|
|
32
|
-
statistics = crawler.crawl(
|
32
|
+
statistics = crawler.crawl(@base_url)
|
33
33
|
|
34
34
|
statistics.should_not be_nil
|
35
35
|
statistics.should be_an_instance_of Hash
|
@@ -42,7 +42,7 @@ describe CobwebCrawler do
|
|
42
42
|
|
43
43
|
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
|
44
44
|
|
45
|
-
statistics = crawler.crawl(
|
45
|
+
statistics = crawler.crawl(@base_url) do |content, statistics|
|
46
46
|
content[:url].should_not be_nil
|
47
47
|
statistics[:average_length].should_not be_nil
|
48
48
|
end
|
@@ -12,15 +12,6 @@ describe Cobweb, :local_only => true do
|
|
12
12
|
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
13
|
puts "Workers Started."
|
14
14
|
|
15
|
-
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
|
16
|
-
@thin = nil
|
17
|
-
Thread.new do
|
18
|
-
@thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
|
19
|
-
end
|
20
|
-
|
21
|
-
# WAIT FOR START TO COMPLETE
|
22
|
-
sleep 1
|
23
|
-
|
24
15
|
end
|
25
16
|
|
26
17
|
before(:each) do
|
@@ -147,7 +138,6 @@ describe Cobweb, :local_only => true do
|
|
147
138
|
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
148
139
|
command = "kill #{(@all_processes - @existing_processes).join(" ")}"
|
149
140
|
IO.popen(command)
|
150
|
-
#@thin.stop!
|
151
141
|
end
|
152
142
|
|
153
143
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe Robots do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@cobweb = Cobweb.new :quiet => true, :cache => nil
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "default user-agent" do
|
10
|
+
before(:each) do
|
11
|
+
@options = {:url => "http://localhost/"}
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should parse a valid robots.txt" do
|
15
|
+
lambda {Robots.new(@options)}.should_not raise_error
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should allow urls marked as allow" do
|
19
|
+
robot = Robots.new(@options)
|
20
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
|
21
|
+
end
|
22
|
+
it "should disallow urls specified as disallow" do
|
23
|
+
robot = Robots.new(@options)
|
24
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_false
|
25
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
|
26
|
+
end
|
27
|
+
it "should allow urls not listed" do
|
28
|
+
robot = Robots.new(@options)
|
29
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "google user-agent" do
|
35
|
+
before(:each) do
|
36
|
+
@options = {:url => "http://localhost/", :user_agent => "google"}
|
37
|
+
end
|
38
|
+
it "should parse a valid robots.txt" do
|
39
|
+
lambda {Robots.new(@options)}.should_not raise_error
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should disallow all urls" do
|
43
|
+
robot = Robots.new(@options)
|
44
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_false
|
45
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_false
|
46
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
|
47
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_false
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
describe "cybermapper user-agent" do
|
53
|
+
before(:each) do
|
54
|
+
@options = {:url => "http://localhost/", :user_agent => "cybermapper"}
|
55
|
+
end
|
56
|
+
it "should parse a valid robots.txt" do
|
57
|
+
lambda {Robots.new(@options)}.should_not raise_error
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should disallow all urls" do
|
61
|
+
robot = Robots.new(@options)
|
62
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
|
63
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_true
|
64
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_true
|
65
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
@@ -0,0 +1,294 @@
|
|
1
|
+
# robots.txt for http://www.example.com/
|
2
|
+
|
3
|
+
User-agent: cybermapper
|
4
|
+
Disallow:
|
5
|
+
|
6
|
+
User-agent: google
|
7
|
+
Disallow: /
|
8
|
+
|
9
|
+
User-agent: *
|
10
|
+
Disallow: /search
|
11
|
+
Disallow: /sdch
|
12
|
+
Disallow: /groups
|
13
|
+
Disallow: /images
|
14
|
+
Disallow: /catalogs
|
15
|
+
Allow: /catalogs/about
|
16
|
+
Allow: /catalogs/p?
|
17
|
+
Disallow: /catalogues
|
18
|
+
Disallow: /news
|
19
|
+
Allow: /news/directory
|
20
|
+
Disallow: /nwshp
|
21
|
+
Disallow: /setnewsprefs?
|
22
|
+
Disallow: /index.html?
|
23
|
+
Disallow: /?
|
24
|
+
Allow: /?hl=
|
25
|
+
Disallow: /?hl=*&
|
26
|
+
Disallow: /addurl/image?
|
27
|
+
Disallow: /pagead/
|
28
|
+
Disallow: /relpage/
|
29
|
+
Disallow: /relcontent
|
30
|
+
Disallow: /imgres
|
31
|
+
Disallow: /imglanding
|
32
|
+
Disallow: /sbd
|
33
|
+
Disallow: /keyword/
|
34
|
+
Disallow: /u/
|
35
|
+
Disallow: /univ/
|
36
|
+
Disallow: /cobrand
|
37
|
+
Disallow: /custom
|
38
|
+
Disallow: /advanced_group_search
|
39
|
+
Disallow: /googlesite
|
40
|
+
Disallow: /preferences
|
41
|
+
Disallow: /setprefs
|
42
|
+
Disallow: /swr
|
43
|
+
Disallow: /url
|
44
|
+
Disallow: /default
|
45
|
+
Disallow: /m?
|
46
|
+
Disallow: /m/?
|
47
|
+
Disallow: /m/blogs?
|
48
|
+
Disallow: /m/directions?
|
49
|
+
Disallow: /m/ig
|
50
|
+
Disallow: /m/images?
|
51
|
+
Disallow: /m/imgres?
|
52
|
+
Disallow: /m/local?
|
53
|
+
Disallow: /m/movies?
|
54
|
+
Disallow: /m/news?
|
55
|
+
Disallow: /m/news/i?
|
56
|
+
Disallow: /m/place?
|
57
|
+
Disallow: /m/products?
|
58
|
+
Disallow: /m/products/
|
59
|
+
Disallow: /m/setnewsprefs?
|
60
|
+
Disallow: /m/search?
|
61
|
+
Disallow: /m/swmloptin?
|
62
|
+
Disallow: /m/trends
|
63
|
+
Disallow: /m/video?
|
64
|
+
Disallow: /wml?
|
65
|
+
Disallow: /wml/?
|
66
|
+
Disallow: /wml/search?
|
67
|
+
Disallow: /xhtml?
|
68
|
+
Disallow: /xhtml/?
|
69
|
+
Disallow: /xhtml/search?
|
70
|
+
Disallow: /xml?
|
71
|
+
Disallow: /imode?
|
72
|
+
Disallow: /imode/?
|
73
|
+
Disallow: /imode/search?
|
74
|
+
Disallow: /jsky?
|
75
|
+
Disallow: /jsky/?
|
76
|
+
Disallow: /jsky/search?
|
77
|
+
Disallow: /pda?
|
78
|
+
Disallow: /pda/?
|
79
|
+
Disallow: /pda/search?
|
80
|
+
Disallow: /sprint_xhtml
|
81
|
+
Disallow: /sprint_wml
|
82
|
+
Disallow: /pqa
|
83
|
+
Disallow: /palm
|
84
|
+
Disallow: /gwt/
|
85
|
+
Disallow: /purchases
|
86
|
+
Disallow: /hws
|
87
|
+
Disallow: /bsd?
|
88
|
+
Disallow: /linux?
|
89
|
+
Disallow: /mac?
|
90
|
+
Disallow: /microsoft?
|
91
|
+
Disallow: /unclesam?
|
92
|
+
Disallow: /answers/search?q=
|
93
|
+
Disallow: /local?
|
94
|
+
Disallow: /local_url
|
95
|
+
Disallow: /shihui?
|
96
|
+
Disallow: /shihui/
|
97
|
+
Disallow: /froogle?
|
98
|
+
Disallow: /products?
|
99
|
+
Disallow: /products/
|
100
|
+
Disallow: /froogle_
|
101
|
+
Disallow: /product_
|
102
|
+
Disallow: /products_
|
103
|
+
Disallow: /products;
|
104
|
+
Disallow: /print
|
105
|
+
Disallow: /books/
|
106
|
+
Disallow: /bkshp?*q=*
|
107
|
+
Disallow: /books?*q=*
|
108
|
+
Disallow: /books?*output=*
|
109
|
+
Disallow: /books?*pg=*
|
110
|
+
Disallow: /books?*jtp=*
|
111
|
+
Disallow: /books?*jscmd=*
|
112
|
+
Disallow: /books?*buy=*
|
113
|
+
Disallow: /books?*zoom=*
|
114
|
+
Allow: /books?*q=related:*
|
115
|
+
Allow: /books?*q=editions:*
|
116
|
+
Allow: /books?*q=subject:*
|
117
|
+
Allow: /books/about
|
118
|
+
Allow: /booksrightsholders
|
119
|
+
Allow: /books?*zoom=1*
|
120
|
+
Allow: /books?*zoom=5*
|
121
|
+
Disallow: /ebooks/
|
122
|
+
Disallow: /ebooks?*q=*
|
123
|
+
Disallow: /ebooks?*output=*
|
124
|
+
Disallow: /ebooks?*pg=*
|
125
|
+
Disallow: /ebooks?*jscmd=*
|
126
|
+
Disallow: /ebooks?*buy=*
|
127
|
+
Disallow: /ebooks?*zoom=*
|
128
|
+
Allow: /ebooks?*q=related:*
|
129
|
+
Allow: /ebooks?*q=editions:*
|
130
|
+
Allow: /ebooks?*q=subject:*
|
131
|
+
Allow: /ebooks?*zoom=1*
|
132
|
+
Allow: /ebooks?*zoom=5*
|
133
|
+
Disallow: /patents?
|
134
|
+
Allow: /patents?id=
|
135
|
+
Allow: /patents?vid=
|
136
|
+
Disallow: /scholar
|
137
|
+
Disallow: /citations?
|
138
|
+
Allow: /citations?user=
|
139
|
+
Allow: /citations?view_op=new_profile
|
140
|
+
Allow: /citations?view_op=top_venues
|
141
|
+
Disallow: /complete
|
142
|
+
Disallow: /s?
|
143
|
+
Disallow: /sponsoredlinks
|
144
|
+
Disallow: /videosearch?
|
145
|
+
Disallow: /videopreview?
|
146
|
+
Disallow: /videoprograminfo?
|
147
|
+
Disallow: /maps?
|
148
|
+
Disallow: /mapstt?
|
149
|
+
Disallow: /mapslt?
|
150
|
+
Disallow: /maps/stk/
|
151
|
+
Disallow: /maps/br?
|
152
|
+
Disallow: /mapabcpoi?
|
153
|
+
Disallow: /maphp?
|
154
|
+
Disallow: /mapprint?
|
155
|
+
Disallow: /maps/api/js/StaticMapService.GetMapImage?
|
156
|
+
Disallow: /maps/api/staticmap?
|
157
|
+
Disallow: /mld?
|
158
|
+
Disallow: /staticmap?
|
159
|
+
Disallow: /places/
|
160
|
+
Allow: /places/$
|
161
|
+
Disallow: /maps/place
|
162
|
+
Disallow: /help/maps/streetview/partners/welcome/
|
163
|
+
Disallow: /lochp?
|
164
|
+
Disallow: /center
|
165
|
+
Disallow: /ie?
|
166
|
+
Disallow: /sms/demo?
|
167
|
+
Disallow: /katrina?
|
168
|
+
Disallow: /blogsearch?
|
169
|
+
Disallow: /blogsearch/
|
170
|
+
Disallow: /blogsearch_feeds
|
171
|
+
Disallow: /advanced_blog_search
|
172
|
+
Disallow: /reader/
|
173
|
+
Allow: /reader/play
|
174
|
+
Disallow: /uds/
|
175
|
+
Disallow: /chart?
|
176
|
+
Disallow: /transit?
|
177
|
+
Disallow: /mbd?
|
178
|
+
Disallow: /extern_js/
|
179
|
+
Disallow: /calendar/feeds/
|
180
|
+
Disallow: /calendar/ical/
|
181
|
+
Disallow: /cl2/feeds/
|
182
|
+
Disallow: /cl2/ical/
|
183
|
+
Disallow: /coop/directory
|
184
|
+
Disallow: /coop/manage
|
185
|
+
Disallow: /trends?
|
186
|
+
Disallow: /trends/music?
|
187
|
+
Disallow: /trends/hottrends?
|
188
|
+
Disallow: /trends/viz?
|
189
|
+
Disallow: /notebook/search?
|
190
|
+
Disallow: /musica
|
191
|
+
Disallow: /musicad
|
192
|
+
Disallow: /musicas
|
193
|
+
Disallow: /musicl
|
194
|
+
Disallow: /musics
|
195
|
+
Disallow: /musicsearch
|
196
|
+
Disallow: /musicsp
|
197
|
+
Disallow: /musiclp
|
198
|
+
Disallow: /browsersync
|
199
|
+
Disallow: /call
|
200
|
+
Disallow: /archivesearch?
|
201
|
+
Disallow: /archivesearch/url
|
202
|
+
Disallow: /archivesearch/advanced_search
|
203
|
+
Disallow: /base/reportbadoffer
|
204
|
+
Disallow: /urchin_test/
|
205
|
+
Disallow: /movies?
|
206
|
+
Disallow: /codesearch?
|
207
|
+
Disallow: /codesearch/feeds/search?
|
208
|
+
Disallow: /wapsearch?
|
209
|
+
Disallow: /safebrowsing
|
210
|
+
Allow: /safebrowsing/diagnostic
|
211
|
+
Allow: /safebrowsing/report_badware/
|
212
|
+
Allow: /safebrowsing/report_error/
|
213
|
+
Allow: /safebrowsing/report_phish/
|
214
|
+
Disallow: /reviews/search?
|
215
|
+
Disallow: /orkut/albums
|
216
|
+
Allow: /jsapi
|
217
|
+
Disallow: /views?
|
218
|
+
Disallow: /c/
|
219
|
+
Disallow: /cbk
|
220
|
+
Allow: /cbk?output=tile&cb_client=maps_sv
|
221
|
+
Disallow: /recharge/dashboard/car
|
222
|
+
Disallow: /recharge/dashboard/static/
|
223
|
+
Disallow: /translate_a/
|
224
|
+
Disallow: /translate_c
|
225
|
+
Disallow: /translate_f
|
226
|
+
Disallow: /translate_static/
|
227
|
+
Disallow: /translate_suggestion
|
228
|
+
Disallow: /profiles/me
|
229
|
+
Allow: /profiles
|
230
|
+
Disallow: /s2/profiles/me
|
231
|
+
Allow: /s2/profiles
|
232
|
+
Allow: /s2/photos
|
233
|
+
Allow: /s2/static
|
234
|
+
Disallow: /s2
|
235
|
+
Disallow: /transconsole/portal/
|
236
|
+
Disallow: /gcc/
|
237
|
+
Disallow: /aclk
|
238
|
+
Disallow: /cse?
|
239
|
+
Disallow: /cse/home
|
240
|
+
Disallow: /cse/panel
|
241
|
+
Disallow: /cse/manage
|
242
|
+
Disallow: /tbproxy/
|
243
|
+
Disallow: /imesync/
|
244
|
+
Disallow: /shenghuo/search?
|
245
|
+
Disallow: /support/forum/search?
|
246
|
+
Disallow: /reviews/polls/
|
247
|
+
Disallow: /hosted/images/
|
248
|
+
Disallow: /ppob/?
|
249
|
+
Disallow: /ppob?
|
250
|
+
Disallow: /ig/add?
|
251
|
+
Disallow: /adwordsresellers
|
252
|
+
Disallow: /accounts/o8
|
253
|
+
Allow: /accounts/o8/id
|
254
|
+
Disallow: /topicsearch?q=
|
255
|
+
Disallow: /xfx7/
|
256
|
+
Disallow: /squared/api
|
257
|
+
Disallow: /squared/search
|
258
|
+
Disallow: /squared/table
|
259
|
+
Disallow: /toolkit/
|
260
|
+
Allow: /toolkit/*.html
|
261
|
+
Disallow: /globalmarketfinder/
|
262
|
+
Allow: /globalmarketfinder/*.html
|
263
|
+
Disallow: /qnasearch?
|
264
|
+
Disallow: /app/updates
|
265
|
+
Disallow: /sidewiki/entry/
|
266
|
+
Disallow: /quality_form?
|
267
|
+
Disallow: /labs/popgadget/search
|
268
|
+
Disallow: /buzz/post
|
269
|
+
Disallow: /compressiontest/
|
270
|
+
Disallow: /analytics/reporting/
|
271
|
+
Disallow: /analytics/admin/
|
272
|
+
Disallow: /analytics/web/
|
273
|
+
Disallow: /analytics/feeds/
|
274
|
+
Disallow: /analytics/settings/
|
275
|
+
Disallow: /alerts/
|
276
|
+
Disallow: /ads/preferences/
|
277
|
+
Allow: /ads/preferences/html/
|
278
|
+
Allow: /ads/preferences/plugin
|
279
|
+
Disallow: /settings/ads/onweb/
|
280
|
+
Disallow: /phone/compare/?
|
281
|
+
Allow: /alerts/manage
|
282
|
+
Disallow: /travel/clk
|
283
|
+
Disallow: /hotelfinder/rpc
|
284
|
+
Disallow: /flights/rpc
|
285
|
+
Disallow: /commercesearch/services/
|
286
|
+
Disallow: /evaluation/
|
287
|
+
Disallow: /webstore/search
|
288
|
+
Disallow: /chrome/browser/mobile/tour
|
289
|
+
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
|
290
|
+
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
|
291
|
+
Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
|
292
|
+
Sitemap: http://www.google.com/sitemaps_webmasters.xml
|
293
|
+
Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
|
294
|
+
Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
|
data/spec/spec_helper.rb
CHANGED
@@ -13,6 +13,17 @@ RSpec.configure do |config|
|
|
13
13
|
config.filter_run_excluding :local_only => true
|
14
14
|
end
|
15
15
|
|
16
|
+
config.before(:all) {
|
17
|
+
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
|
18
|
+
@thin = nil
|
19
|
+
Thread.new do
|
20
|
+
@thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
|
21
|
+
end
|
22
|
+
|
23
|
+
# WAIT FOR START TO COMPLETE
|
24
|
+
sleep 1
|
25
|
+
}
|
26
|
+
|
16
27
|
config.before(:each) {
|
17
28
|
|
18
29
|
#redis_mock = double("redis")
|
@@ -40,10 +51,12 @@ RSpec.configure do |config|
|
|
40
51
|
|
41
52
|
@mock_http_client = mock(Net::HTTP)
|
42
53
|
@mock_http_request = mock(Net::HTTPRequest)
|
54
|
+
@mock_http_robot_request = mock(Net::HTTPRequest)
|
43
55
|
@mock_http_redirect_request = mock(Net::HTTPRequest)
|
44
56
|
@mock_http_redirect_request2 = mock(Net::HTTPRequest)
|
45
57
|
|
46
58
|
@mock_http_response = mock(Net::HTTPResponse)
|
59
|
+
@mock_http_robot_response = mock(Net::HTTPResponse)
|
47
60
|
@mock_http_redirect_response = mock(Net::HTTPRedirection)
|
48
61
|
@mock_http_redirect_response2 = mock(Net::HTTPRedirection)
|
49
62
|
@mock_http_get = mock(Net::HTTP::Get)
|
@@ -51,11 +64,13 @@ RSpec.configure do |config|
|
|
51
64
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
52
65
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
53
66
|
Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
|
67
|
+
Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
|
54
68
|
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
|
55
69
|
|
56
70
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
57
71
|
|
58
72
|
@mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
|
73
|
+
@mock_http_client.stub!(:request).with(@mock_http_robot_request).and_return(@mock_http_robot_response)
|
59
74
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
60
75
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
61
76
|
@mock_http_client.stub!(:read_timeout=).and_return(nil)
|
@@ -64,6 +79,16 @@ RSpec.configure do |config|
|
|
64
79
|
@mock_http_client.stub!(:address).and_return("www.baseurl.com")
|
65
80
|
@mock_http_client.stub!(:port).and_return("80 ")
|
66
81
|
|
82
|
+
@mock_http_robot_response.stub!(:code).and_return(200)
|
83
|
+
@mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
|
84
|
+
@mock_http_robot_response.stub!(:content_type).and_return("text/plain")
|
85
|
+
@mock_http_robot_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
86
|
+
@mock_http_robot_response.stub!(:[]).with("location").and_return(@default_headers["location"])
|
87
|
+
@mock_http_robot_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
88
|
+
@mock_http_robot_response.stub!(:content_length).and_return(1024)
|
89
|
+
@mock_http_robot_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
90
|
+
@mock_http_robot_response.stub!(:to_hash).and_return(@default_headers)
|
91
|
+
|
67
92
|
@mock_http_response.stub!(:code).and_return(200)
|
68
93
|
@mock_http_response.stub!(:content_type).and_return("text/html")
|
69
94
|
@mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.57
|
4
|
+
version: 0.0.58
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-06-
|
12
|
+
date: 2012-06-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70328776801460 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70328776801460
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70328776799760 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70328776799760
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70328776798960 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70328776798960
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70328776797840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70328776797840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70328776796300 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70328776796300
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70328776811560 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70328776811560
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70328776810940 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70328776810940
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70328776810380 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70328776810380
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70328776809840 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70328776809840
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70328776809160 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70328776809160
|
124
124
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
125
125
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
126
126
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -136,6 +136,8 @@ files:
|
|
136
136
|
- spec/cobweb/cobweb_links_spec.rb
|
137
137
|
- spec/cobweb/cobweb_spec.rb
|
138
138
|
- spec/cobweb/content_link_parser_spec.rb
|
139
|
+
- spec/cobweb/robots_spec.rb
|
140
|
+
- spec/samples/robots.txt
|
139
141
|
- spec/samples/sample_html_links.html
|
140
142
|
- spec/samples/sample_server.rb
|
141
143
|
- spec/samples/sample_site/boxgrid.html
|
@@ -293,6 +295,7 @@ files:
|
|
293
295
|
- spec/samples/sample_site/js/superfish.js
|
294
296
|
- spec/samples/sample_site/js/supersubs.js
|
295
297
|
- spec/samples/sample_site/more.html
|
298
|
+
- spec/samples/sample_site/robots.txt
|
296
299
|
- spec/samples/sample_site/tables.html
|
297
300
|
- spec/samples/sample_site/text/museosans-webfont.eot
|
298
301
|
- spec/samples/sample_site/text/museosans-webfont.svg
|