cobweb 0.0.57 → 0.0.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
  
- h1. Cobweb v0.0.57
+ h1. Cobweb v0.0.58
  
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
  
  h2. Intro
@@ -41,6 +41,8 @@ h3. Data Returned
  * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
  * :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
  * :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+ * :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
+ * :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
  
  The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
  
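The two new options wire into the CobwebLinks#allowed? method added in data/lib/cobweb_links.rb below. A minimal usage sketch, assuming the options flow through to CobwebLinks as the README describes; the URL and the "mybot" user-agent string are illustrative, not from the gem:

    crawler = CobwebCrawler.new(:cache => false, :obey_robots => true, :user_agent => "mybot")
    statistics = crawler.crawl("http://example.com/")

With :obey_robots left at its default of false, robots.txt is never fetched and every link is treated as allowed.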
data/lib/cobweb.rb CHANGED
@@ -43,6 +43,9 @@ class Cobweb
    default_redis_options_to Hash.new
    default_internal_urls_to []
    default_first_page_redirect_internal_to true
+   default_text_mime_types_to ["text/*", "application/xhtml+xml"]
+   default_obey_robots_to false
+   default_user_agent_to "cobweb"
  
  end
  
@@ -177,7 +180,7 @@ class Cobweb
      content[:character_set] = charset
    end
    content[:length] = response.content_length
-   if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+   if text_content?(content[:mime_type])
      if response["Content-Encoding"]=="gzip"
        content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
      else
@@ -384,5 +387,23 @@ class Cobweb
      content
    end
  
-   end
+   end
+ 
+   private
+   # checks if the mime_type is textual
+   def text_content?(content_type)
+     @options[:text_mime_types].each do |mime_type|
+       return true if content_type.match(escape_pattern_for_regex(mime_type))
+     end
+     false
+   end
+ 
+   # escapes characters with meaning in regular expressions and adds wildcard expression
+   def escape_pattern_for_regex(pattern)
+     pattern = pattern.gsub(".", "\\.")
+     pattern = pattern.gsub("?", "\\?")
+     pattern = pattern.gsub("*", ".*?")
+     pattern
+   end
+ 
  end
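The wildcard entries in :text_mime_types are turned into regular expressions by escape_pattern_for_regex, with * becoming a non-greedy .*?. A quick sketch of the resulting match behaviour, worked out from the code above (the content types are illustrative):

    pattern = "text/*".gsub(".", "\\.").gsub("?", "\\?").gsub("*", ".*?")
    pattern                            # => "text/.*?"
    "text/html".match(pattern)         # => MatchData, so the body is decoded as text
    "application/json".match(pattern)  # => nil, so the body is left untouched

Note the match is unanchored, so any content type containing the pattern qualifies.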
data/lib/cobweb_crawler.rb CHANGED
@@ -64,6 +64,7 @@ class CobwebCrawler
        @redis.incr "crawl-counter"
  
        internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
+       ap internal_links
  
        # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
        cobweb_links = CobwebLinks.new(@options)
data/lib/cobweb_links.rb CHANGED
@@ -17,6 +17,15 @@ class CobwebLinks
  
  end
  
+ def allowed?(link)
+   if @options[:obey_robots]
+     robot = Robots.new(:url => link, :user_agent => @options[:user_agent])
+     return robot.allowed?(link)
+   else
+     return true
+   end
+ end
+ 
  # Returns true if the link is matched to an internal_url and not matched to an external_url
  def internal?(link)
    if @options[:debug]
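A hedged sketch of allowed? in use, based only on the method above (the URLs are illustrative). Note that each call with :obey_robots enabled constructs a fresh Robots object, so robots.txt is fetched again for every link checked:

    links = CobwebLinks.new(:obey_robots => true, :user_agent => "cobweb")
    links.allowed?("http://example.com/private/")   # fetches the site's robots.txt and checks the path

    links = CobwebLinks.new(:obey_robots => false)
    links.allowed?("http://example.com/private/")   # => true, robots.txt is never consulted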
data/lib/cobweb_version.rb CHANGED
@@ -3,7 +3,7 @@ class CobwebVersion
  
    # Returns a string of the current version
    def self.version
-     "0.0.57"
+     "0.0.58"
    end
  
  end
data/lib/robots.rb CHANGED
@@ -2,10 +2,78 @@
  class Robots
  
    # Processes the robots.txt file
-   def initialize(url, file_name="robots.txt")
+   def initialize(options)
+     @options = options
+     raise "options should be a hash" unless options.kind_of? Hash
+     raise ":url is required" unless @options.has_key? :url
+     @options[:file] = "robots.txt" unless @options.has_key? :file
+     @options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
+ 
+     uri = URI.parse(@options[:url])
+     content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
+     if content[:mime_type][0..4] == "text/"
+       @raw_data = parse_data(content[:body])
+ 
+       if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
+         @params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
+       else
+         raise "Wildcard user-agent is not present" unless @raw_data.has_key? :*
+         @params = @raw_data[:*]
+       end
+     else
+       raise "Invalid mime type: #{content[:content_type]}"
+     end
+   end
+ 
+   def allowed?(url)
      uri = URI.parse(url)
-     [uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
-     Cobweb.new(:cache => 6000).get([uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join)
+     @params[:allow].each do |pattern|
+       return true if uri.path.match(escape_pattern_for_regex(pattern))
+     end
+     @params[:disallow].each do |pattern|
+       return false if uri.path.match(escape_pattern_for_regex(pattern))
+     end
+     true
+   end
+ 
+   def user_agent_settings
+     @params
+   end
+ 
+   def contents
+     @raw_data
+   end
+ 
+   private
+   # escapes characters with meaning in regular expressions and adds wildcard expression
+   def escape_pattern_for_regex(pattern)
+     pattern = pattern.gsub(".", "\\.")
+     pattern = pattern.gsub("?", "\\?")
+     pattern = pattern.gsub("*", ".*?")
+     pattern
+   end
+ 
+   def parse_data(data)
+     user_agents = {}
+     lines = data.split("\n")
+     lines.map!{|line| line.strip}
+     lines.reject!{|line| line == "" || line[0] == "#"}
+     current_user_agent = nil
  
+     lines.each do |line|
+       if line[0..10].downcase == "user-agent:"
+         current_user_agent = line.split(":")[1..-1].join.downcase.strip.to_sym
+         user_agents[current_user_agent] = {:allow => [], :disallow => []}
+       else
+         if current_user_agent
+           values = line.split(":")
+           unless values[1..-1].join.strip == ""
+             user_agents[current_user_agent][values[0].downcase.strip.to_sym] = [] unless user_agents[current_user_agent].has_key? values[0].downcase.to_sym
+             user_agents[current_user_agent][values[0].downcase.strip.to_sym] << values[1..-1].join.strip
+           end
+         end
+       end
+     end
+     user_agents
    end
  end
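The reworked class fetches and parses the file once at construction time; allowed? then only matches paths. A sketch against the "User-agent: google" section of the sample robots.txt added below (the host is illustrative):

    robot = Robots.new(:url => "http://example.com/", :user_agent => "google")
    robot.allowed?("http://example.com/anything")   # => false, google is caught by "Disallow: /"
    robot.user_agent_settings                       # the :allow/:disallow lists chosen for this agent
    robot.contents                                  # the full parsed robots.txt hash

Allow patterns are checked before Disallow patterns, so an Allow match wins when both apply.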
data/lib/server.rb CHANGED
@@ -5,7 +5,6 @@ require 'haml'
  class Server < Sinatra::Base
  
    set :views, settings.root + '/../views'
-   puts "#{settings.root}/../public"
    set :public_folder, settings.root + '/../public'
    enable :static
  
data/spec/cobweb/cobweb_crawler_spec.rb CHANGED
@@ -4,7 +4,7 @@ describe CobwebCrawler do
  
    before(:each) do
  
-     @base_url = "http://www.baseurl.com/"
+     @base_url = "http://localhost:3532/"
  
      @default_headers = {"Cache-Control" => "private, max-age=0",
                          "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
@@ -27,9 +27,9 @@ describe CobwebCrawler do
  
      # temporary tests to run crawler - proper specs to follow.. honest
  
-     crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+     crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
  
-     statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
+     statistics = crawler.crawl(@base_url)
  
      statistics.should_not be_nil
      statistics.should be_an_instance_of Hash
@@ -42,7 +42,7 @@ describe CobwebCrawler do
  
      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
  
-     statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
+     statistics = crawler.crawl(@base_url) do |content, statistics|
        content[:url].should_not be_nil
        statistics[:average_length].should_not be_nil
      end
data/spec/cobweb/cobweb_spec.rb CHANGED
@@ -12,15 +12,6 @@ describe Cobweb, :local_only => true do
      io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
      puts "Workers Started."
  
-     # START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
-     @thin = nil
-     Thread.new do
-       @thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
-     end
- 
-     # WAIT FOR START TO COMPLETE
-     sleep 1
- 
    end
  
    before(:each) do
@@ -147,7 +138,6 @@ describe Cobweb, :local_only => true do
      @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
      command = "kill #{(@all_processes - @existing_processes).join(" ")}"
      IO.popen(command)
-     #@thin.stop!
    end
  
  end
data/spec/cobweb/robots_spec.rb ADDED
@@ -0,0 +1,70 @@
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+ 
+ describe Robots do
+ 
+   before(:each) do
+     @cobweb = Cobweb.new :quiet => true, :cache => nil
+   end
+ 
+   describe "default user-agent" do
+     before(:each) do
+       @options = {:url => "http://localhost/"}
+     end
+ 
+     it "should parse a valid robots.txt" do
+       lambda {Robots.new(@options)}.should_not raise_error
+     end
+ 
+     it "should allow urls marked as allow" do
+       robot = Robots.new(@options)
+       robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
+     end
+     it "should disallow urls specified as disallow" do
+       robot = Robots.new(@options)
+       robot.allowed?("http://localhost/globalmarketfinder/").should be_false
+       robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
+     end
+     it "should allow urls not listed" do
+       robot = Robots.new(@options)
+       robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
+     end
+ 
+   end
+ 
+   describe "google user-agent" do
+     before(:each) do
+       @options = {:url => "http://localhost/", :user_agent => "google"}
+     end
+     it "should parse a valid robots.txt" do
+       lambda {Robots.new(@options)}.should_not raise_error
+     end
+ 
+     it "should disallow all urls" do
+       robot = Robots.new(@options)
+       robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_false
+       robot.allowed?("http://localhost/globalmarketfinder/").should be_false
+       robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
+       robot.allowed?("http://localhost/notlistedinrobotsfile").should be_false
+     end
+ 
+   end
+ 
+   describe "cybermapper user-agent" do
+     before(:each) do
+       @options = {:url => "http://localhost/", :user_agent => "cybermapper"}
+     end
+     it "should parse a valid robots.txt" do
+       lambda {Robots.new(@options)}.should_not raise_error
+     end
+ 
+     it "should disallow all urls" do
+       robot = Robots.new(@options)
+       robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
+       robot.allowed?("http://localhost/globalmarketfinder/").should be_true
+       robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_true
+       robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
+     end
+ 
+   end
+ 
+ end
data/spec/samples/robots.txt ADDED
@@ -0,0 +1,294 @@
+ # robots.txt for http://www.example.com/
+ 
+ User-agent: cybermapper
+ Disallow:
+ 
+ User-agent: google
+ Disallow: /
+ 
+ User-agent: *
+ Disallow: /search
+ Disallow: /sdch
+ Disallow: /groups
+ Disallow: /images
+ Disallow: /catalogs
+ Allow: /catalogs/about
+ Allow: /catalogs/p?
+ Disallow: /catalogues
+ Disallow: /news
+ Allow: /news/directory
+ Disallow: /nwshp
+ Disallow: /setnewsprefs?
+ Disallow: /index.html?
+ Disallow: /?
+ Allow: /?hl=
+ Disallow: /?hl=*&
+ Disallow: /addurl/image?
+ Disallow: /pagead/
+ Disallow: /relpage/
+ Disallow: /relcontent
+ Disallow: /imgres
+ Disallow: /imglanding
+ Disallow: /sbd
+ Disallow: /keyword/
+ Disallow: /u/
+ Disallow: /univ/
+ Disallow: /cobrand
+ Disallow: /custom
+ Disallow: /advanced_group_search
+ Disallow: /googlesite
+ Disallow: /preferences
+ Disallow: /setprefs
+ Disallow: /swr
+ Disallow: /url
+ Disallow: /default
+ Disallow: /m?
+ Disallow: /m/?
+ Disallow: /m/blogs?
+ Disallow: /m/directions?
+ Disallow: /m/ig
+ Disallow: /m/images?
+ Disallow: /m/imgres?
+ Disallow: /m/local?
+ Disallow: /m/movies?
+ Disallow: /m/news?
+ Disallow: /m/news/i?
+ Disallow: /m/place?
+ Disallow: /m/products?
+ Disallow: /m/products/
+ Disallow: /m/setnewsprefs?
+ Disallow: /m/search?
+ Disallow: /m/swmloptin?
+ Disallow: /m/trends
+ Disallow: /m/video?
+ Disallow: /wml?
+ Disallow: /wml/?
+ Disallow: /wml/search?
+ Disallow: /xhtml?
+ Disallow: /xhtml/?
+ Disallow: /xhtml/search?
+ Disallow: /xml?
+ Disallow: /imode?
+ Disallow: /imode/?
+ Disallow: /imode/search?
+ Disallow: /jsky?
+ Disallow: /jsky/?
+ Disallow: /jsky/search?
+ Disallow: /pda?
+ Disallow: /pda/?
+ Disallow: /pda/search?
+ Disallow: /sprint_xhtml
+ Disallow: /sprint_wml
+ Disallow: /pqa
+ Disallow: /palm
+ Disallow: /gwt/
+ Disallow: /purchases
+ Disallow: /hws
+ Disallow: /bsd?
+ Disallow: /linux?
+ Disallow: /mac?
+ Disallow: /microsoft?
+ Disallow: /unclesam?
+ Disallow: /answers/search?q=
+ Disallow: /local?
+ Disallow: /local_url
+ Disallow: /shihui?
+ Disallow: /shihui/
+ Disallow: /froogle?
+ Disallow: /products?
+ Disallow: /products/
+ Disallow: /froogle_
+ Disallow: /product_
+ Disallow: /products_
+ Disallow: /products;
+ Disallow: /print
+ Disallow: /books/
+ Disallow: /bkshp?*q=*
+ Disallow: /books?*q=*
+ Disallow: /books?*output=*
+ Disallow: /books?*pg=*
+ Disallow: /books?*jtp=*
+ Disallow: /books?*jscmd=*
+ Disallow: /books?*buy=*
+ Disallow: /books?*zoom=*
+ Allow: /books?*q=related:*
+ Allow: /books?*q=editions:*
+ Allow: /books?*q=subject:*
+ Allow: /books/about
+ Allow: /booksrightsholders
+ Allow: /books?*zoom=1*
+ Allow: /books?*zoom=5*
+ Disallow: /ebooks/
+ Disallow: /ebooks?*q=*
+ Disallow: /ebooks?*output=*
+ Disallow: /ebooks?*pg=*
+ Disallow: /ebooks?*jscmd=*
+ Disallow: /ebooks?*buy=*
+ Disallow: /ebooks?*zoom=*
+ Allow: /ebooks?*q=related:*
+ Allow: /ebooks?*q=editions:*
+ Allow: /ebooks?*q=subject:*
+ Allow: /ebooks?*zoom=1*
+ Allow: /ebooks?*zoom=5*
+ Disallow: /patents?
+ Allow: /patents?id=
+ Allow: /patents?vid=
+ Disallow: /scholar
+ Disallow: /citations?
+ Allow: /citations?user=
+ Allow: /citations?view_op=new_profile
+ Allow: /citations?view_op=top_venues
+ Disallow: /complete
+ Disallow: /s?
+ Disallow: /sponsoredlinks
+ Disallow: /videosearch?
+ Disallow: /videopreview?
+ Disallow: /videoprograminfo?
+ Disallow: /maps?
+ Disallow: /mapstt?
+ Disallow: /mapslt?
+ Disallow: /maps/stk/
+ Disallow: /maps/br?
+ Disallow: /mapabcpoi?
+ Disallow: /maphp?
+ Disallow: /mapprint?
+ Disallow: /maps/api/js/StaticMapService.GetMapImage?
+ Disallow: /maps/api/staticmap?
+ Disallow: /mld?
+ Disallow: /staticmap?
+ Disallow: /places/
+ Allow: /places/$
+ Disallow: /maps/place
+ Disallow: /help/maps/streetview/partners/welcome/
+ Disallow: /lochp?
+ Disallow: /center
+ Disallow: /ie?
+ Disallow: /sms/demo?
+ Disallow: /katrina?
+ Disallow: /blogsearch?
+ Disallow: /blogsearch/
+ Disallow: /blogsearch_feeds
+ Disallow: /advanced_blog_search
+ Disallow: /reader/
+ Allow: /reader/play
+ Disallow: /uds/
+ Disallow: /chart?
+ Disallow: /transit?
+ Disallow: /mbd?
+ Disallow: /extern_js/
+ Disallow: /calendar/feeds/
+ Disallow: /calendar/ical/
+ Disallow: /cl2/feeds/
+ Disallow: /cl2/ical/
+ Disallow: /coop/directory
+ Disallow: /coop/manage
+ Disallow: /trends?
+ Disallow: /trends/music?
+ Disallow: /trends/hottrends?
+ Disallow: /trends/viz?
+ Disallow: /notebook/search?
+ Disallow: /musica
+ Disallow: /musicad
+ Disallow: /musicas
+ Disallow: /musicl
+ Disallow: /musics
+ Disallow: /musicsearch
+ Disallow: /musicsp
+ Disallow: /musiclp
+ Disallow: /browsersync
+ Disallow: /call
+ Disallow: /archivesearch?
+ Disallow: /archivesearch/url
+ Disallow: /archivesearch/advanced_search
+ Disallow: /base/reportbadoffer
+ Disallow: /urchin_test/
+ Disallow: /movies?
+ Disallow: /codesearch?
+ Disallow: /codesearch/feeds/search?
+ Disallow: /wapsearch?
+ Disallow: /safebrowsing
+ Allow: /safebrowsing/diagnostic
+ Allow: /safebrowsing/report_badware/
+ Allow: /safebrowsing/report_error/
+ Allow: /safebrowsing/report_phish/
+ Disallow: /reviews/search?
+ Disallow: /orkut/albums
+ Allow: /jsapi
+ Disallow: /views?
+ Disallow: /c/
+ Disallow: /cbk
+ Allow: /cbk?output=tile&cb_client=maps_sv
+ Disallow: /recharge/dashboard/car
+ Disallow: /recharge/dashboard/static/
+ Disallow: /translate_a/
+ Disallow: /translate_c
+ Disallow: /translate_f
+ Disallow: /translate_static/
+ Disallow: /translate_suggestion
+ Disallow: /profiles/me
+ Allow: /profiles
+ Disallow: /s2/profiles/me
+ Allow: /s2/profiles
+ Allow: /s2/photos
+ Allow: /s2/static
+ Disallow: /s2
+ Disallow: /transconsole/portal/
+ Disallow: /gcc/
+ Disallow: /aclk
+ Disallow: /cse?
+ Disallow: /cse/home
+ Disallow: /cse/panel
+ Disallow: /cse/manage
+ Disallow: /tbproxy/
+ Disallow: /imesync/
+ Disallow: /shenghuo/search?
+ Disallow: /support/forum/search?
+ Disallow: /reviews/polls/
+ Disallow: /hosted/images/
+ Disallow: /ppob/?
+ Disallow: /ppob?
+ Disallow: /ig/add?
+ Disallow: /adwordsresellers
+ Disallow: /accounts/o8
+ Allow: /accounts/o8/id
+ Disallow: /topicsearch?q=
+ Disallow: /xfx7/
+ Disallow: /squared/api
+ Disallow: /squared/search
+ Disallow: /squared/table
+ Disallow: /toolkit/
+ Allow: /toolkit/*.html
+ Disallow: /globalmarketfinder/
+ Allow: /globalmarketfinder/*.html
+ Disallow: /qnasearch?
+ Disallow: /app/updates
+ Disallow: /sidewiki/entry/
+ Disallow: /quality_form?
+ Disallow: /labs/popgadget/search
+ Disallow: /buzz/post
+ Disallow: /compressiontest/
+ Disallow: /analytics/reporting/
+ Disallow: /analytics/admin/
+ Disallow: /analytics/web/
+ Disallow: /analytics/feeds/
+ Disallow: /analytics/settings/
+ Disallow: /alerts/
+ Disallow: /ads/preferences/
+ Allow: /ads/preferences/html/
+ Allow: /ads/preferences/plugin
+ Disallow: /settings/ads/onweb/
+ Disallow: /phone/compare/?
+ Allow: /alerts/manage
+ Disallow: /travel/clk
+ Disallow: /hotelfinder/rpc
+ Disallow: /flights/rpc
+ Disallow: /commercesearch/services/
+ Disallow: /evaluation/
+ Disallow: /webstore/search
+ Disallow: /chrome/browser/mobile/tour
+ Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+ Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+ Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
+ Sitemap: http://www.google.com/sitemaps_webmasters.xml
+ Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
+ Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
data/spec/samples/sample_site/robots.txt ADDED
@@ -0,0 +1,13 @@
+ # robots.txt for http://www.example.com/
+ 
+ User-agent: cybermapper
+ Disallow:
+ 
+ User-agent: google
+ Disallow: /
+ 
+ User-agent: *
+ Disallow: /forms
+ Disallow: /gallery
+ Disallow: /more.html
+ Allow: /catalogs/about
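For reference, feeding this 13-line sample through the parse_data method in data/lib/robots.rb should produce roughly the hash below (worked out from the parsing code above, not taken from the gem's documentation); the empty Disallow value for cybermapper is skipped, leaving its lists empty:

    {
      :cybermapper => {:allow => [], :disallow => []},
      :google      => {:allow => [], :disallow => ["/"]},
      :*           => {:allow => ["/catalogs/about"],
                       :disallow => ["/forms", "/gallery", "/more.html"]}
    }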
data/spec/spec_helper.rb CHANGED
@@ -13,6 +13,17 @@ RSpec.configure do |config|
      config.filter_run_excluding :local_only => true
    end
  
+   config.before(:all) {
+     # START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
+     @thin = nil
+     Thread.new do
+       @thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+     end
+ 
+     # WAIT FOR START TO COMPLETE
+     sleep 1
+   }
+ 
    config.before(:each) {
  
      #redis_mock = double("redis")
@@ -40,10 +51,12 @@ RSpec.configure do |config|
  
      @mock_http_client = mock(Net::HTTP)
      @mock_http_request = mock(Net::HTTPRequest)
+     @mock_http_robot_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request2 = mock(Net::HTTPRequest)
  
      @mock_http_response = mock(Net::HTTPResponse)
+     @mock_http_robot_response = mock(Net::HTTPResponse)
      @mock_http_redirect_response = mock(Net::HTTPRedirection)
      @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
      @mock_http_get = mock(Net::HTTP::Get)
@@ -51,11 +64,13 @@ RSpec.configure do |config|
      Net::HTTP.stub!(:new).and_return(@mock_http_client)
      Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
      Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
+     Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
      Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
  
      Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
  
      @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
+     @mock_http_client.stub!(:request).with(@mock_http_robot_request).and_return(@mock_http_robot_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
      @mock_http_client.stub!(:read_timeout=).and_return(nil)
@@ -64,6 +79,16 @@ RSpec.configure do |config|
      @mock_http_client.stub!(:address).and_return("www.baseurl.com")
      @mock_http_client.stub!(:port).and_return("80 ")
  
+     @mock_http_robot_response.stub!(:code).and_return(200)
+     @mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
+     @mock_http_robot_response.stub!(:content_type).and_return("text/plain")
+     @mock_http_robot_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
+     @mock_http_robot_response.stub!(:[]).with("location").and_return(@default_headers["location"])
+     @mock_http_robot_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
+     @mock_http_robot_response.stub!(:content_length).and_return(1024)
+     @mock_http_robot_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
+     @mock_http_robot_response.stub!(:to_hash).and_return(@default_headers)
+ 
      @mock_http_response.stub!(:code).and_return(200)
      @mock_http_response.stub!(:content_type).and_return("text/html")
      @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
-   version: 0.0.57
+   version: 0.0.58
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-06-13 00:00:00.000000000 Z
+ date: 2012-06-30 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: resque
-   requirement: &70194152226040 !ruby/object:Gem::Requirement
+   requirement: &70328776801460 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152226040
+   version_requirements: *70328776801460
  - !ruby/object:Gem::Dependency
    name: redis
-   requirement: &70194152224380 !ruby/object:Gem::Requirement
+   requirement: &70328776799760 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152224380
+   version_requirements: *70328776799760
  - !ruby/object:Gem::Dependency
    name: nokogiri
-   requirement: &70194152223560 !ruby/object:Gem::Requirement
+   requirement: &70328776798960 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152223560
+   version_requirements: *70328776798960
  - !ruby/object:Gem::Dependency
    name: addressable
-   requirement: &70194152222500 !ruby/object:Gem::Requirement
+   requirement: &70328776797840 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152222500
+   version_requirements: *70328776797840
  - !ruby/object:Gem::Dependency
    name: rspec
-   requirement: &70194152220920 !ruby/object:Gem::Requirement
+   requirement: &70328776796300 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152220920
+   version_requirements: *70328776796300
  - !ruby/object:Gem::Dependency
    name: awesome_print
-   requirement: &70194152236080 !ruby/object:Gem::Requirement
+   requirement: &70328776811560 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152236080
+   version_requirements: *70328776811560
  - !ruby/object:Gem::Dependency
    name: sinatra
-   requirement: &70194152235480 !ruby/object:Gem::Requirement
+   requirement: &70328776810940 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152235480
+   version_requirements: *70328776810940
  - !ruby/object:Gem::Dependency
    name: thin
-   requirement: &70194152234920 !ruby/object:Gem::Requirement
+   requirement: &70328776810380 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152234920
+   version_requirements: *70328776810380
  - !ruby/object:Gem::Dependency
    name: haml
-   requirement: &70194152234340 !ruby/object:Gem::Requirement
+   requirement: &70328776809840 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70194152234340
+   version_requirements: *70328776809840
  - !ruby/object:Gem::Dependency
    name: namespaced_redis
-   requirement: &70194152233680 !ruby/object:Gem::Requirement
+   requirement: &70328776809160 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
          version: 1.0.2
    type: :runtime
    prerelease: false
-   version_requirements: *70194152233680
+   version_requirements: *70328776809160
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
    crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
    is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -136,6 +136,8 @@ files:
  - spec/cobweb/cobweb_links_spec.rb
  - spec/cobweb/cobweb_spec.rb
  - spec/cobweb/content_link_parser_spec.rb
+ - spec/cobweb/robots_spec.rb
+ - spec/samples/robots.txt
  - spec/samples/sample_html_links.html
  - spec/samples/sample_server.rb
  - spec/samples/sample_site/boxgrid.html
@@ -293,6 +295,7 @@ files:
  - spec/samples/sample_site/js/superfish.js
  - spec/samples/sample_site/js/supersubs.js
  - spec/samples/sample_site/more.html
+ - spec/samples/sample_site/robots.txt
  - spec/samples/sample_site/tables.html
  - spec/samples/sample_site/text/museosans-webfont.eot
  - spec/samples/sample_site/text/museosans-webfont.svg