cobweb 0.0.57 → 0.0.58

data/README.textile CHANGED
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.57
+ h1. Cobweb v0.0.58
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
 
 h2. Intro
@@ -41,6 +41,8 @@ h3. Data Returned
 * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
 * :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
 * :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+ * :obey_robots - boolean determining if robots.txt should be honoured (default: false)
+ * :user_agent - user agent string to match in robots.txt (not sent as the user_agent of requests yet) (default: cobweb)
 
 The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as I have time!)
 
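For context, a minimal sketch of how the two new options above might be passed to a standalone crawl (the CobwebCrawler form is taken from the specs further down; the URL is illustrative):

    # honour robots.txt, matching rules under the "cobweb" user-agent
    crawler = CobwebCrawler.new(:cache => false, :obey_robots => true, :user_agent => "cobweb")
    statistics = crawler.crawl("http://example.com/")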
data/lib/cobweb.rb CHANGED
@@ -43,6 +43,9 @@ class Cobweb
 default_redis_options_to Hash.new
 default_internal_urls_to []
 default_first_page_redirect_internal_to true
+ default_text_mime_types_to ["text/*", "application/xhtml+xml"]
+ default_obey_robots_to false
+ default_user_agent_to "cobweb"
 
 end
 
@@ -177,7 +180,7 @@ class Cobweb
 content[:character_set] = charset
 end
 content[:length] = response.content_length
- if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+ if text_content?(content[:mime_type])
 if response["Content-Encoding"]=="gzip"
 content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
 else
@@ -384,5 +387,23 @@ class Cobweb
 content
 end
 
- end
+ end
+
+ private
+ # checks if the mime_type is textual
+ def text_content?(content_type)
+ @options[:text_mime_types].each do |mime_type|
+ return true if content_type.match(escape_pattern_for_regex(mime_type))
+ end
+ false
+ end
+
+ # escapes characters with meaning in regular expressions and adds wildcard expression
+ def escape_pattern_for_regex(pattern)
+ pattern = pattern.gsub(".", "\\.")
+ pattern = pattern.gsub("?", "\\?")
+ pattern = pattern.gsub("*", ".*?")
+ pattern
+ end
+
 end
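A note on the helpers just added: escape_pattern_for_regex escapes only ".", "?" and "*", giving simple glob-style matching. A rough sketch of what text_content? ends up comparing, using the default :text_mime_types (example content types are illustrative):

    pattern = "text/*".gsub(".", "\\.").gsub("?", "\\?").gsub("*", ".*?")
    # => "text/.*?"
    "text/html; charset=utf-8".match(pattern)   # truthy, so the body is treated as text
    "image/png".match(pattern)                  # nil, so the body is left as binary

One caveat worth noting: "+" is not escaped, so in "application/xhtml+xml" the "l+" is read as a regex quantifier and the pattern appears unable to match that exact content type literally.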
data/lib/cobweb_crawler.rb CHANGED
@@ -64,6 +64,7 @@ class CobwebCrawler
 @redis.incr "crawl-counter"
 
 internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
+ ap internal_links
 
 # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
 cobweb_links = CobwebLinks.new(@options)
data/lib/cobweb_links.rb CHANGED
@@ -17,6 +17,15 @@ class CobwebLinks
 
 end
 
+ def allowed?(link)
+ if @options[:obey_robots]
+ robot = Robots.new(:url => link, :user_agent => @options[:user_agent])
+ return robot.allowed?(link)
+ else
+ return true
+ end
+ end
+
 # Returns true if the link is matched to an internal_url and not matched to an external_url
 def internal?(link)
 if @options[:debug]
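A quick sketch of the new gate in use (the option names mirror the README; the :internal_urls value is an assumption based on how CobwebLinks is constructed elsewhere):

    links = CobwebLinks.new(:internal_urls => ["http://example.com/*"], :obey_robots => true, :user_agent => "cobweb")
    links.allowed?("http://example.com/page.html")   # consults robots.txt only when :obey_robots is set; otherwise always true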
data/lib/cobweb_version.rb CHANGED
@@ -3,7 +3,7 @@ class CobwebVersion
 
 # Returns a string of the current version
 def self.version
- "0.0.57"
+ "0.0.58"
 end
 
 end
data/lib/robots.rb CHANGED
@@ -2,10 +2,78 @@
 class Robots
 
 # Processes the robots.txt file
- def initialize(url, file_name="robots.txt")
+ def initialize(options)
+ @options = options
+ raise "options should be a hash" unless options.kind_of? Hash
+ raise ":url is required" unless @options.has_key? :url
+ @options[:file] = "robots.txt" unless @options.has_key? :file
+ @options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
+
+ uri = URI.parse(@options[:url])
+ content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
+ if content[:mime_type][0..4] == "text/"
+ @raw_data = parse_data(content[:body])
+
+ if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
+ @params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
+ else
+ raise "Wildcard user-agent is not present" unless @raw_data.has_key? :*
+ @params = @raw_data[:*]
+ end
+ else
+ raise "Invalid mime type: #{content[:content_type]}"
+ end
+ end
+
+ def allowed?(url)
 uri = URI.parse(url)
- [uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
- Cobweb.new(:cache => 6000).get([uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join)
+ @params[:allow].each do |pattern|
+ return true if uri.path.match(escape_pattern_for_regex(pattern))
+ end
+ @params[:disallow].each do |pattern|
+ return false if uri.path.match(escape_pattern_for_regex(pattern))
+ end
+ true
+ end
+
+ def user_agent_settings
+ @params
+ end
+
+ def contents
+ @raw_data
+ end
+
+ private
+ # escapes characters with meaning in regular expressions and adds wildcard expression
+ def escape_pattern_for_regex(pattern)
+ pattern = pattern.gsub(".", "\\.")
+ pattern = pattern.gsub("?", "\\?")
+ pattern = pattern.gsub("*", ".*?")
+ pattern
+ end
+
+ def parse_data(data)
+ user_agents = {}
+ lines = data.split("\n")
+ lines.map!{|line| line.strip}
+ lines.reject!{|line| line == "" || line[0] == "#"}
+ current_user_agent = nil
 
+ lines.each do |line|
+ if line[0..10].downcase == "user-agent:"
+ current_user_agent = line.split(":")[1..-1].join.downcase.strip.to_sym
+ user_agents[current_user_agent] = {:allow => [], :disallow => []}
+ else
+ if current_user_agent
+ values = line.split(":")
+ unless values[1..-1].join.strip == ""
+ user_agents[current_user_agent][values[0].downcase.strip.to_sym] = [] unless user_agents[current_user_agent].has_key? values[0].downcase.to_sym
+ user_agents[current_user_agent][values[0].downcase.strip.to_sym] << values[1..-1].join.strip
+ end
+ end
+ end
+ end
+ user_agents
 end
 end
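Putting the reworked class together, a minimal usage sketch (URL illustrative):

    robot = Robots.new(:url => "http://example.com/", :user_agent => "cobweb")
    robot.allowed?("http://example.com/some/path")   # true unless a disallow pattern matches first
    robot.user_agent_settings                        # the :allow/:disallow lists selected for this agent

Note that allowed? checks :allow patterns before :disallow, so a specific Allow rule wins over a broader Disallow, which is the behaviour the sample specs below rely on.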
data/lib/server.rb CHANGED
@@ -5,7 +5,6 @@ require 'haml'
 class Server < Sinatra::Base
 
 set :views, settings.root + '/../views'
- puts "#{settings.root}/../public"
 set :public_folder, settings.root + '/../public'
 enable :static
 
data/spec/cobweb/cobweb_crawler_spec.rb CHANGED
@@ -4,7 +4,7 @@ describe CobwebCrawler do
 
 before(:each) do
 
- @base_url = "http://www.baseurl.com/"
+ @base_url = "http://localhost:3532/"
 
 @default_headers = {"Cache-Control" => "private, max-age=0",
 "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
@@ -27,9 +27,9 @@ describe CobwebCrawler do
 
 # temporary tests to run crawler - proper specs to follow.. honest
 
- crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+ crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
 
- statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
+ statistics = crawler.crawl(@base_url)
 
 statistics.should_not be_nil
 statistics.should be_an_instance_of Hash
@@ -42,7 +42,7 @@ describe CobwebCrawler do
 
 crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
 
- statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
+ statistics = crawler.crawl(@base_url) do |content, statistics|
 content[:url].should_not be_nil
 statistics[:average_length].should_not be_nil
 end
data/spec/cobweb/cobweb_spec.rb CHANGED
@@ -12,15 +12,6 @@ describe Cobweb, :local_only => true do
 io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &")
 puts "Workers Started."
 
- # START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
- @thin = nil
- Thread.new do
- @thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
- end
-
- # WAIT FOR START TO COMPLETE
- sleep 1
-
 end
 
 before(:each) do
@@ -147,7 +138,6 @@ describe Cobweb, :local_only => true do
 @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
 command = "kill #{(@all_processes - @existing_processes).join(" ")}"
 IO.popen(command)
- #@thin.stop!
 end
 
 end
data/spec/cobweb/robots_spec.rb ADDED
@@ -0,0 +1,70 @@
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+ describe Robots do
+
+ before(:each) do
+ @cobweb = Cobweb.new :quiet => true, :cache => nil
+ end
+
+ describe "default user-agent" do
+ before(:each) do
+ @options = {:url => "http://localhost/"}
+ end
+
+ it "should parse a valid robots.txt" do
+ lambda {Robots.new(@options)}.should_not raise_error
+ end
+
+ it "should allow urls marked as allow" do
+ robot = Robots.new(@options)
+ robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
+ end
+ it "should disallow urls specified as disallow" do
+ robot = Robots.new(@options)
+ robot.allowed?("http://localhost/globalmarketfinder/").should be_false
+ robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
+ end
+ it "should allow urls not listed" do
+ robot = Robots.new(@options)
+ robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
+ end
+
+ end
+
+ describe "google user-agent" do
+ before(:each) do
+ @options = {:url => "http://localhost/", :user_agent => "google"}
+ end
+ it "should parse a valid robots.txt" do
+ lambda {Robots.new(@options)}.should_not raise_error
+ end
+
+ it "should disallow all urls" do
+ robot = Robots.new(@options)
+ robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_false
+ robot.allowed?("http://localhost/globalmarketfinder/").should be_false
+ robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
+ robot.allowed?("http://localhost/notlistedinrobotsfile").should be_false
+ end
+
+ end
+
+ describe "cybermapper user-agent" do
+ before(:each) do
+ @options = {:url => "http://localhost/", :user_agent => "cybermapper"}
+ end
+ it "should parse a valid robots.txt" do
+ lambda {Robots.new(@options)}.should_not raise_error
+ end
+
+ it "should allow all urls" do
+ robot = Robots.new(@options)
+ robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
+ robot.allowed?("http://localhost/globalmarketfinder/").should be_true
+ robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_true
+ robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
+ end
+
+ end
+
+ end
data/spec/samples/robots.txt ADDED
@@ -0,0 +1,294 @@
+ # robots.txt for http://www.example.com/
+
+ User-agent: cybermapper
+ Disallow:
+
+ User-agent: google
+ Disallow: /
+
+ User-agent: *
+ Disallow: /search
+ Disallow: /sdch
+ Disallow: /groups
+ Disallow: /images
+ Disallow: /catalogs
+ Allow: /catalogs/about
+ Allow: /catalogs/p?
+ Disallow: /catalogues
+ Disallow: /news
+ Allow: /news/directory
+ Disallow: /nwshp
+ Disallow: /setnewsprefs?
+ Disallow: /index.html?
+ Disallow: /?
+ Allow: /?hl=
+ Disallow: /?hl=*&
+ Disallow: /addurl/image?
+ Disallow: /pagead/
+ Disallow: /relpage/
+ Disallow: /relcontent
+ Disallow: /imgres
+ Disallow: /imglanding
+ Disallow: /sbd
+ Disallow: /keyword/
+ Disallow: /u/
+ Disallow: /univ/
+ Disallow: /cobrand
+ Disallow: /custom
+ Disallow: /advanced_group_search
+ Disallow: /googlesite
+ Disallow: /preferences
+ Disallow: /setprefs
+ Disallow: /swr
+ Disallow: /url
+ Disallow: /default
+ Disallow: /m?
+ Disallow: /m/?
+ Disallow: /m/blogs?
+ Disallow: /m/directions?
+ Disallow: /m/ig
+ Disallow: /m/images?
+ Disallow: /m/imgres?
+ Disallow: /m/local?
+ Disallow: /m/movies?
+ Disallow: /m/news?
+ Disallow: /m/news/i?
+ Disallow: /m/place?
+ Disallow: /m/products?
+ Disallow: /m/products/
+ Disallow: /m/setnewsprefs?
+ Disallow: /m/search?
+ Disallow: /m/swmloptin?
+ Disallow: /m/trends
+ Disallow: /m/video?
+ Disallow: /wml?
+ Disallow: /wml/?
+ Disallow: /wml/search?
+ Disallow: /xhtml?
+ Disallow: /xhtml/?
+ Disallow: /xhtml/search?
+ Disallow: /xml?
+ Disallow: /imode?
+ Disallow: /imode/?
+ Disallow: /imode/search?
+ Disallow: /jsky?
+ Disallow: /jsky/?
+ Disallow: /jsky/search?
+ Disallow: /pda?
+ Disallow: /pda/?
+ Disallow: /pda/search?
+ Disallow: /sprint_xhtml
+ Disallow: /sprint_wml
+ Disallow: /pqa
+ Disallow: /palm
+ Disallow: /gwt/
+ Disallow: /purchases
+ Disallow: /hws
+ Disallow: /bsd?
+ Disallow: /linux?
+ Disallow: /mac?
+ Disallow: /microsoft?
+ Disallow: /unclesam?
+ Disallow: /answers/search?q=
+ Disallow: /local?
+ Disallow: /local_url
+ Disallow: /shihui?
+ Disallow: /shihui/
+ Disallow: /froogle?
+ Disallow: /products?
+ Disallow: /products/
+ Disallow: /froogle_
+ Disallow: /product_
+ Disallow: /products_
+ Disallow: /products;
+ Disallow: /print
+ Disallow: /books/
+ Disallow: /bkshp?*q=*
+ Disallow: /books?*q=*
+ Disallow: /books?*output=*
+ Disallow: /books?*pg=*
+ Disallow: /books?*jtp=*
+ Disallow: /books?*jscmd=*
+ Disallow: /books?*buy=*
+ Disallow: /books?*zoom=*
+ Allow: /books?*q=related:*
+ Allow: /books?*q=editions:*
+ Allow: /books?*q=subject:*
+ Allow: /books/about
+ Allow: /booksrightsholders
+ Allow: /books?*zoom=1*
+ Allow: /books?*zoom=5*
+ Disallow: /ebooks/
+ Disallow: /ebooks?*q=*
+ Disallow: /ebooks?*output=*
+ Disallow: /ebooks?*pg=*
+ Disallow: /ebooks?*jscmd=*
+ Disallow: /ebooks?*buy=*
+ Disallow: /ebooks?*zoom=*
+ Allow: /ebooks?*q=related:*
+ Allow: /ebooks?*q=editions:*
+ Allow: /ebooks?*q=subject:*
+ Allow: /ebooks?*zoom=1*
+ Allow: /ebooks?*zoom=5*
+ Disallow: /patents?
+ Allow: /patents?id=
+ Allow: /patents?vid=
+ Disallow: /scholar
+ Disallow: /citations?
+ Allow: /citations?user=
+ Allow: /citations?view_op=new_profile
+ Allow: /citations?view_op=top_venues
+ Disallow: /complete
+ Disallow: /s?
+ Disallow: /sponsoredlinks
+ Disallow: /videosearch?
+ Disallow: /videopreview?
+ Disallow: /videoprograminfo?
+ Disallow: /maps?
+ Disallow: /mapstt?
+ Disallow: /mapslt?
+ Disallow: /maps/stk/
+ Disallow: /maps/br?
+ Disallow: /mapabcpoi?
+ Disallow: /maphp?
+ Disallow: /mapprint?
+ Disallow: /maps/api/js/StaticMapService.GetMapImage?
+ Disallow: /maps/api/staticmap?
+ Disallow: /mld?
+ Disallow: /staticmap?
+ Disallow: /places/
+ Allow: /places/$
+ Disallow: /maps/place
+ Disallow: /help/maps/streetview/partners/welcome/
+ Disallow: /lochp?
+ Disallow: /center
+ Disallow: /ie?
+ Disallow: /sms/demo?
+ Disallow: /katrina?
+ Disallow: /blogsearch?
+ Disallow: /blogsearch/
+ Disallow: /blogsearch_feeds
+ Disallow: /advanced_blog_search
+ Disallow: /reader/
+ Allow: /reader/play
+ Disallow: /uds/
+ Disallow: /chart?
+ Disallow: /transit?
+ Disallow: /mbd?
+ Disallow: /extern_js/
+ Disallow: /calendar/feeds/
+ Disallow: /calendar/ical/
+ Disallow: /cl2/feeds/
+ Disallow: /cl2/ical/
+ Disallow: /coop/directory
+ Disallow: /coop/manage
+ Disallow: /trends?
+ Disallow: /trends/music?
+ Disallow: /trends/hottrends?
+ Disallow: /trends/viz?
+ Disallow: /notebook/search?
+ Disallow: /musica
+ Disallow: /musicad
+ Disallow: /musicas
+ Disallow: /musicl
+ Disallow: /musics
+ Disallow: /musicsearch
+ Disallow: /musicsp
+ Disallow: /musiclp
+ Disallow: /browsersync
+ Disallow: /call
+ Disallow: /archivesearch?
+ Disallow: /archivesearch/url
+ Disallow: /archivesearch/advanced_search
+ Disallow: /base/reportbadoffer
+ Disallow: /urchin_test/
+ Disallow: /movies?
+ Disallow: /codesearch?
+ Disallow: /codesearch/feeds/search?
+ Disallow: /wapsearch?
+ Disallow: /safebrowsing
+ Allow: /safebrowsing/diagnostic
+ Allow: /safebrowsing/report_badware/
+ Allow: /safebrowsing/report_error/
+ Allow: /safebrowsing/report_phish/
+ Disallow: /reviews/search?
+ Disallow: /orkut/albums
+ Allow: /jsapi
+ Disallow: /views?
+ Disallow: /c/
+ Disallow: /cbk
+ Allow: /cbk?output=tile&cb_client=maps_sv
+ Disallow: /recharge/dashboard/car
+ Disallow: /recharge/dashboard/static/
+ Disallow: /translate_a/
+ Disallow: /translate_c
+ Disallow: /translate_f
+ Disallow: /translate_static/
+ Disallow: /translate_suggestion
+ Disallow: /profiles/me
+ Allow: /profiles
+ Disallow: /s2/profiles/me
+ Allow: /s2/profiles
+ Allow: /s2/photos
+ Allow: /s2/static
+ Disallow: /s2
+ Disallow: /transconsole/portal/
+ Disallow: /gcc/
+ Disallow: /aclk
+ Disallow: /cse?
+ Disallow: /cse/home
+ Disallow: /cse/panel
+ Disallow: /cse/manage
+ Disallow: /tbproxy/
+ Disallow: /imesync/
+ Disallow: /shenghuo/search?
+ Disallow: /support/forum/search?
+ Disallow: /reviews/polls/
+ Disallow: /hosted/images/
+ Disallow: /ppob/?
+ Disallow: /ppob?
+ Disallow: /ig/add?
+ Disallow: /adwordsresellers
+ Disallow: /accounts/o8
+ Allow: /accounts/o8/id
+ Disallow: /topicsearch?q=
+ Disallow: /xfx7/
+ Disallow: /squared/api
+ Disallow: /squared/search
+ Disallow: /squared/table
+ Disallow: /toolkit/
+ Allow: /toolkit/*.html
+ Disallow: /globalmarketfinder/
+ Allow: /globalmarketfinder/*.html
+ Disallow: /qnasearch?
+ Disallow: /app/updates
+ Disallow: /sidewiki/entry/
+ Disallow: /quality_form?
+ Disallow: /labs/popgadget/search
+ Disallow: /buzz/post
+ Disallow: /compressiontest/
+ Disallow: /analytics/reporting/
+ Disallow: /analytics/admin/
+ Disallow: /analytics/web/
+ Disallow: /analytics/feeds/
+ Disallow: /analytics/settings/
+ Disallow: /alerts/
+ Disallow: /ads/preferences/
+ Allow: /ads/preferences/html/
+ Allow: /ads/preferences/plugin
+ Disallow: /settings/ads/onweb/
+ Disallow: /phone/compare/?
+ Allow: /alerts/manage
+ Disallow: /travel/clk
+ Disallow: /hotelfinder/rpc
+ Disallow: /flights/rpc
+ Disallow: /commercesearch/services/
+ Disallow: /evaluation/
+ Disallow: /webstore/search
+ Disallow: /chrome/browser/mobile/tour
+ Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+ Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+ Sitemap: http://www.google.com/ventures/sitemap_ventures.xml
+ Sitemap: http://www.google.com/sitemaps_webmasters.xml
+ Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml
+ Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml
data/spec/samples/sample_site/robots.txt ADDED
@@ -0,0 +1,13 @@
+ # robots.txt for http://www.example.com/
+
+ User-agent: cybermapper
+ Disallow:
+
+ User-agent: google
+ Disallow: /
+
+ User-agent: *
+ Disallow: /forms
+ Disallow: /gallery
+ Disallow: /more.html
+ Allow: /catalogs/about
data/spec/spec_helper.rb CHANGED
@@ -13,6 +13,17 @@ RSpec.configure do |config|
 config.filter_run_excluding :local_only => true
 end
 
+ config.before(:all) {
+ # START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
+ @thin = nil
+ Thread.new do
+ @thin = Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+ end
+
+ # WAIT FOR START TO COMPLETE
+ sleep 1
+ }
+
 config.before(:each) {
 
 #redis_mock = double("redis")
@@ -40,10 +51,12 @@ RSpec.configure do |config|
 
 @mock_http_client = mock(Net::HTTP)
 @mock_http_request = mock(Net::HTTPRequest)
+ @mock_http_robot_request = mock(Net::HTTPRequest)
 @mock_http_redirect_request = mock(Net::HTTPRequest)
 @mock_http_redirect_request2 = mock(Net::HTTPRequest)
 
 @mock_http_response = mock(Net::HTTPResponse)
+ @mock_http_robot_response = mock(Net::HTTPResponse)
 @mock_http_redirect_response = mock(Net::HTTPRedirection)
 @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
 @mock_http_get = mock(Net::HTTP::Get)
@@ -51,11 +64,13 @@ RSpec.configure do |config|
 Net::HTTP.stub!(:new).and_return(@mock_http_client)
 Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
 Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
 Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
 
 Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
 
 @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
+ @mock_http_client.stub!(:request).with(@mock_http_robot_request).and_return(@mock_http_robot_response)
 @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
 @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
 @mock_http_client.stub!(:read_timeout=).and_return(nil)
@@ -64,6 +79,16 @@ RSpec.configure do |config|
 @mock_http_client.stub!(:address).and_return("www.baseurl.com")
 @mock_http_client.stub!(:port).and_return("80 ")
 
+ @mock_http_robot_response.stub!(:code).and_return(200)
+ @mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
+ @mock_http_robot_response.stub!(:content_type).and_return("text/plain")
+ @mock_http_robot_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
+ @mock_http_robot_response.stub!(:[]).with("location").and_return(@default_headers["location"])
+ @mock_http_robot_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
+ @mock_http_robot_response.stub!(:content_length).and_return(1024)
+ @mock_http_robot_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
+ @mock_http_robot_response.stub!(:to_hash).and_return(@default_headers)
+
 @mock_http_response.stub!(:code).and_return(200)
 @mock_http_response.stub!(:content_type).and_return("text/html")
 @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
- version: 0.0.57
+ version: 0.0.58
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
- date: 2012-06-13 00:00:00.000000000 Z
+ date: 2012-06-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: resque
- requirement: &70194152226040 !ruby/object:Gem::Requirement
+ requirement: &70328776801460 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152226040
+ version_requirements: *70328776801460
 - !ruby/object:Gem::Dependency
 name: redis
- requirement: &70194152224380 !ruby/object:Gem::Requirement
+ requirement: &70328776799760 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152224380
+ version_requirements: *70328776799760
 - !ruby/object:Gem::Dependency
 name: nokogiri
- requirement: &70194152223560 !ruby/object:Gem::Requirement
+ requirement: &70328776798960 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152223560
+ version_requirements: *70328776798960
 - !ruby/object:Gem::Dependency
 name: addressable
- requirement: &70194152222500 !ruby/object:Gem::Requirement
+ requirement: &70328776797840 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152222500
+ version_requirements: *70328776797840
 - !ruby/object:Gem::Dependency
 name: rspec
- requirement: &70194152220920 !ruby/object:Gem::Requirement
+ requirement: &70328776796300 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152220920
+ version_requirements: *70328776796300
 - !ruby/object:Gem::Dependency
 name: awesome_print
- requirement: &70194152236080 !ruby/object:Gem::Requirement
+ requirement: &70328776811560 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152236080
+ version_requirements: *70328776811560
 - !ruby/object:Gem::Dependency
 name: sinatra
- requirement: &70194152235480 !ruby/object:Gem::Requirement
+ requirement: &70328776810940 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152235480
+ version_requirements: *70328776810940
 - !ruby/object:Gem::Dependency
 name: thin
- requirement: &70194152234920 !ruby/object:Gem::Requirement
+ requirement: &70328776810380 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152234920
+ version_requirements: *70328776810380
 - !ruby/object:Gem::Dependency
 name: haml
- requirement: &70194152234340 !ruby/object:Gem::Requirement
+ requirement: &70328776809840 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70194152234340
+ version_requirements: *70328776809840
 - !ruby/object:Gem::Dependency
 name: namespaced_redis
- requirement: &70194152233680 !ruby/object:Gem::Requirement
+ requirement: &70328776809160 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
 version: 1.0.2
 type: :runtime
 prerelease: false
- version_requirements: *70194152233680
+ version_requirements: *70328776809160
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
 crawl extremely large sites which is much more performant than multi-threaded crawlers. It
 is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -136,6 +136,8 @@ files:
 - spec/cobweb/cobweb_links_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
+ - spec/cobweb/robots_spec.rb
+ - spec/samples/robots.txt
 - spec/samples/sample_html_links.html
 - spec/samples/sample_server.rb
 - spec/samples/sample_site/boxgrid.html
@@ -293,6 +295,7 @@ files:
 - spec/samples/sample_site/js/superfish.js
 - spec/samples/sample_site/js/supersubs.js
 - spec/samples/sample_site/more.html
+ - spec/samples/sample_site/robots.txt
 - spec/samples/sample_site/tables.html
 - spec/samples/sample_site/text/museosans-webfont.eot
 - spec/samples/sample_site/text/museosans-webfont.svg