cobweb 0.0.22 → 0.0.24

@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.21
+ h1. Cobweb v0.0.23
 
  h2. Intro
 
@@ -54,14 +54,16 @@ Creates a new crawler object based on a base_url
 
  * options - Options are passed in as a hash
 
- ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
- ** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
- ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
- ** :debug - enables debug output (Default: false)
- ** :quiet - hides default output (Default: false)
- ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
- ** :timeout - http timeout for requests (Default: 10)
- ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
+ ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
+ ** :redirect_limit - sets the limit to be used for consecutive redirects (Default: 10)
+ ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
+ ** :debug - enables debug output (Default: false)
+ ** :quiet - hides default output (Default: false)
+ ** :cache - sets the ttl for caching pages; set to nil to disable caching (Default: 300)
+ ** :timeout - http timeout for requests (Default: 10)
+ ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
+ ** :internal_urls - array of strings representing internal url forms for your site (e.g. ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [])
+ ** :first_page_redirect_internal - if true and the first page crawled is a redirect, the final destination of the redirect is added to the internal_urls (e.g. when http://www.test.com redirects to http://test.com) (Default: true)
 
  bq. crawler = CobWeb.new(:follow_redirects => false)
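For example, a crawl can be limited to a site and its blog subdomain by passing glob-style strings in the new :internal_urls option (a usage sketch):

bq. crawler = CobWeb.new(:internal_urls => ['http://test.com/*', 'http://blog.test.com/*'])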
 
@@ -70,6 +72,8 @@ h4. start(base_url)
  Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
 
  * base_url - the url to start the crawl from
+
+ Once the crawler starts, if the first page is redirected (e.g. from http://www.test.com to http://test.com), the final destination's scheme and host are added to the internal_urls automatically.
 
  bq. crawler.start("http://www.google.com/")
 
@@ -19,20 +19,33 @@ class Cobweb
    # investigate using event machine for single threaded crawling
 
    def self.version
-     "0.0.22"
+     "0.0.24"
+   end
+
+   def method_missing(method_sym, *arguments, &block)
+     if method_sym.to_s =~ /^default_(.*)_to$/
+       tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
+       @options[tag_name] = arguments[0] unless @options.has_key?(tag_name)
+     else
+       super
+     end
    end
 
    def initialize(options = {})
      @options = options
-     @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
-     @options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
-     @options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
-     @options[:crawl_finished_queue] = CobwebFinishedJob unless @options.has_key?(:crawl_finished_queue)
-     @options[:quiet] = true unless @options.has_key?(:quiet)
-     @options[:debug] = false unless @options.has_key?(:debug)
-     @options[:cache] = 300 unless @options.has_key?(:cache)
-     @options[:timeout] = 10 unless @options.has_key?(:timeout)
-     @options[:redis_options] = {} unless @options.has_key?(:redis_options)
+
+     default_follow_redirects_to true
+     default_redirect_limit_to 10
+     default_processing_queue_to CobwebProcessJob
+     default_crawl_finished_queue_to CobwebFinishedJob
+     default_quiet_to true
+     default_debug_to false
+     default_cache_to 300
+     default_timeout_to 10
+     default_redis_options_to Hash.new
+     default_internal_urls_to []
+     default_first_page_redirect_internal_to true
+
    end
 
    def start(base_url)
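The new default_*_to calls above are resolved by method_missing. A standalone sketch of the pattern (not part of the gem) shows how a call like default_redirect_limit_to 10 becomes a guarded option assignment:

    # Sketch: resolve default_<option>_to(value) into a guarded @options assignment
    class DefaultsExample
      attr_reader :options

      def initialize(options = {})
        @options = options
        default_cache_to 300          # fills :cache only if the caller didn't
        default_redirect_limit_to 10  # fills :redirect_limit only if the caller didn't
      end

      def method_missing(method_sym, *arguments, &block)
        if method_sym.to_s =~ /^default_(.*)_to$/
          # "default_redirect_limit_to" -> ["default", "redirect", "limit", "to"]
          # [1..-2] keeps the middle words -> :redirect_limit
          tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
          @options[tag_name] = arguments[0] unless @options.has_key?(tag_name)
        else
          super
        end
      end
    end

    DefaultsExample.new(:cache => 60).options  # => {:cache=>60, :redirect_limit=>10}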
@@ -42,9 +55,20 @@ class Cobweb
        :url => base_url
      }
 
+     if @options[:internal_urls].empty?
+       uri = Addressable::URI.parse(base_url)
+       @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
+     end
+
      request.merge!(@options)
      @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
      @redis.hset "statistics", "queued_at", DateTime.now
+     @redis.set("crawl-counter", 0)
+     @redis.set("queue-counter", 1)
+
+     # add internal_urls into redis
+     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
 
      Resque.enqueue(CrawlJob, request)
    end
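When no :internal_urls are supplied, start derives a single glob from the base_url as shown above. The same derivation in isolation (Addressable is a declared dependency of the gem):

    require 'addressable/uri'

    uri = Addressable::URI.parse("http://blog.test.com/posts/1")
    [uri.scheme, "://", uri.host, "/*"].join
    # => "http://blog.test.com/*" -- every URL on that host counts as internal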
@@ -70,7 +94,7 @@ class Cobweb
        redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
      end
 
-     content = {}
+     content = {:base_url => url}
 
      # check if it has already been cached
      if redis.get(unique_id) and @options[:cache]
@@ -96,7 +120,7 @@ class Cobweb
      begin
        print "Retrieving #{url}... " unless @options[:quiet]
        request = Net::HTTP::Get.new uri.request_uri
-
+
        response = @http.request request
 
        if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
@@ -125,7 +149,7 @@ class Cobweb
      content[:response_time] = Time.now.to_f - request_time
 
      puts "Retrieved." unless @options[:quiet]
-
+
      # create the content container
      content[:url] = uri.to_s
      content[:status_code] = response.code.to_i
@@ -138,12 +162,16 @@ class Cobweb
      end
      content[:length] = response.content_length
      if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-       content[:body] = response.body
-     else
+       if response["Content-Encoding"] == "gzip"
+         content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
+       else
+         content[:body] = response.body
+       end
+     else
        content[:body] = Base64.encode64(response.body)
      end
      content[:location] = response["location"]
-     content[:headers] = response.to_hash.symbolize_keys
+     content[:headers] = response.to_hash.deep_symbolize_keys
      # parse data for links
      link_parser = ContentLinkParser.new(content[:url], content[:body])
      content[:links] = link_parser.link_data
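The new gzip branch in isolation; Zlib and StringIO are both in the Ruby standard library:

    require 'zlib'
    require 'stringio'

    # Build a gzipped body to stand in for response.body
    io = StringIO.new
    gz = Zlib::GzipWriter.new(io)
    gz.write("<html><body>hello</body></html>")
    gz.close

    Zlib::GzipReader.new(StringIO.new(io.string)).read
    # => "<html><body>hello</body></html>"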
@@ -170,7 +198,7 @@ class Cobweb
      content[:links] = {}
 
    rescue SocketError => e
-     puts "ERROR: #{e.message}"
+     puts "ERROR SocketError: #{e.message}"
 
      ## generate a blank content
      content = {}
@@ -185,7 +213,7 @@ class Cobweb
      content[:links] = {}
 
    rescue Timeout::Error => e
-     puts "ERROR: #{e.message}"
+     puts "ERROR Timeout::Error: #{e.message}"
 
      ## generate a blank content
      content = {}
@@ -207,10 +235,14 @@ class Cobweb
      raise "url cannot be nil" if url.nil?
 
      absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+
      # get the unique id for this request
      unique_id = Digest::SHA1.hexdigest(url)
-     redirect_limit = options[:redirect_limit]
+     if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
+       redirect_limit = options[:redirect_limit].to_i
+     else
+       redirect_limit = 10
+     end
 
      # connect to redis
      if options.has_key? :crawl_id
@@ -224,7 +256,7 @@ class Cobweb
      # check if it has already been cached
      if redis.get("head-#{unique_id}") and @options[:cache]
        puts "Cache hit for #{url}" unless @options[:quiet]
-       Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
+       content = Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
      else
        print "Retrieving #{url}... " unless @options[:quiet]
        uri = Addressable::URI.parse(url.strip)
@@ -247,7 +279,9 @@ class Cobweb
        puts "redirected... " unless @options[:quiet]
        url = absolutize.url(response['location']).to_s
        redirect_limit = redirect_limit - 1
-       content = head(url, redirect_limit)
+       options = options.clone
+       options[:redirect_limit] = redirect_limit
+       content = head(url, options)
        content[:url] = uri.to_s
        content[:redirect_through] = [] if content[:redirect_through].nil?
        content[:redirect_through].insert(0, url)
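The clone matters because the decremented :redirect_limit is written into the hash before recursing; without it the mutation would leak into the caller's options. Illustrative only:

    options = {:redirect_limit => 10}

    aliased = options              # same object
    aliased[:redirect_limit] -= 1
    options[:redirect_limit]       # => 9 -- the caller's hash changed

    options = {:redirect_limit => 10}
    copied = options.clone         # shallow copy, as in head above
    copied[:redirect_limit] -= 1
    options[:redirect_limit]       # => 10 -- the caller is unaffected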
@@ -4,7 +4,7 @@ class CobwebProcessJob
    @queue = :cobweb_process_job
 
    def self.perform(content)
-     content.symbolize_keys
+     content = HashHelper.symbolize_keys(content)
      puts "Dummy Processing for #{content[:url]}"
 
      #ap content.keys
@@ -2,7 +2,6 @@
  class ContentLinkParser
 
    require "nokogiri"
-   require "absolutize"
 
    def initialize(url, content, options = {})
      @options = options
@@ -6,163 +6,142 @@ class CrawlJob
 
    @queue = :cobweb_crawl_job
 
-   ## redis params used
-   #
-   # crawl-counter
-   # crawled
-   # queue-counter
-   # statistics[:average_response_time]
-   # statistics[:maximum_response_time]
-   # statistics[:minimum_response_time]
-   # statistics[:average_length]
-   # statistics[:maximum_length]
-   # statistics[:minimum_length]
-   # statistics[:queued_at]
-   # statistics[:started_at]
-   # statistics[:finished_at]
-   # total_pages
-   # total_assets
-   # statistics[:mime_counts]["mime_type"]
-   # statistics[:status_counts][xxx]
-
    def self.perform(content_request)
-     # change all hash keys to symbols
-     content_request.deep_symbolize_keys
-     redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
+
+     # change all hash keys to symbols
+     content_request = content_request.deep_symbolize_keys
+
+     @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 
      @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+     @debug = content_request[:debug]
+
+     refresh_counters
+
      # check we haven't crawled this url before
-     crawl_counter = redis.get("crawl-counter").to_i
-     queue_counter = redis.get("queue-counter").to_i
-     unless redis.sismember "crawled", content_request[:url]
+     unless @redis.sismember "crawled", content_request[:url]
 
-       # increment counter and check we haven't hit our crawl limit
-       redis.incr "crawl-counter"
-       crawl_counter += 1
-       if crawl_counter <= content_request[:crawl_limit].to_i
+       # if there is no limit, or we're still under it, fetch the url
+       if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
          content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
+
          ## update statistics
-         if redis.hexists "statistics", "average_response_time"
-           redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / crawl_counter + 1))
-         else
-           redis.hset("statistics", "average_response_time", content[:response_time].to_f)
-         end
-         redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
-         redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
-         if redis.hexists "statistics", "average_length"
-           redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / crawl_counter + 1))
-         else
-           redis.hset("statistics", "average_length", content[:length].to_i)
+         Stats.set_statistics_in_redis(@redis, content)
+
+         # set the base url if this is the first page
+         set_base_url @redis, content, content_request
+
+         internal_links = all_links_from_content(content).map{|link| link.to_s}
+
+         # reject the link if we've already crawled it or queued it
+         internal_links.reject!{|link| @redis.sismember("crawled", link)}
+         internal_links.reject!{|link| @redis.sismember("queued", link)}
+
+         # keep the link only if it's internal
+         internal_links.select!{|link| internal_link?(link)}
+
+         internal_links.each do |link|
+           enqueue_content(content_request, link)
          end
-         redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
-         redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
 
-         if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-           redis.incr "total_pages"
-         else
-           redis.incr "total_assets"
-         end
-
-         mime_counts = {}
-         if redis.hexists "statistics", "mime_counts"
-           mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
-           if mime_counts.has_key? content[:mime_type]
-             mime_counts[content[:mime_type]] += 1
-           else
-             mime_counts[content[:mime_type]] = 1
-           end
-         else
-           mime_counts = {content[:mime_type] => 1}
-         end
-         redis.hset "statistics", "mime_counts", mime_counts.to_json
-
-         status_counts = {}
-         if redis.hexists "statistics", "status_counts"
-           status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
-           if status_counts.has_key? content[:status_code].to_i
-             status_counts[content[:status_code].to_i] += 1
-           else
-             status_counts[content[:status_code].to_i] = 1
-           end
-         else
-           status_counts = {content[:status_code].to_i => 1}
-         end
-         redis.hset "statistics", "status_counts", status_counts.to_json
-
-         redis.srem "queued", content_request[:url]
-         redis.sadd "crawled", content_request[:url]
-         set_base_url redis, content, content_request[:base_url]
-         content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
-           link = link.to_s
-           unless redis.sismember "crawled", link
-             puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
-             if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
-               puts "Matched #{link} as internal" if content_request[:debug]
-               unless redis.sismember("crawled", link) or redis.sismember("queued", link)
-                 if queue_counter <= content_request[:crawl_limit].to_i
-                   new_request = content_request.clone
-                   new_request[:url] = link
-                   new_request[:parent] = content_request[:url]
-                   Resque.enqueue(CrawlJob, new_request)
-                   redis.sadd "queued", link
-                   redis.incr "queue-counter"
-                   queue_counter += 1
-                 end
-               end
-             end
-           end
-         end
+         # now that we're done, let's update the queues
+         @redis.srem "queued", content_request[:url]
+         decrement_queue_counter
+         @redis.sadd "crawled", content_request[:url]
+         increment_crawl_counter
 
          # enqueue to processing queue
          Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
          puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
-         puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
-
-
+         puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
+
        else
-         puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
+         puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
        end
      else
        puts "Already crawled #{content_request[:url]}" if content_request[:debug]
      end
 
-     # detect finished state
-
-     if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
+     # if there's nothing left queued or the crawl limit has been reached
+     if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
 
-       puts "queue_counter: #{queue_counter}"
-       puts "crawl_counter: #{crawl_counter}"
+       puts "queue_counter: #{@queue_counter}"
+       puts "crawl_counter: #{@crawl_counter}"
        puts "crawl_limit: #{content_request[:crawl_limit]}"
 
        # finished
        puts "FINISHED"
-       stats = redis.hgetall "statistics"
-       stats[:total_pages] = redis.get "total_pages"
-       stats[:total_assets] = redis.get "total_assets"
-       stats[:crawl_counter] = redis.get "crawl_counter"
-       stats[:queue_counter] = redis.get "queue_counter"
-       stats[:crawled] = redis.smembers "crawled"
+       stats = @redis.hgetall "statistics"
+       stats[:total_pages] = @redis.get "total_pages"
+       stats[:total_assets] = @redis.get "total_assets"
+       stats[:crawl_counter] = @redis.get "crawl_counter"
+       stats[:queue_counter] = @redis.get "queue_counter"
+       stats[:crawled] = @redis.smembers "crawled"
 
-       Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
+       Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 
-       ap stats
      end
    end
 
    private
-   def self.set_base_url(redis, content, base_url)
+   def self.set_base_url(redis, content, content_request)
      if redis.get("base_url").nil?
-       if content[:status_code] >= 300 and content[:status_code] < 400
-         # redirect received for first url
-         redis.set("base_url", @absolutize.url(content[:location]).to_s)
-         puts "WARNING: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
-       else
-         redis.set("base_url", base_url)
+       unless content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+         uri = Addressable::URI.parse(content[:redirect_through].last)
+         redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+       end
+       redis.set("base_url", content[:url])
+     end
+   end
+
+   def self.internal_link?(link)
+     puts "Checking for internal link for: #{link}" if @debug
+     @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
+     @internal_patterns.each do |pattern|
+       puts "Matching against #{pattern.source}" if @debug
+       if link.match(pattern)
+         puts "Matched as internal" if @debug
+         return true
        end
      end
+     puts "Didn't match any pattern so marked as not internal" if @debug
+     false
    end
 
+   def self.all_links_from_content(content)
+     content[:links].keys.map{|key| content[:links][key]}.flatten
+   end
 
+   def self.enqueue_content(content_request, link)
+     new_request = content_request.clone
+     new_request[:url] = link
+     new_request[:parent] = content_request[:url]
+     Resque.enqueue(CrawlJob, new_request)
+     @redis.sadd "queued", link
+     increment_queue_counter
+   end
+
+   def self.increment_queue_counter
+     @redis.incr "queue-counter"
+     refresh_counters
+   end
+   def self.increment_crawl_counter
+     @redis.incr "crawl-counter"
+     refresh_counters
+   end
+   def self.decrement_queue_counter
+     @redis.decr "queue-counter"
+     refresh_counters
+   end
+   def self.refresh_counters
+     @crawl_counter = @redis.get("crawl-counter").to_i
+     @queue_counter = @redis.get("queue-counter").to_i
+   end
+   def self.reset_counters
+     @redis.set("crawl-counter", @redis.smembers("crawled").count)
+     @redis.set("queue-counter", @redis.smembers("queued").count)
+     @crawl_counter = @redis.get("crawl-counter").to_i
+     @queue_counter = @redis.get("queue-counter").to_i
+   end
  end
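internal_link? widens each '*' in an internal_urls entry into the non-greedy regex '.*?' and anchors the pattern at the start of the link. The same check in isolation:

    patterns = ["http://test.com/*", "http://blog.test.com/*"].map do |pattern|
      Regexp.new("^#{pattern.gsub("*", ".*?")}")
    end

    patterns.any? { |p| "http://blog.test.com/2012/03/post".match(p) }  # => true
    patterns.any? { |p| "http://other.com/page".match(p) }              # => false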
@@ -11,6 +11,59 @@ class Stats < Sinatra::Base
      @@status = status
    end
 
+   def self.set_statistics_in_redis(redis, content)
+     crawl_counter = redis.get("crawl-counter").to_i
+     queue_counter = redis.get("queue-counter").to_i
+
+     if redis.hexists "statistics", "average_response_time"
+       redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
+     else
+       redis.hset("statistics", "average_response_time", content[:response_time].to_f)
+     end
+     redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
+     redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
+     if redis.hexists "statistics", "average_length"
+       redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i * crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
+     else
+       redis.hset("statistics", "average_length", content[:length].to_i)
+     end
+     redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
+     redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
+
+     if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+       redis.incr "total_pages"
+     else
+       redis.incr "total_assets"
+     end
+
+     mime_counts = {}
+     if redis.hexists "statistics", "mime_counts"
+       mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
+       if mime_counts.has_key? content[:mime_type]
+         mime_counts[content[:mime_type]] += 1
+       else
+         mime_counts[content[:mime_type]] = 1
+       end
+     else
+       mime_counts = {content[:mime_type] => 1}
+     end
+     redis.hset "statistics", "mime_counts", mime_counts.to_json
+
+     status_counts = {}
+     if redis.hexists "statistics", "status_counts"
+       status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
+       if status_counts.has_key? content[:status_code].to_i
+         status_counts[content[:status_code].to_i] += 1
+       else
+         status_counts[content[:status_code].to_i] = 1
+       end
+     else
+       status_counts = {content[:status_code].to_i => 1}
+     end
+     redis.hset "statistics", "status_counts", status_counts.to_json
+   end
+
    set :views, settings.root + '/../views'
 
    get '/' do
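set_statistics_in_redis folds each response into a running mean without storing the samples, and it also fixes an operator-precedence bug from the old inline version in CrawlJob, which computed `... / crawl_counter + 1` instead of `... / (crawl_counter + 1)`. The update rule in isolation:

    # Incremental mean: given the mean of n samples, folding in x gives (mean*n + x) / (n+1)
    def updated_average(mean, n, x)
      ((mean * n) + x) / (n + 1.0)
    end

    updated_average(200.0, 4, 100.0)  # => 180.0, the mean of [200, 200, 200, 200, 100]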
@@ -19,7 +72,6 @@ class Stats < Sinatra::Base
      haml :statistics
    end
 
-
    def self.start
      thread = Thread.new do
        Stats.run!
@@ -74,6 +74,25 @@ describe Cobweb
      Cobweb.new.should be_an_instance_of Cobweb
    end
 
+   it "should set up with defaults" do
+     cobweb = Cobweb.new
+
+     options = cobweb.instance_eval("@options")
+     ap options
+
+     options[:follow_redirects].should == true
+     options[:redirect_limit].should == 10
+     options[:processing_queue].should == CobwebProcessJob
+     options[:crawl_finished_queue].should == CobwebFinishedJob
+     options[:quiet].should == true
+     options[:debug].should == false
+     options[:cache].should == 300
+     options[:timeout].should == 10
+     options[:redis_options].should == {}
+     options[:internal_urls].should == []
+
+   end
+
    describe "get" do
      it "should return a hash with default values" do
        @cobweb.get(@base_url).should be_an_instance_of Hash
@@ -141,7 +160,7 @@ describe Cobweb
      #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      #
      #content = @cobweb.get(@base_url)
-     #content.should be_an_instance_of Hash
+     #content.should be_an_instance_of HashHelper
      #ap content
      #content[:url].should == "http://redirect-me.com/redirect.html"
      #content[:redirect_through].length.should == 2
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
-   version: 0.0.22
+   version: 0.0.24
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-03-09 00:00:00.000000000 Z
+ date: 2012-03-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: resque
-   requirement: &70101145919340 !ruby/object:Gem::Requirement
+   requirement: &70268501331520 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145919340
+   version_requirements: *70268501331520
  - !ruby/object:Gem::Dependency
    name: redis
-   requirement: &70101145918920 !ruby/object:Gem::Requirement
+   requirement: &70268501331100 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145918920
+   version_requirements: *70268501331100
  - !ruby/object:Gem::Dependency
-   name: absolutize
-   requirement: &70101145918500 !ruby/object:Gem::Requirement
+   name: nokogiri
+   requirement: &70268501330680 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145918500
+   version_requirements: *70268501330680
  - !ruby/object:Gem::Dependency
-   name: nokogiri
-   requirement: &70101145934440 !ruby/object:Gem::Requirement
+   name: addressable
+   requirement: &70268501330240 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145934440
+   version_requirements: *70268501330240
  - !ruby/object:Gem::Dependency
-   name: addressable
-   requirement: &70101145934020 !ruby/object:Gem::Requirement
+   name: rspec
+   requirement: &70268501329820 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145934020
+   version_requirements: *70268501329820
  - !ruby/object:Gem::Dependency
-   name: rspec
-   requirement: &70101145933580 !ruby/object:Gem::Requirement
+   name: awesome_print
+   requirement: &70268501329400 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145933580
+   version_requirements: *70268501329400
  - !ruby/object:Gem::Dependency
-   name: awesome_print
-   requirement: &70101145933160 !ruby/object:Gem::Requirement
+   name: sinatra
+   requirement: &70268501328980 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145933160
+   version_requirements: *70268501328980
  - !ruby/object:Gem::Dependency
-   name: sinatra
-   requirement: &70101145932740 !ruby/object:Gem::Requirement
+   name: thin
+   requirement: &70268501328560 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145932740
+   version_requirements: *70268501328560
  - !ruby/object:Gem::Dependency
-   name: thin
-   requirement: &70101145932320 !ruby/object:Gem::Requirement
+   name: haml
+   requirement: &70268501328140 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145932320
+   version_requirements: *70268501328140
  - !ruby/object:Gem::Dependency
-   name: haml
-   requirement: &70101145931900 !ruby/object:Gem::Requirement
+   name: hashie
+   requirement: &70268501344080 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
          version: '0'
    type: :runtime
    prerelease: false
-   version_requirements: *70101145931900
+   version_requirements: *70268501344080
  description:
  email: stewart@rockwellcottage.com
  executables: []
@@ -134,14 +134,12 @@ files:
  - spec/samples/sample_html_links.html
  - spec/spec.opts
  - spec/spec_helper.rb
- - lib/cobweb/version.rb
  - lib/cobweb.rb
  - lib/cobweb_crawler.rb
  - lib/cobweb_finished_job.rb
  - lib/cobweb_process_job.rb
  - lib/content_link_parser.rb
  - lib/crawl_job.rb
- - lib/hash.rb
  - lib/namespaced_redis.rb
  - lib/redirect_error.rb
  - lib/robots.rb
@@ -1 +0,0 @@
- VERSION = "0.0.21"
@@ -1,22 +0,0 @@
- ## add symbolize methods to hash
- class Hash
-   def symbolize_keys
-     keys.each do |key|
-       if key.instance_of? String
-         value = self[key]
-         self.delete(key)
-         self[key.to_sym] = value
-       end
-     end
-     self
-   end
-   def deep_symbolize_keys
-     symbolize_keys
-     keys.each do |key|
-       if self[key].instance_of? Hash
-         self[key].deep_symbolize_keys
-       end
-     end
-     self
-   end
- end
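The deleted Hash monkey-patch above is replaced by calls to a HashHelper module (see the CobwebProcessJob hunk). Its implementation is not shown in this diff; a functionally equivalent, non-destructive helper would look something like the following (assumed shape, not the gem's actual code):

    # Assumed sketch of HashHelper.symbolize_keys -- returns a new hash rather
    # than mutating the argument, converting only String keys to symbols.
    module HashHelper
      def self.symbolize_keys(hash)
        hash.each_with_object({}) do |(key, value), result|
          result[key.instance_of?(String) ? key.to_sym : key] = value
        end
      end
    end

    HashHelper.symbolize_keys({"url" => "http://test.com", "code" => 200})
    # => {:url=>"http://test.com", :code=>200}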