cobweb 0.0.22 → 0.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.21
+ h1. Cobweb v0.0.23
 
  h2. Intro
 
@@ -54,14 +54,16 @@ Creates a new crawler object based on a base_url
 
 * options - Options are passed in as a hash,
 
- ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
- ** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
- ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
- ** :debug - enables debug output (Default: false)
- ** :quiet - hides default output (Default: false)
- ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
- ** :timeout - http timeout for requests (Default: 10)
- ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
+ ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
+ ** :redirect_limit - sets the limit to be used for consecutive redirects (Default: 10)
+ ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
+ ** :debug - enables debug output (Default: false)
+ ** :quiet - hides default output (Default: false)
+ ** :cache - sets the ttl for caching pages; set to nil to disable caching (Default: 300)
+ ** :timeout - http timeout for requests (Default: 10)
+ ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"})
+ ** :internal_urls - array of strings representing internal url forms for your site (e.g. ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
+ ** :first_page_redirect_internal - if true and the first page crawled is a redirect, the final destination of the redirects is added to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
 
 bq. crawler = CobWeb.new(:follow_redirects => false)
 
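For illustration, the new options could be combined like this (a hypothetical setup; the urls and redis host are placeholders):

    crawler = CobWeb.new(
      :internal_urls => ["http://test.com/*", "http://blog.test.com/*"],
      :first_page_redirect_internal => true,
      :redis_options => {:host => "redis.mydomain.com"})
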
@@ -70,6 +72,8 @@ h4. start(base_url)
 Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
 
 * base_url - the url to start the crawl from
+
+ Once the crawl starts, if the first page is redirected (e.g. from http://www.test.com to http://test.com) then the endpoint scheme and host are added to the internal_urls automatically.
 
 bq. crawler.start("http://www.google.com/")
 
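For reference, a minimal class that would be valid as the :processing_queue (a sketch modelled on the CobwebProcessJob shape shown later in this diff; the class and queue names are illustrative). Note that Resque round-trips job arguments through JSON, so the content hash arrives with string keys unless re-symbolized:

    class MyContentProcessJob
      @queue = :my_content_process_job

      # Resque calls this with the crawled content hash for each page.
      def self.perform(content)
        puts "Processing #{content['url']}"
      end
    end
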
@@ -19,20 +19,33 @@ class Cobweb
   # investigate using event machine for single threaded crawling
 
   def self.version
-    "0.0.22"
+    "0.0.24"
+  end
+
+  def method_missing(method_sym, *arguments, &block)
+    if method_sym.to_s =~ /^default_(.*)_to$/
+      tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
+      @options[tag_name] = arguments[0] unless @options.has_key?(tag_name)
+    else
+      super
+    end
   end
 
   def initialize(options = {})
     @options = options
-    @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
-    @options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
-    @options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
-    @options[:crawl_finished_queue] = CobwebFinishedJob unless @options.has_key?(:crawl_finished_queue)
-    @options[:quiet] = true unless @options.has_key?(:quiet)
-    @options[:debug] = false unless @options.has_key?(:debug)
-    @options[:cache] = 300 unless @options.has_key?(:cache)
-    @options[:timeout] = 10 unless @options.has_key?(:timeout)
-    @options[:redis_options] = {} unless @options.has_key?(:redis_options)
+
+    default_follow_redirects_to true
+    default_redirect_limit_to 10
+    default_processing_queue_to CobwebProcessJob
+    default_crawl_finished_queue_to CobwebFinishedJob
+    default_quiet_to true
+    default_debug_to false
+    default_cache_to 300
+    default_timeout_to 10
+    default_redis_options_to Hash.new
+    default_internal_urls_to []
+    default_first_page_redirect_internal_to true
+
   end
 
   def start(base_url)
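The default_*_to calls above are not defined anywhere; method_missing parses the method name and fills in @options only when the key is absent. A minimal standalone sketch of the same pattern (class and option names are illustrative):

    class Defaults
      def initialize(options = {})
        @options = options
        default_cache_to 300    # no-op if :cache was passed in
        default_timeout_to 10
      end

      def method_missing(method_sym, *arguments, &block)
        if method_sym.to_s =~ /^default_(.*)_to$/
          key = method_sym.to_s.split("_")[1..-2].join("_").to_sym
          @options[key] = arguments[0] unless @options.has_key?(key)
        else
          super
        end
      end
    end

    Defaults.new(:timeout => 5)   # @options ends up as {:timeout => 5, :cache => 300}
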
@@ -42,9 +55,20 @@ class Cobweb
       :url => base_url
     }
 
+    if @options[:internal_urls].empty?
+      uri = Addressable::URI.parse(base_url)
+      @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
+    end
+
     request.merge!(@options)
     @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
     @redis.hset "statistics", "queued_at", DateTime.now
+    @redis.set("crawl-counter", 0)
+    @redis.set("queue-counter", 1)
+
+
+    # add internal_urls into redis
+    @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
 
     Resque.enqueue(CrawlJob, request)
   end
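So when no :internal_urls are supplied, the crawl scope defaults to a wildcard built from the base url's scheme and host; in isolation (the url is a placeholder):

    require "addressable/uri"

    uri = Addressable::URI.parse("http://blog.test.com/posts/1")
    [uri.scheme, "://", uri.host, "/*"].join   # => "http://blog.test.com/*"
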
@@ -70,7 +94,7 @@ class Cobweb
       redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
     end
 
-    content = {}
+    content = {:base_url => url}
 
     # check if it has already been cached
     if redis.get(unique_id) and @options[:cache]
@@ -96,7 +120,7 @@ class Cobweb
     begin
       print "Retrieving #{url }... " unless @options[:quiet]
       request = Net::HTTP::Get.new uri.request_uri
-
+
       response = @http.request request
 
       if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
@@ -125,7 +149,7 @@ class Cobweb
       content[:response_time] = Time.now.to_f - request_time
 
       puts "Retrieved." unless @options[:quiet]
-
+
       # create the content container
       content[:url] = uri.to_s
       content[:status_code] = response.code.to_i
@@ -138,12 +162,16 @@ class Cobweb
       end
       content[:length] = response.content_length
       if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-        content[:body] = response.body
-      else
+        if response["Content-Encoding"]=="gzip"
+          content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
+        else
+          content[:body] = response.body
+        end
+      else
         content[:body] = Base64.encode64(response.body)
       end
       content[:location] = response["location"]
-      content[:headers] = response.to_hash.symbolize_keys
+      content[:headers] = response.to_hash.deep_symbolize_keys
       # parse data for links
       link_parser = ContentLinkParser.new(content[:url], content[:body])
       content[:links] = link_parser.link_data
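The new gzip branch can be exercised on its own; a minimal round-trip using only the Ruby standard library:

    require "zlib"
    require "stringio"

    io = StringIO.new
    gz = Zlib::GzipWriter.new(io)
    gz.write "<html></html>"
    gz.close                       # flushes the gzip stream into io

    Zlib::GzipReader.new(StringIO.new(io.string)).read   # => "<html></html>"
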
@@ -170,7 +198,7 @@ class Cobweb
       content[:links] = {}
 
     rescue SocketError => e
-      puts "ERROR: #{e.message}"
+      puts "ERROR: SocketError#{e.message}"
 
       ## generate a blank content
       content = {}
@@ -185,7 +213,7 @@ class Cobweb
       content[:links] = {}
 
     rescue Timeout::Error => e
-      puts "ERROR: #{e.message}"
+      puts "ERROR Timeout::Error: #{e.message}"
 
       ## generate a blank content
       content = {}
@@ -207,10 +235,14 @@ class Cobweb
     raise "url cannot be nil" if url.nil?
 
     absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+
     # get the unique id for this request
     unique_id = Digest::SHA1.hexdigest(url)
-    redirect_limit = options[:redirect_limit]
+    if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
+      redirect_limit = options[:redirect_limit].to_i
+    else
+      redirect_limit = 10
+    end
 
     # connect to redis
     if options.has_key? :crawl_id
@@ -224,7 +256,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get("head-#{unique_id}") and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
+      content = Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
     else
       print "Retrieving #{url }... " unless @options[:quiet]
       uri = Addressable::URI.parse(url.strip)
@@ -247,7 +279,9 @@ class Cobweb
         puts "redirected... " unless @options[:quiet]
         url = absolutize.url(response['location']).to_s
         redirect_limit = redirect_limit - 1
-        content = head(url, redirect_limit)
+        options = options.clone
+        options[:redirect_limit]=redirect_limit
+        content = head(url, options)
         content[:url] = uri.to_s
         content[:redirect_through] = [] if content[:redirect_through].nil?
         content[:redirect_through].insert(0, url)
@@ -4,7 +4,7 @@ class CobwebProcessJob
   @queue = :cobweb_process_job
 
   def self.perform(content)
-    content.symbolize_keys
+    content = HashHelper.symbolize_keys(content)
     puts "Dummy Processing for #{content[:url]}"
 
     #ap content.keys
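HashHelper.symbolize_keys replaces the Hash#symbolize_keys monkey-patch that this release deletes (see the removed lib/hash.rb at the end of this diff). Its implementation is not shown in the diff, but a module-function port of the old behaviour would look roughly like this (assumed shape only):

    # Assumed shape only; the real HashHelper may differ.
    module HashHelper
      def self.symbolize_keys(hash)
        hash.keys.each do |key|
          hash[key.to_sym] = hash.delete(key) if key.instance_of? String
        end
        hash
      end
    end
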
@@ -2,7 +2,6 @@
 class ContentLinkParser
 
   require "nokogiri"
-  require "absolutize"
 
   def initialize(url, content, options = {})
     @options = options
@@ -6,163 +6,142 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
-  ## redis params used
-  #
-  # crawl-counter
-  # crawled
-  # queue-counter
-  # statistics[:average_response_time]
-  # statistics[:maximum_response_time]
-  # statistics[:minimum_response_time]
-  # statistics[:average_length]
-  # statistics[:maximum_length]
-  # statistics[:minimum_length]
-  # statistics[:queued_at]
-  # statistics[:started_at]
-  # statistics]:finished_at]
-  # total_pages
-  # total_assets
-  # statistics[:mime_counts]["mime_type"]
-  # statistics[:status_counts][xxx]
-
   def self.perform(content_request)
-    # change all hash keys to symbols
-    content_request.deep_symbolize_keys
-    redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
+
+    # change all hash keys to symbols
+    content_request = content_request.deep_symbolize_keys
+
+    @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 
     @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+    @debug = content_request[:debug]
+
+    refresh_counters
+
     # check we haven't crawled this url before
-    crawl_counter = redis.get("crawl-counter").to_i
-    queue_counter = redis.get("queue-counter").to_i
-    unless redis.sismember "crawled", content_request[:url]
+    unless @redis.sismember "crawled", content_request[:url]
 
-      # increment counter and check we haven't hit our crawl limit
-      redis.incr "crawl-counter"
-      crawl_counter += 1
-      if crawl_counter <= content_request[:crawl_limit].to_i
+      # if there is no limit or we're still under it, let's get the url
+      if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
         content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
+
         ## update statistics
-        if redis.hexists "statistics", "average_response_time"
-          redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / crawl_counter + 1))
-        else
-          redis.hset("statistics", "average_response_time", content[:response_time].to_f)
-        end
-        redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
-        redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
-        if redis.hexists "statistics", "average_length"
-          redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / crawl_counter + 1))
-        else
-          redis.hset("statistics", "average_length", content[:length].to_i)
+        Stats.set_statistics_in_redis(@redis, content)
+
+        # set the base url if this is the first page
+        set_base_url @redis, content, content_request
+
+        internal_links = all_links_from_content(content).map{|link| link.to_s}
+
+        # reject the link if we've crawled it or queued it
+        internal_links.reject!{|link| @redis.sismember("crawled", link)}
+        internal_links.reject!{|link| @redis.sismember("queued", link)}
+
+        # select the link if it's internal
+        internal_links.select!{|link| internal_link?(link)}
+
+        internal_links.each do |link|
+          enqueue_content(content_request, link)
         end
-        redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
-        redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
 
-        if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-          redis.incr "total_pages"
-        else
-          redis.incr "total_assets"
-        end
-
-        mime_counts = {}
-        if redis.hexists "statistics", "mime_counts"
-          mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
-          if mime_counts.has_key? content[:mime_type]
-            mime_counts[content[:mime_type]] += 1
-          else
-            mime_counts[content[:mime_type]] = 1
-          end
-        else
-          mime_counts = {content[:mime_type] => 1}
-        end
-        redis.hset "statistics", "mime_counts", mime_counts.to_json
-
-        status_counts = {}
-        if redis.hexists "statistics", "status_counts"
-          status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
-          if status_counts.has_key? content[:status_code].to_i
-            status_counts[content[:status_code].to_i] += 1
-          else
-            status_counts[content[:status_code].to_i] = 1
-          end
-        else
-          status_counts = {content[:status_code].to_i => 1}
-        end
-        redis.hset "statistics", "status_counts", status_counts.to_json
-
-        redis.srem "queued", content_request[:url]
-        redis.sadd "crawled", content_request[:url]
-        set_base_url redis, content, content_request[:base_url]
-        content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
-          link = link.to_s
-          unless redis.sismember "crawled", link
-            puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
-            if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
-              puts "Matched as #{link} as internal" if content_request[:debug]
-              unless redis.sismember("crawled", link) or redis.sismember("queued", link)
-                if queue_counter <= content_request[:crawl_limit].to_i
-                  new_request = content_request.clone
-                  new_request[:url] = link
-                  new_request[:parent] = content_request[:url]
-                  Resque.enqueue(CrawlJob, new_request)
-                  redis.sadd "queued", link
-                  redis.incr "queue-counter"
-                  queue_counter += 1
-                end
-              end
-            end
-          end
-        end
+        # now that we're done, let's update the queues
+        @redis.srem "queued", content_request[:url]
+        decrement_queue_counter
+        @redis.sadd "crawled", content_request[:url]
+        increment_crawl_counter
 
         # enqueue to processing queue
         Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
         puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
-        puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
-
-
+        puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
+
       else
-        puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
+        puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
      end
    else
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
 
-    # detect finished state
-
-    if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
+    # if there's nothing left queued or the crawl limit has been reached
+    if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
 
-      puts "queue_counter: #{queue_counter}"
-      puts "crawl_counter: #{crawl_counter}"
+      puts "queue_counter: #{@queue_counter}"
+      puts "crawl_counter: #{@crawl_counter}"
       puts "crawl_limit: #{content_request[:crawl_limit]}"
 
       # finished
       puts "FINISHED"
-      stats = redis.hgetall "statistics"
-      stats[:total_pages] = redis.get "total_pages"
-      stats[:total_assets] = redis.get "total_assets"
-      stats[:crawl_counter] = redis.get "crawl_counter"
-      stats[:queue_counter] = redis.get "queue_counter"
-      stats[:crawled] = redis.smembers "crawled"
+      stats = @redis.hgetall "statistics"
+      stats[:total_pages] = @redis.get "total_pages"
+      stats[:total_assets] = @redis.get "total_assets"
+      stats[:crawl_counter] = @redis.get "crawl_counter"
+      stats[:queue_counter] = @redis.get "queue_counter"
+      stats[:crawled] = @redis.smembers "crawled"
 
-      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
+      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 
-      ap stats
    end
  end
 
  private
-  def self.set_base_url(redis, content, base_url)
+  def self.set_base_url(redis, content, content_request)
    if redis.get("base_url").nil?
-      if content[:status_code] >= 300 and content[:status_code] < 400
-        #redirect received for first url
-        redis.set("base_url", @absolutize.url(content[:location]).to_s)
-        puts "WARNING: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
-      else
-        redis.set("base_url", base_url)
+      unless content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+        uri = Addressable::URI.parse(content[:redirect_through].last)
+        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
      end
+      redis.set("base_url", content[:url])
    end
  end
 
+  def self.internal_link?(link)
+    puts "Checking for internal link for: #{link}" if @debug
+    @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
+    valid_link = true
+    @internal_patterns.each do |pattern|
+      puts "Matching against #{pattern.source}" if @debug
+      if link.match(pattern)
+        puts "Matched as internal" if @debug
+        return true
      end
    end
+    puts "Didn't match any pattern so marked as not internal" if @debug
+    false
  end
 
+  def self.all_links_from_content(content)
+    content[:links].keys.map{|key| content[:links][key]}.flatten
+  end
 
+  def self.enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    Resque.enqueue(CrawlJob, new_request)
+    @redis.sadd "queued", link
+    increment_queue_counter
+  end
+
+  def self.increment_queue_counter
+    @redis.incr "queue-counter"
+    refresh_counters
+  end
+  def self.increment_crawl_counter
+    @redis.incr "crawl-counter"
+    refresh_counters
+  end
+  def self.decrement_queue_counter
+    @redis.decr "queue-counter"
+    refresh_counters
+  end
+  def self.refresh_counters
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
+  def self.reset_counters
+    @redis.set("crawl-counter", @redis.smembers("crawled").count)
+    @redis.set("queue-counter", @redis.smembers("queued").count)
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
 end
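internal_link? turns each stored internal_urls wildcard into an anchored regex, with "*" becoming ".*?". The same matching in isolation (urls are placeholders):

    patterns = ["http://test.com/*", "http://blog.test.com/*"]
    regexes  = patterns.map { |p| Regexp.new("^#{p.gsub("*", ".*?")}") }

    regexes.any? { |r| "http://blog.test.com/posts/1".match(r) }   # => true
    regexes.any? { |r| "http://other.com/".match(r) }              # => false
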
@@ -11,6 +11,59 @@ class Stats < Sinatra::Base
     @@status = status
   end
 
+  def self.set_statistics_in_redis(redis, content)
+    crawl_counter = redis.get("crawl-counter").to_i
+    queue_counter = redis.get("queue-counter").to_i
+
+    if redis.hexists "statistics", "average_response_time"
+      redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_response_time", content[:response_time].to_f)
+    end
+    redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
+    redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
+    if redis.hexists "statistics", "average_length"
+      redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_length", content[:length].to_i)
+    end
+    redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
+    redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
+
+    if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+      redis.incr "total_pages"
+    else
+      redis.incr "total_assets"
+    end
+
+    mime_counts = {}
+    if redis.hexists "statistics", "mime_counts"
+      mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
+      if mime_counts.has_key? content[:mime_type]
+        mime_counts[content[:mime_type]] += 1
+      else
+        mime_counts[content[:mime_type]] = 1
+      end
+    else
+      mime_counts = {content[:mime_type] => 1}
+    end
+    redis.hset "statistics", "mime_counts", mime_counts.to_json
+
+    status_counts = {}
+    if redis.hexists "statistics", "status_counts"
+      status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
+      if status_counts.has_key? content[:status_code].to_i
+        status_counts[content[:status_code].to_i] += 1
+      else
+        status_counts[content[:status_code].to_i] = 1
+      end
+    else
+      status_counts = {content[:status_code].to_i => 1}
+    end
+    redis.hset "statistics", "status_counts", status_counts.to_json
+
+  end
+
   set :views, settings.root + '/../views'
 
   get '/' do
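Note that set_statistics_in_redis divides by (crawl_counter + 1), fixing an operator-precedence bug in the 0.0.22 code it was extracted from, where "/ crawl_counter + 1" divided first and then added 1. The recurrence being computed is the standard incremental mean (sample numbers are illustrative):

    # new_avg = (old_avg * n + x) / (n + 1), e.g. with n = 4 pages seen so far:
    old_avg, n, x = 120.0, 4, 200.0
    ((old_avg * n) + x) / (n + 1)   # => 136.0
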
@@ -19,7 +72,6 @@ class Stats < Sinatra::Base
     haml :statistics
   end
 
-
   def self.start
     thread = Thread.new do
       Stats.run!
@@ -74,6 +74,25 @@ describe Cobweb do
     Cobweb.new.should be_an_instance_of Cobweb
   end
 
+  it "should setup with defaults" do
+    cobweb = Cobweb.new
+
+    options = cobweb.instance_eval("@options")
+    ap options
+
+    options[:follow_redirects].should == true
+    options[:redirect_limit].should == 10
+    options[:processing_queue].should == CobwebProcessJob
+    options[:crawl_finished_queue].should == CobwebFinishedJob
+    options[:quiet].should == true
+    options[:debug].should == false
+    options[:cache].should == 300
+    options[:timeout].should == 10
+    options[:redis_options].should == {}
+    options[:internal_urls].should == []
+
+  end
+
   describe "get" do
     it "should return a hash with default values" do
       @cobweb.get(@base_url).should be_an_instance_of Hash
@@ -141,7 +160,7 @@ describe Cobweb do
       #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
       #
       #content = @cobweb.get(@base_url)
-      #content.should be_an_instance_of Hash
+      #content.should be_an_instance_of HashHelper
       #ap content
       #content[:url].should == "http://redirect-me.com/redirect.html"
       #content[:redirect_through].length.should == 2
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.22
+  version: 0.0.24
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-03-09 00:00:00.000000000 Z
+date: 2012-03-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &70101145919340 !ruby/object:Gem::Requirement
+  requirement: &70268501331520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145919340
+  version_requirements: *70268501331520
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &70101145918920 !ruby/object:Gem::Requirement
+  requirement: &70268501331100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145918920
+  version_requirements: *70268501331100
 - !ruby/object:Gem::Dependency
-  name: absolutize
-  requirement: &70101145918500 !ruby/object:Gem::Requirement
+  name: nokogiri
+  requirement: &70268501330680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145918500
+  version_requirements: *70268501330680
 - !ruby/object:Gem::Dependency
-  name: nokogiri
-  requirement: &70101145934440 !ruby/object:Gem::Requirement
+  name: addressable
+  requirement: &70268501330240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145934440
+  version_requirements: *70268501330240
 - !ruby/object:Gem::Dependency
-  name: addressable
-  requirement: &70101145934020 !ruby/object:Gem::Requirement
+  name: rspec
+  requirement: &70268501329820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145934020
+  version_requirements: *70268501329820
 - !ruby/object:Gem::Dependency
-  name: rspec
-  requirement: &70101145933580 !ruby/object:Gem::Requirement
+  name: awesome_print
+  requirement: &70268501329400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145933580
+  version_requirements: *70268501329400
 - !ruby/object:Gem::Dependency
-  name: awesome_print
-  requirement: &70101145933160 !ruby/object:Gem::Requirement
+  name: sinatra
+  requirement: &70268501328980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145933160
+  version_requirements: *70268501328980
 - !ruby/object:Gem::Dependency
-  name: sinatra
-  requirement: &70101145932740 !ruby/object:Gem::Requirement
+  name: thin
+  requirement: &70268501328560 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145932740
+  version_requirements: *70268501328560
 - !ruby/object:Gem::Dependency
-  name: thin
-  requirement: &70101145932320 !ruby/object:Gem::Requirement
+  name: haml
+  requirement: &70268501328140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145932320
+  version_requirements: *70268501328140
 - !ruby/object:Gem::Dependency
-  name: haml
-  requirement: &70101145931900 !ruby/object:Gem::Requirement
+  name: hashie
+  requirement: &70268501344080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70101145931900
+  version_requirements: *70268501344080
 description:
 email: stewart@rockwellcottage.com
 executables: []
@@ -134,14 +134,12 @@ files:
 - spec/samples/sample_html_links.html
 - spec/spec.opts
 - spec/spec_helper.rb
-- lib/cobweb/version.rb
 - lib/cobweb.rb
 - lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_process_job.rb
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
-- lib/hash.rb
 - lib/namespaced_redis.rb
 - lib/redirect_error.rb
 - lib/robots.rb
lib/cobweb/version.rb DELETED
@@ -1 +0,0 @@
-VERSION = "0.0.21"
lib/hash.rb DELETED
@@ -1,22 +0,0 @@
-## add symbolize methods to hash
-class Hash
-  def symbolize_keys
-    keys.each do |key|
-      if key.instance_of? String
-        value = self[key]
-        self.delete(key)
-        self[key.to_sym] = value
-      end
-    end
-    self
-  end
-  def deep_symbolize_keys
-    symbolize_keys
-    keys.each do |key|
-      if self[key].instance_of? Hash
-        self[key].deep_symbolize_keys
-      end
-    end
-    self
-  end
-end