cobweb 0.0.54 → 0.0.55

@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.54
+ h1. Cobweb v0.0.55
 
  h2. Intro
 
@@ -14,6 +14,7 @@ h3. Standalone
  CobwebCrawler takes the same options as cobweb itself, so you can use any of the options available for that. An example is listed below.
 
  bq. crawler = CobwebCrawler.new(:cache => 600);
+
  bq. stats = crawler.crawl("http://www.pepsico.com")
 
  While the crawler is running, you can view statistics on http://localhost:4567
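
The CobwebCrawler changes further down add a comment noting that crawl also takes a block, which is passed the content hash and the statistics hash for each page. A rough sketch of that usage, following on from the example above (the keys read inside the block are illustrative):

    crawler = CobwebCrawler.new(:cache => 600)
    statistics = crawler.crawl("http://www.pepsico.com") do |content, stats|
      # content and stats are the hashes described in the CobwebCrawler#crawl comment below;
      # :url and "current_status" are keys that appear elsewhere in this diff
      puts "#{content[:url]} - #{stats[:current_status]}"
    end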
@@ -111,6 +112,8 @@ h2. Todo
  * Add ability to start and stop crawls from web interface
  * Allow crawler to start as web interface only (ie not run crawls at start)
  * Fix content encoding issue requiring separate process job
+ * DRY the cobweb get/head calls, it's got a lot of duplication
+ * Investigate using event machine for single threaded crawling
 
  h3. Big changes
 
@@ -10,19 +10,15 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
  require file
  end
 
+ # Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
  class Cobweb
- ## TASKS
-
- # redesign to have a resque stack and a single threaded stack
- # dry the code below, its got a lot of duplication
- # detect the end of the crawl (queued == 0 ?)
- # on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
- # investigate using event machine for single threaded crawling
 
+ # retrieves current version
  def self.version
  CobwebVersion.version
  end
 
+ # used for setting default options
  def method_missing(method_sym, *arguments, &block)
  if method_sym.to_s =~ /^default_(.*)_to$/
  tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
@@ -32,6 +28,7 @@ class Cobweb
  end
  end
 
+ # See README for more information on the options available
  def initialize(options = {})
  @options = options
  default_use_encoding_safe_process_job_to false
@@ -49,6 +46,7 @@ class Cobweb
 
  end
 
+ # This method starts the resque based crawl and enqueues the base_url
  def start(base_url)
  raise ":base_url is required" unless base_url
  request = {
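
From the caller's side, the newly documented start method amounts to something like the sketch below; :cache and :quiet are options visible in this diff, anything else would be an assumption.

    cobweb = Cobweb.new(:cache => 600, :quiet => true)
    cobweb.start("http://www.pepsico.com")   # builds the request hash and enqueues it to CrawlJob via Resque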
@@ -75,7 +73,20 @@ class Cobweb
 
  Resque.enqueue(CrawlJob, request)
  end
+
+ # Returns the cookies from the response, joined into a single string
+ def get_cookies(response)
+ all_cookies = response.get_fields('set-cookie')
+ unless all_cookies.nil?
+ cookies_array = Array.new
+ all_cookies.each { |cookie|
+ cookies_array.push(cookie.split('; ')[0])
+ }
+ cookies = cookies_array.join('; ')
+ end
+ end
 
+ # Performs an HTTP GET request to the specified url applying the options supplied
  def get(url, options = @options)
  raise "url cannot be nil" if url.nil?
  uri = Addressable::URI.parse(url)
@@ -103,7 +114,7 @@ class Cobweb
  # check if it has already been cached
  if redis.get(unique_id) and @options[:cache]
  puts "Cache hit for #{url}" unless @options[:quiet]
- content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+ content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
  else
  # retrieve data
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -173,7 +184,7 @@ class Cobweb
  content[:body] = Base64.encode64(response.body)
  end
  content[:location] = response["location"]
- content[:headers] = deep_symbolize_keys(response.to_hash)
+ content[:headers] = HashUtil.deep_symbolize_keys(response.to_hash)
  # parse data for links
  link_parser = ContentLinkParser.new(content[:url], content[:body])
  content[:links] = link_parser.link_data
@@ -233,17 +244,7 @@ class Cobweb
  content
  end
 
- def get_cookies(response)
- all_cookies = response.get_fields('set-cookie')
- unless all_cookies.nil?
- cookies_array = Array.new
- all_cookies.each { |cookie|
- cookies_array.push(cookie.split('; ')[0])
- }
- cookies = cookies_array.join('; ')
- end
- end
-
+ # Performs an HTTP HEAD request to the specified url applying the options supplied
  def head(url, options = @options)
  raise "url cannot be nil" if url.nil?
  uri = Addressable::URI.parse(url)
@@ -271,7 +272,7 @@ class Cobweb
  # check if it has already been cached
  if redis.get("head-#{unique_id}") and @options[:cache]
  puts "Cache hit for #{url}" unless @options[:quiet]
- content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
+ content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
  else
  # retrieve data
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -379,18 +380,6 @@ class Cobweb
 
  content
  end
- end
-
- def deep_symbolize_keys(hash)
- hash.keys.each do |key|
- value = hash[key]
- hash.delete(key)
- hash[key.to_sym] = value
- if hash[key.to_sym].instance_of? Hash
- hash[key.to_sym] = deep_symbolize_keys(hash[key.to_sym])
- end
- end
- hash
- end
-
+
+ end
  end
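
Taken together, the Cobweb changes above mean the class can be used on its own for one-off requests, as the new class comment says. A rough sketch; the content keys shown (:headers, :links) are the ones visible in this diff, the rest is illustrative:

    cobweb = Cobweb.new(:cache => 600)
    content = cobweb.get("http://www.pepsico.com")
    content[:headers]   # response headers, now deep-symbolized via HashUtil
    content[:links]     # link groups extracted by ContentLinkParser
    cobweb.head("http://www.pepsico.com")   # HEAD results are cached separately under a "head-" prefixed key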
@@ -3,8 +3,10 @@ require 'date'
  require 'ap'
  #require 'namespaced_redis'
 
+ # CobwebCrawler is a standalone crawler; it includes a built-in statistics monitor using Sinatra.
  class CobwebCrawler
 
+ # See README for more information on options available
  def initialize(options={})
  @options = options
 
@@ -31,6 +33,7 @@ class CobwebCrawler
  @cobweb = Cobweb.new(@options)
  end
 
+ # Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed the content hash and statistics hash.
  def crawl(base_url, crawl_options = {}, &block)
  @options[:base_url] = base_url unless @options.has_key? :base_url
 
@@ -107,7 +110,9 @@ class CobwebCrawler
 
  end
 
+ # Monkey patch a cobweb_starts_with? method into String
  class String
+ # Returns true if the string starts with the supplied value
  def cobweb_starts_with?(val)
  if self.length >= val.length
  self[0..val.length-1] == val
@@ -1,8 +1,10 @@
+ # Dummy resque job that executes at the end of the crawl if no crawl_finished_queue is specified
  class CobwebFinishedJob
  require "ap"
 
  @queue = :cobweb_finished_job
 
+ # perform method for resque to execute
  def self.perform(statistics)
  puts "Dummy Finished Job"
 
@@ -1,6 +1,8 @@
+
+ # CobwebLinks processes links to determine whether they are internal or external links
  class CobwebLinks
 
- # processes links supplied to it
+ # Initialises internal and external patterns and sets up regular expressions
  def initialize(options={})
  @options = options
 
@@ -15,6 +17,7 @@ class CobwebLinks
 
  end
 
+ # Returns true if the link is matched to an internal_url and not matched to an external_url
  def internal?(link)
  if @options[:debug]
  puts "--------------------------------"
@@ -27,6 +30,7 @@ class CobwebLinks
  !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
  end
 
+ # Returns true if the link is matched to an external_url or not matched to an internal_url
  def external?(link)
  if @options[:debug]
  puts "--------------------------------"
@@ -40,6 +44,7 @@ class CobwebLinks
  end
 
  private
+ # escapes characters with meaning in regular expressions and adds wildcard expression
  def escape_pattern_for_regex(pattern)
  pattern = pattern.gsub(".", "\\.")
  pattern = pattern.gsub("?", "\\?")
@@ -49,8 +54,10 @@ class CobwebLinks
  end
  end
 
+ # Exception raised for :internal_urls missing from CobwebLinks
  class InternalUrlsMissingError < Exception
  end
+ # Exception raised for :internal_urls being invalid from CobwebLinks
  class InvalidUrlsError < Exception
  end
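
Putting the internal?/external? comments above together, a small illustrative sketch; :internal_urls appears in this diff, while :external_urls and the sample patterns are assumptions:

    links = CobwebLinks.new(:internal_urls => ["http://www.pepsico.com/*"],
                            :external_urls => ["http://www.pepsico.com/careers/*"])
    links.internal?("http://www.pepsico.com/brands")         # => true
    links.external?("http://www.pepsico.com/careers/jobs")   # => true
    CobwebLinks.new({})   # per the comment above, raises InternalUrlsMissingError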
 
@@ -1,8 +1,10 @@
+ # Dummy resque process job that is run if no processing queue is specified
  class CobwebProcessJob
  require "ap"
 
  @queue = :cobweb_process_job
 
+ # Resque perform method
  def self.perform(content)
  content = HashHelper.symbolize_keys(content)
  puts "Dummy Processing for #{content[:url]}"
@@ -1,6 +1,9 @@
+ # CobwebVersion holds the current version of the gem
  class CobwebVersion
+
+ # Returns a string of the current version
  def self.version
- "0.0.54"
+ "0.0.55"
  end
 
  end
@@ -1,8 +1,10 @@
+ require "nokogiri"
 
+ # ContentLinkParser extracts links from HTML content and assigns them to a hash based on the location the link was found. The hash contents can be configured in options, but default to a sensible set.
+ # Links can also be returned regardless of where they were located, and can be filtered by scheme
  class ContentLinkParser
 
- require "nokogiri"
-
+ # Parses the content and absolutizes the urls based on url. Options can be set up to determine the links that are extracted.
  def initialize(url, content, options = {})
  @options = options
  @url = url
@@ -29,6 +31,7 @@ class ContentLinkParser
 
  end
 
+ # Returns a hash with arrays of links
  def link_data
  data = {}
  @options[:tags].keys.each do |key|
@@ -37,6 +40,7 @@ class ContentLinkParser
  data
  end
 
+ # Returns an array of all absolutized links, specify :valid_schemes in options to limit to certain schemes. Also filters repeating folders (ie if the crawler got in a link loop situation)
  def all_links(options = {})
  options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
  data = link_data
@@ -47,6 +51,7 @@ class ContentLinkParser
  links
  end
 
+ # Returns the type of links as a method rather than using the hash e.g. 'content_link_parser.images'
  def method_missing(m)
  if @options[:tags].keys.include?(m)
  links = []
@@ -60,6 +65,8 @@ class ContentLinkParser
  end
  end
 
+ private
+ # Processes the content to find links based on options[:tags]
  def find_matches(array, selector, attribute)
  if attribute.kind_of? String or attribute.kind_of? Symbol
  @doc.css(selector).each do |tag|
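
A short sketch of the ContentLinkParser interface described by the comments above; the HTML string is made up and the images accessor assumes an images entry in the default :tags configuration:

    parser = ContentLinkParser.new("http://www.pepsico.com",
                                   "<a href='/brands'>Brands</a> <img src='/logo.png'/>")
    parser.link_data                               # hash of link arrays keyed by where each link was found
    parser.all_links(:valid_schemes => [:http])    # flat, absolutized, scheme-filtered list
    parser.images                                  # per-tag-group accessor provided by method_missing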
@@ -1,3 +1,5 @@
+
+ # CrawlJob defines a resque job to perform the crawl
  class CrawlJob
 
  require "net/https"
@@ -7,10 +9,11 @@ class CrawlJob
 
  @queue = :cobweb_crawl_job
 
+ # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
  def self.perform(content_request)
 
  # change all hash keys to symbols
- content_request = self.deep_symbolize_keys(content_request)
+ content_request = HashUtil.deep_symbolize_keys(content_request)
 
  content_request[:redis_options] = {} unless content_request.has_key? :redis_options
  @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
@@ -81,12 +84,14 @@ class CrawlJob
 
  end
 
+ # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
  def self.finished(content_request)
  # finished
  @stats.end_crawl(content_request)
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
  end
-
+
+ # Enqueues the content to the processing queue set up in options
  def self.send_to_processing_queue(content, content_request)
  content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
  if content_request[:use_encoding_safe_process_job]
@@ -102,14 +107,17 @@ class CrawlJob
 
  private
 
+ # Returns true if the crawl count is within limits
  def self.within_crawl_limits?(crawl_limit)
  crawl_limit.nil? or @crawl_counter < crawl_limit.to_i
  end
 
+ # Returns true if the queue count is calculated to be still within limits when complete
  def self.within_queue_limits?(crawl_limit)
  within_crawl_limits?(crawl_limit) and (crawl_limit.nil? or (@queue_counter + @crawl_counter) < crawl_limit.to_i)
  end
 
+ # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
  def self.set_base_url(redis, content, content_request)
  if redis.get("base_url").nil?
  unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
@@ -120,6 +128,7 @@ class CrawlJob
  end
  end
 
+ # Enqueues content to the crawl_job queue
  def self.enqueue_content(content_request, link)
  new_request = content_request.clone
  new_request[:url] = link
@@ -129,37 +138,32 @@ class CrawlJob
  increment_queue_counter
  end
 
+ # Increments the queue counter and refreshes crawl counters
  def self.increment_queue_counter
  @redis.incr "queue-counter"
  refresh_counters
  end
+ # Increments the crawl counter and refreshes crawl counters
  def self.increment_crawl_counter
  @redis.incr "crawl-counter"
  refresh_counters
  end
+ # Decrements the queue counter and refreshes crawl counters
  def self.decrement_queue_counter
  @redis.decr "queue-counter"
  refresh_counters
  end
+ # Refreshes the crawl counters
  def self.refresh_counters
  @crawl_counter = @redis.get("crawl-counter").to_i
  @queue_counter = @redis.get("queue-counter").to_i
  end
+ # Sets the crawl counters based on the crawled and queued queues
  def self.reset_counters
  @redis.set("crawl-counter", @redis.smembers("crawled").count)
  @redis.set("queue-counter", @redis.smembers("queued").count)
  @crawl_counter = @redis.get("crawl-counter").to_i
  @queue_counter = @redis.get("queue-counter").to_i
  end
- def self.deep_symbolize_keys(hash)
- hash.keys.each do |key|
- value = hash[key]
- hash.delete(key)
- hash[key.to_sym] = value
- if hash[key.to_sym].instance_of? Hash
- hash[key.to_sym] = self.deep_symbolize_keys(hash[key.to_sym])
- end
- end
- hash
- end
+
  end
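
To make the two limit checks documented above concrete, a worked example with assumed counter values:

    # with crawl_limit = 100, @crawl_counter = 60 and @queue_counter = 45:
    #   within_crawl_limits?(100)   # => true  (60 < 100, this page can still be crawled)
    #   within_queue_limits?(100)   # => false (60 + 45 >= 100, so no further links are enqueued)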
@@ -1,7 +1,9 @@
+ # Process Job to resolve encoding issue
  class EncodingSafeProcessJob
 
  @queue = :encoding_safe_process_job
 
+ # Resque perform method
  def self.perform(content)
  clazz = const_get(content["processing_queue"])
  content["body"] = Base64.decode64(content["body"])
@@ -0,0 +1,16 @@
+ # Collection of utility methods for the Hash object
+ class HashUtil
+
+ # Returns a hash with the keys converted to symbols
+ def self.deep_symbolize_keys(hash)
+ hash.keys.each do |key|
+ value = hash[key]
+ hash.delete(key)
+ hash[key.to_sym] = value
+ if hash[key.to_sym].instance_of? Hash
+ hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
+ end
+ end
+ hash
+ end
+ end
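
This new HashUtil class consolidates the deep_symbolize_keys helpers previously duplicated in Cobweb, CrawlJob and Server (see the removals elsewhere in this diff). For example:

    HashUtil.deep_symbolize_keys({ "statistics" => { "current_status" => "Crawl Stopped" } })
    # => { :statistics => { :current_status => "Crawl Stopped" } }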
@@ -1,2 +1,3 @@
+ # Redirect Exception
  class RedirectError < Exception
  end
@@ -1,5 +1,7 @@
+ # Robots retrieves and processes the robots.txt file from the target server
  class Robots
 
+ # Processes the robots.txt file
  def initialize(url, file_name="robots.txt")
  uri = URI.parse(url)
  [uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
@@ -1,6 +1,7 @@
  require 'sinatra'
  require 'haml'
 
+ # Sinatra server to host the statistics for the CobwebCrawler
  class Server < Sinatra::Base
 
  set :views, settings.root + '/../views'
@@ -8,6 +9,7 @@ class Server < Sinatra::Base
  set :public_folder, settings.root + '/../public'
  enable :static
 
+ # Sinatra Dashboard
  get '/' do
  @full_redis = Redis.new
 
@@ -27,6 +29,7 @@ class Server < Sinatra::Base
  haml :home
  end
 
+ # Sinatra Crawl Detail
  get '/statistics/:crawl_id' do
  redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
 
@@ -58,6 +61,7 @@ class Server < Sinatra::Base
  haml :statistics
  end
 
+ # Starts the Sinatra server, and kills the processes when shutdown
  def self.start
  unless Server.running?
  thread = Thread.new do
@@ -72,21 +76,10 @@ class Server < Sinatra::Base
 
  end
 
- class HashUtil
- def self.deep_symbolize_keys(hash)
- hash.keys.each do |key|
- value = hash[key]
- hash.delete(key)
- hash[key.to_sym] = value
- if hash[key.to_sym].instance_of? Hash
- hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
- end
- end
- hash
- end
- end
-
+ # Monkey Patch of the Numeric class
  class Numeric
+
+ # Returns a human-readable format for a number representing a data size
  def to_human
  units = %w{B KB MB GB TB}
  ap self
@@ -1,10 +1,14 @@
+
+ # Stats class is the main statistics hub for monitoring crawls. Statistics can either be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
  class Stats
 
+ # Sets up redis usage for statistics
  def initialize(options)
  @full_redis = Redis.new(options[:redis_options])
  @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
  end
 
+ # Sets up the crawl in statistics
  def start_crawl(options)
  unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
  @full_redis.sadd "cobweb_crawls", options[:crawl_id]
@@ -15,12 +19,14 @@ class Stats
  @redis.hset "statistics", "current_status", "Crawl Starting..."
  end
 
+ # Removes the crawl from the running crawls and updates status
  def end_crawl(options)
  @full_redis.srem "cobweb_crawls", options[:crawl_id]
  @redis.hset "statistics", "current_status", "Crawl Stopped"
  @redis.del "crawl_details"
  end
 
+ # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
  def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
 
  @statistics = get_statistics
@@ -125,6 +131,41 @@ class Stats
  @statistics
  end
 
+ # Returns the statistics hash
+ def get_statistics
+
+ @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+ if @statistics[:status_counts].nil?
+ @statistics[:status_counts]
+ else
+ @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
+ end
+ if @statistics[:mime_counts].nil?
+ @statistics[:mime_counts]
+ else
+ @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
+ end
+ @statistics
+ end
+
+ # Sets the current status of the crawl
+ def update_status(status)
+ @redis.hset "statistics", "current_status", status
+ end
+
+ # Returns the current status of the crawl
+ def get_status
+ @redis.hget "statistics", "current_status"
+ end
+
+ # Sets totals for the end of the crawl (Not Used)
+ def set_totals
+ stats = get_statistics
+ stats[:crawled] = @redis.smembers "crawled"
+ end
+
+ private
+ # Records a time based statistic
  def record_time_stat(stat_name, value, type="minute", duration=60)
  key = DateTime.now.strftime("%Y-%m-%d %H:%M")
  if type == "hour"
@@ -142,6 +183,7 @@ class Stats
  end
  end
 
+ # Increments a time based statistic (eg pages per minute)
  def increment_time_stat(stat_name, type="minute", duration=60)
  key = DateTime.now.strftime("%Y-%m-%d %H:%M")
  if type == "hour"
@@ -162,35 +204,6 @@ class Stats
  end
  end
 
- def get_statistics
-
- @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
- if @statistics[:status_counts].nil?
- @statistics[:status_counts]
- else
- @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
- end
- if @statistics[:mime_counts].nil?
- @statistics[:mime_counts]
- else
- @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
- end
- @statistics
- end
-
- def update_status(status)
- @redis.hset "statistics", "current_status", status
- end
-
- def get_status
- @redis.hget "statistics", "current_status"
- end
-
- def set_totals
- stats = get_statistics
- stats[:crawled] = @redis.smembers "crawled"
- end
-
  end
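
A small sketch of reading crawl statistics through the methods moved above; the option keys and hash keys are the ones visible in this diff, the crawl id value is illustrative:

    stats = Stats.new(:redis_options => {}, :crawl_id => "my-crawl")
    stats.get_status                       # e.g. "Crawl Starting..." or "Crawl Stopped"
    stats.get_statistics[:status_counts]   # JSON-parsed counts, nil until recorded
    stats.get_statistics[:mime_counts]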
 
 
@@ -1,4 +1,6 @@
+ # Helper class to perform tasks on URIs
  class UriHelper
+ # Returns an Addressable::URI with the fragment section removed
  def self.join_no_fragment(content, link)
  new_link = Addressable::URI.join(content, link)
  new_link.fragment=nil
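
Given the Addressable::URI calls shown, the helper behaves along these lines (example values are illustrative):

    UriHelper.join_no_fragment("http://www.pepsico.com/brands/", "pepsi#history")
    # => Addressable::URI for "http://www.pepsico.com/brands/pepsi", with the fragment removed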
@@ -1,6 +1,7 @@
  require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
  require 'mock_redis'
 
+ # Sets up the environment as test so that exceptions are raised
  ENVIRONMENT = "test"
 
  RSpec.configure do |config|
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
- version: 0.0.54
+ version: 0.0.55
  prerelease:
  platform: ruby
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: resque
- requirement: &70234816199400 !ruby/object:Gem::Requirement
+ requirement: &70166180363640 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234816199400
+ version_requirements: *70166180363640
  - !ruby/object:Gem::Dependency
  name: redis
- requirement: &70234816197960 !ruby/object:Gem::Requirement
+ requirement: &70166180362480 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234816197960
+ version_requirements: *70166180362480
  - !ruby/object:Gem::Dependency
  name: nokogiri
- requirement: &70234816196900 !ruby/object:Gem::Requirement
+ requirement: &70166180361540 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234816196900
+ version_requirements: *70166180361540
  - !ruby/object:Gem::Dependency
  name: addressable
- requirement: &70234816195380 !ruby/object:Gem::Requirement
+ requirement: &70166180359680 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234816195380
+ version_requirements: *70166180359680
  - !ruby/object:Gem::Dependency
  name: rspec
- requirement: &70234816192860 !ruby/object:Gem::Requirement
+ requirement: &70166180357240 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234816192860
+ version_requirements: *70166180357240
  - !ruby/object:Gem::Dependency
  name: awesome_print
- requirement: &70234820117420 !ruby/object:Gem::Requirement
+ requirement: &70166180380020 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234820117420
+ version_requirements: *70166180380020
  - !ruby/object:Gem::Dependency
  name: sinatra
- requirement: &70234820114840 !ruby/object:Gem::Requirement
+ requirement: &70166180377140 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234820114840
+ version_requirements: *70166180377140
  - !ruby/object:Gem::Dependency
  name: thin
- requirement: &70234820127140 !ruby/object:Gem::Requirement
+ requirement: &70166180389240 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234820127140
+ version_requirements: *70166180389240
  - !ruby/object:Gem::Dependency
  name: haml
- requirement: &70234820126100 !ruby/object:Gem::Requirement
+ requirement: &70166180388200 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70234820126100
+ version_requirements: *70166180388200
  - !ruby/object:Gem::Dependency
  name: namespaced_redis
- requirement: &70234820125020 !ruby/object:Gem::Requirement
+ requirement: &70166180387040 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
  version: 1.0.2
  type: :runtime
  prerelease: false
- version_requirements: *70234820125020
+ version_requirements: *70166180387040
  description: Web Crawler that uses resque background job engine to allow you to cluster
  your crawl.
  email: stewart@rockwellcottage.com
@@ -146,6 +146,7 @@ files:
  - lib/content_link_parser.rb
  - lib/crawl_job.rb
  - lib/encoding_safe_process_job.rb
+ - lib/hash_util.rb
  - lib/redirect_error.rb
  - lib/robots.rb
  - lib/server.rb