cobweb 1.0.11 → 1.0.12

data/README.textile CHANGED
@@ -1,10 +1,11 @@
 
- h1. Cobweb v1.0.11
+ h1. Cobweb v1.0.12
 
  "@cobweb_gem":https://twitter.com/cobweb_gem
-
- !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
+ !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
  !https://gemnasium.com/stewartmckee/cobweb.png!
+ !https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
+
 
  h2. Intro
 
data/lib/cobweb.rb CHANGED
@@ -33,8 +33,14 @@ class Cobweb
   default_use_encoding_safe_process_job_to false
   default_follow_redirects_to true
   default_redirect_limit_to 10
-  default_processing_queue_to "CobwebProcessJob"
-  default_crawl_finished_queue_to "CobwebFinishedJob"
+  default_queue_system_to :resque
+  if @options[:queue_system] == :resque
+    default_processing_queue_to "CobwebProcessJob"
+    default_crawl_finished_queue_to "CobwebFinishedJob"
+  else
+    default_processing_queue_to "CrawlProcessWorker"
+    default_crawl_finished_queue_to "CrawlFinishedWorker"
+  end
   default_quiet_to true
   default_debug_to false
   default_cache_to 300
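
The new :queue_system option defaults to :resque; when it is set to :sidekiq, the processing and finished queues default to the CrawlProcessWorker and CrawlFinishedWorker classes instead of the Resque jobs. A minimal sketch of driving a crawl through Sidekiq, assuming the gem's usual Cobweb.new(options).start(url) entry point and that the worker classes are loadable:

    require 'cobweb'

    # Option names come from the defaults above; the values are illustrative only.
    crawler = Cobweb.new(
      :queue_system         => :sidekiq,
      :processing_queue     => "CrawlProcessWorker",   # Sidekiq default shown above
      :crawl_finished_queue => "CrawlFinishedWorker",
      :crawl_limit          => 100
    )
    crawler.start("http://example.com/")   # enqueues a CrawlWorker job via perform_async
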
@@ -49,7 +55,7 @@ class Cobweb
   default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
   default_valid_mime_types_to ["*/*"]
   default_raise_exceptions_to false
-  default_store_refered_url_to false
+  default_store_inbound_links_to false
 
 end
 
@@ -80,7 +86,14 @@ class Cobweb
 
   # add internal_urls into redis
   @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
-  Resque.enqueue(CrawlJob, request)
+  if @options[:queue_system] == :resque
+    Resque.enqueue(CrawlJob, request)
+  elsif @options[:queue_system] == :sidekiq
+    CrawlWorker.perform_async(request)
+  else
+    raise "Unknown queue system: #{content_request[:queue_system]}"
+  end
+
   request
 end
 
@@ -124,8 +137,13 @@ class Cobweb
 
   # check if it has already been cached
   if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
-    puts "Cache hit for #{url}" unless @options[:quiet]
-    content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+    if @options[:cache_type] == :crawl_based
+      puts "Cache hit for #{url}" unless @options[:quiet]
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+    else
+      puts "Cache hit for #{url}" unless @options[:quiet]
+      content = HashUtil.deep_symbolize_keys(Marshal.load(full_redis.get(unique_id)))
+    end
   else
     # retrieve data
     #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -204,8 +222,13 @@ class Cobweb
   end
   # add content to cache if required
   if @options[:cache]
-    redis.set(unique_id, Marshal.dump(content))
-    redis.expire unique_id, @options[:cache].to_i
+    if @options[:cache_type] == :crawl_based
+      redis.set(unique_id, Marshal.dump(content))
+      redis.expire unique_id, @options[:cache].to_i
+    else
+      full_redis.set(unique_id, Marshal.dump(content))
+      full_redis.expire unique_id, @options[:cache].to_i
+    end
   end
 rescue RedirectError => e
   raise e if @options[:raise_exceptions]
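
Cached pages are now written to the store that matches :cache_type: :crawl_based entries go to the crawl-scoped redis, while :full entries go to full_redis and so outlive a single crawl. A hedged configuration sketch (the URL is a placeholder; the Cobweb#get entry point is assumed from the surrounding hunks):

    # :cache is the TTL in seconds (default 300); :cache_type is :crawl_based (default) or :full.
    cobweb = Cobweb.new(:cache => 600, :cache_type => :full, :quiet => false)
    page = cobweb.get("http://example.com/")
    cobweb.get("http://example.com/")   # within the TTL this prints "Cache hit for ..."
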
@@ -96,14 +96,12 @@ class CobwebCrawler
   @redis.sadd "crawled", url.to_s
   @redis.incr "crawl-counter"
 
-  internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
+  document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
 
   # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
   cobweb_links = CobwebLinks.new(@options)
 
-  internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
-
-  all_internal_links = internal_links
+  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
 
   # reject the link if we've crawled it or queued it
   internal_links.reject!{|link| @redis.sismember("crawled", link)}
@@ -120,12 +118,13 @@ class CobwebCrawler
     @queue_counter += 1
   end
 
-  if @options[:store_refered_url]
-    all_internal_links.each do |link|
-      @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
+  if @options[:store_inbound_links]
+    document_links.each do |target_link|
+      target_uri = UriHelper.parse(target_link)
+      @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
     end
   end
-
+
   @crawl_counter = @redis.scard("crawled").to_i
   @queue_counter = @redis.scard("queued").to_i
 
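
The :store_refered_url option has been renamed to :store_inbound_links, and the crawler now keys each inbound-link set by the MD5 of the UriHelper-normalised target URL rather than the raw link string. A rough sketch of turning it on and reading the data back, assuming the CobwebCrawler#crawl entry point and that the sets live in the crawler's namespaced Redis:

    require 'digest'

    crawler = CobwebCrawler.new(:internal_urls => ["http://example.com/*"],
                                :store_inbound_links => true)
    crawler.crawl("http://example.com/")

    # Pages linking to a given target are stored in "inbound_links_<md5 of target url>";
    # the real key uses the UriHelper-normalised form of the target.
    key = "inbound_links_#{Digest::MD5.hexdigest("http://example.com/about")}"
    # redis.smembers(key)  # => e.g. ["http://example.com/"]
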
@@ -6,7 +6,7 @@ class CobwebProcessJob
 
   # Resque perform method
   def self.perform(content)
-    content = HashHelper.symbolize_keys(content)
+    content = HashUtil.deep_symbolize_keys(content)
     puts "Dummy Processing for #{content[:url]}"
 
     #ap content.keys
@@ -3,7 +3,7 @@ class CobwebVersion
 
   # Returns a string of the current version
   def self.version
-    "1.0.11"
+    "1.0.12"
   end
 
 end
data/lib/crawl.rb CHANGED
@@ -88,11 +88,12 @@ module CobwebModule
 
   @cobweb_links = CobwebLinks.new(@options)
   if within_queue_limits?
-    internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
+    document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
     #get rid of duplicate links in the same page.
-    internal_links.uniq!
+    document_links.uniq!
+
     # select the link if its internal
-    internal_links.select! { |link| @cobweb_links.internal?(link) }
+    internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
 
     # reject the link if we've crawled it or queued it
     internal_links.reject! { |link| @redis.sismember("crawled", link) }
@@ -111,6 +112,13 @@ module CobwebModule
       end
     end
   end
+
+  if @options[:store_inbound_links]
+    document_links.each do |link|
+      uri = URI.parse(link)
+      @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
+    end
+  end
 end
 end
 
@@ -180,7 +188,6 @@ module CobwebModule
 
   def finished
     set_first_to_finish
-    debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
     @stats.end_crawl(@options)
   end
 
@@ -0,0 +1,27 @@
+ require 'sidekiq'
+ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+ # If your client is single-threaded, we just need a single connection in our Redis connection pool
+ #Sidekiq.configure_client do |config|
+ #  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+ #end
+
+ # Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+ #Sidekiq.configure_server do |config|
+ #  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+ #end
+
+ class CrawlFinishedWorker
+
+   include Sidekiq::Worker
+
+   sidekiq_options queue: "crawl_finished_worker"
+
+
+   def perform(statistics)
+     puts "Dummy Finished Job"
+
+     ap statistics
+
+   end
+ end
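
This CrawlFinishedWorker is only the Sidekiq default for :crawl_finished_queue; CrawlHelper resolves that option with const_get and calls perform_async on it, so any Sidekiq worker whose class name is passed as a string will do. A sketch with MyCrawlFinished as a hypothetical replacement:

    require 'sidekiq'

    class MyCrawlFinished
      include Sidekiq::Worker
      sidekiq_options queue: "crawl_finished_worker"

      # statistics is the crawl's statistics hash merged with :crawl_id,
      # :crawled_base_url and (optionally) :source_id, per CrawlHelper#finished;
      # key types depend on Sidekiq's JSON round-trip.
      def perform(statistics)
        puts "Crawl finished: #{statistics.inspect}"
      end
    end

    # Cobweb.new(:queue_system => :sidekiq, :crawl_finished_queue => "MyCrawlFinished").start(url)
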
@@ -0,0 +1,250 @@
+ class CrawlHelper
+
+   require "net/https"
+   require "uri"
+   require "redis"
+   require 'namespaced_redis'
+
+   def self.crawl_page(content_request)
+     # change all hash keys to symbols
+     content_request = HashUtil.deep_symbolize_keys(content_request)
+     @content_request = content_request
+
+     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+     content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+     content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+     content_request[:queue_system] = content_request[:queue_system].to_sym
+
+     @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
+     @stats = Stats.new(content_request)
+
+     @debug = content_request[:debug]
+
+     decrement_queue_counter
+
+     # check we haven't crawled this url before
+     unless @redis.sismember "crawled", content_request[:url]
+       # if there is no limit or we're still under it lets get the url
+       if within_crawl_limits?(content_request[:crawl_limit])
+         content = Cobweb.new(content_request).get(content_request[:url], content_request)
+         if content_request[:url] == @redis.get("original_base_url")
+           @redis.set("crawled_base_url", content[:base_url])
+         end
+         if is_permitted_type(content)
+           begin
+             # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+             @redis.srem "queued", content_request[:url]
+             @redis.sadd "crawled", content_request[:url]
+             @redis.srem "queued", content[:url]
+             @redis.sadd "crawled", content[:url]
+             # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+             if content_request[:crawl_limit_by_page]
+               if content[:mime_type].match("text/html")
+                 increment_crawl_started_counter
+               end
+             else
+               increment_crawl_started_counter
+             end
+
+             ## update statistics
+             @stats.update_status("Crawling #{content_request[:url]}...")
+             @stats.update_statistics(content)
+
+             # set the base url if this is the first page
+             set_base_url @redis, content, content_request
+
+             @cobweb_links = CobwebLinks.new(content_request)
+             if within_queue_limits?(content_request[:crawl_limit])
+               internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
+
+               # select the link if its internal
+               internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+               # reject the link if we've crawled it or queued it
+               internal_links.reject! { |link| @redis.sismember("crawled", link) }
+               internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+               internal_links.each do |link|
+                 enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+               end
+             end
+
+             # enqueue to processing queue
+             send_to_processing_queue(content, content_request)
+
+             #if the enqueue counter has been requested update that
+             if content_request.has_key? :enqueue_counter_key
+               enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+               current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+               enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+             end
+
+           ensure
+             #update the queued and crawled lists if we are within the crawl limits.
+
+             # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+             # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+             if content_request[:crawl_limit_by_page]
+               if content[:mime_type].match("text/html")
+                 increment_crawl_counter
+               end
+             else
+               increment_crawl_counter
+             end
+             puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
+           end
+         else
+           puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+         end
+       else
+         puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
+       end
+
+     else
+       @redis.srem "queued", content_request[:url]
+       puts "Already crawled #{content_request[:url]}" if content_request[:debug]
+     end
+
+     # if there's nothing left queued or the crawled limit has been reached
+     refresh_counters
+     if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+       if @queue_counter+@crawl_started_counter-@crawl_counter == 0
+         finished(content_request)
+       end
+     elsif (@queue_counter +@crawl_started_counter-@crawl_counter)== 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+       finished(content_request)
+     end
+
+   end
119
+
120
+ # Sets the crawl status to 'Crawl Finished' and enqueues the crawl finished job
121
+ def self.finished(content_request)
122
+ # finished
123
+ if @redis.hget("statistics", "current_status")!= "Crawl Finished"
124
+ ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
125
+ @stats.end_crawl(content_request)
126
+
127
+ additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
128
+ additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
129
+ additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
130
+
131
+ if content_request[:queue_system] == :resque
132
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
133
+ elsif content_request[:queue_system] == :sidekiq
134
+ puts "Queueing Finished on Sidekiq"
135
+ const_get(content_request[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
136
+ else
137
+ raise "Unknown queue system: #{content_request[:queue_system]}"
138
+ end
139
+ else
140
+ # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
141
+ end
142
+ end
143
+
144
+ # Enqueues the content to the processing queue setup in options
145
+ def self.send_to_processing_queue(content, content_request)
146
+ content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
147
+ if content_request[:direct_call_process_job]
148
+ clazz = const_get(content_request[:processing_queue])
149
+ clazz.perform(content_to_send)
150
+ elsif content_request[:use_encoding_safe_process_job]
151
+ content_to_send[:body] = Base64.encode64(content[:body])
152
+ content_to_send[:processing_queue] = content_request[:processing_queue]
153
+ Resque.enqueue(EncodingSafeProcessJob, content_to_send)
154
+ else
155
+ if content_request[:queue_system] == :resque
156
+ Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
157
+ elsif content_request[:queue_system] == :sidekiq
158
+ puts "Queueing on Sidekiq"
159
+ const_get(content_request[:processing_queue]).perform_async(content_to_send)
160
+ else
161
+ raise "Unknown queue system: #{content_request[:queue_system]}"
162
+ end
163
+ end
164
+ puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
165
+ end
+
+   private
+
+   # Helper method to determine if this content is to be processed or not
+   def self.is_permitted_type(content)
+     @content_request[:valid_mime_types].each do |mime_type|
+       return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+     end
+     false
+   end
+
+   # Returns true if the crawl count is within limits
+   def self.within_crawl_limits?(crawl_limit)
+     refresh_counters
+     crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
+   end
+
+   # Returns true if the queue count is calculated to be still within limits when complete
+   def self.within_queue_limits?(crawl_limit)
+     refresh_counters
+     (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
+   end
+
+   # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+   def self.set_base_url(redis, content, content_request)
+     if redis.get("base_url").nil?
+       unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+         uri = Addressable::URI.parse(content[:redirect_through].last)
+         redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+       end
+       redis.set("base_url", content[:url])
+     end
+   end
+
+   # Enqueues content to the crawl_job queue
+   def self.enqueue_content(content_request, link)
+     new_request = content_request.clone
+     new_request[:url] = link
+     new_request[:parent] = content_request[:url]
+     if content_request[:queue_system] == :resque
+       Resque.enqueue(CrawlJob, new_request)
+     elsif content_request[:queue_system] == :sidekiq
+       puts "Queueing content on Sidekiq"
+       CrawlWorker.perform_async(new_request)
+     else
+       raise "Unknown queue system: #{content_request[:queue_system]}"
+     end
+     @redis.sadd "queued", link
+     increment_queue_counter
+   end
+
+   # Increments the queue counter and refreshes crawl counters
+   def self.increment_queue_counter
+     @redis.incr "queue-counter"
+     refresh_counters
+   end
+   # Increments the crawl counter and refreshes crawl counters
+   def self.increment_crawl_counter
+     @redis.incr "crawl-counter"
+     refresh_counters
+   end
+   def self.increment_crawl_started_counter
+     @redis.incr "crawl-started-counter"
+     refresh_counters
+   end
+   # Decrements the queue counter and refreshes crawl counters
+   def self.decrement_queue_counter
+     @redis.decr "queue-counter"
+     refresh_counters
+   end
+   # Refreshes the crawl counters
+   def self.refresh_counters
+     @crawl_counter = @redis.get("crawl-counter").to_i
+     @crawl_started_counter = @redis.get("crawl-started-counter").to_i
+     @queue_counter = @redis.get("queue-counter").to_i
+   end
+
+   def self.print_counters
+     puts counters
+   end
+
+   def self.counters
+     "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+   end
+ end
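
CrawlHelper also carries over the optional enqueue counter: when :enqueue_counter_key is supplied, every page sent for processing increments a field in a Redis hash under the :enqueue_counter_namespace namespace. A loosely sketched usage, with placeholder names and with the "namespace:key" prefixing being an assumption about how NamespacedRedis stores its keys:

    require 'redis'

    options = {
      :queue_system              => :sidekiq,
      :enqueue_counter_namespace => "my_app",        # hypothetical namespace
      :enqueue_counter_key       => "crawl_counts",  # hypothetical hash key
      :enqueue_counter_field     => "example.com"    # field bumped once per processed page
    }
    # Cobweb.new(options).start("http://example.com/")

    redis = Redis.new
    puts redis.hget("my_app:crawl_counts", "example.com")   # pages sent for processing so far
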