cobweb 1.0.11 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,10 +1,11 @@
  
- h1. Cobweb v1.0.11
+ h1. Cobweb v1.0.12
  
  "@cobweb_gem":https://twitter.com/cobweb_gem
-
- !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
+ !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
  !https://gemnasium.com/stewartmckee/cobweb.png!
+ !https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
+
  
  h2. Intro
  
data/lib/cobweb.rb CHANGED
@@ -33,8 +33,14 @@ class Cobweb
  default_use_encoding_safe_process_job_to false
  default_follow_redirects_to true
  default_redirect_limit_to 10
- default_processing_queue_to "CobwebProcessJob"
- default_crawl_finished_queue_to "CobwebFinishedJob"
+ default_queue_system_to :resque
+ if @options[:queue_system] == :resque
+   default_processing_queue_to "CobwebProcessJob"
+   default_crawl_finished_queue_to "CobwebFinishedJob"
+ else
+   default_processing_queue_to "CrawlProcessWorker"
+   default_crawl_finished_queue_to "CrawlFinishedWorker"
+ end
  default_quiet_to true
  default_debug_to false
  default_cache_to 300
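
The new :queue_system option defaults to :resque and picks the matching default queue classes shown above. A minimal usage sketch, assuming all other options are left at their defaults (variable names are illustrative):

    # Resque (the default): work goes to CobwebProcessJob / CobwebFinishedJob
    resque_crawler  = Cobweb.new(:queue_system => :resque)

    # Sidekiq: the defaults switch to CrawlProcessWorker / CrawlFinishedWorker
    sidekiq_crawler = Cobweb.new(:queue_system => :sidekiq)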
@@ -49,7 +55,7 @@ class Cobweb
  default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
  default_valid_mime_types_to ["*/*"]
  default_raise_exceptions_to false
- default_store_refered_url_to false
+ default_store_inbound_links_to false
  
  end
  
@@ -80,7 +86,14 @@ class Cobweb
  
  # add internal_urls into redis
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
- Resque.enqueue(CrawlJob, request)
+ if @options[:queue_system] == :resque
+   Resque.enqueue(CrawlJob, request)
+ elsif @options[:queue_system] == :sidekiq
+   CrawlWorker.perform_async(request)
+ else
+   raise "Unknown queue system: #{content_request[:queue_system]}"
+ end
+
  request
  end
  
@@ -124,8 +137,13 @@ class Cobweb
  
  # check if it has already been cached
  if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
- puts "Cache hit for #{url}" unless @options[:quiet]
- content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+ if @options[:cache_type] == :crawl_based
+   puts "Cache hit for #{url}" unless @options[:quiet]
+   content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+ else
+   puts "Cache hit for #{url}" unless @options[:quiet]
+   content = HashUtil.deep_symbolize_keys(Marshal.load(full_redis.get(unique_id)))
+ end
  else
  # retrieve data
  #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -204,8 +222,13 @@ class Cobweb
  end
  # add content to cache if required
  if @options[:cache]
- redis.set(unique_id, Marshal.dump(content))
- redis.expire unique_id, @options[:cache].to_i
+ if @options[:cache_type] == :crawl_based
+   redis.set(unique_id, Marshal.dump(content))
+   redis.expire unique_id, @options[:cache].to_i
+ else
+   full_redis.set(unique_id, Marshal.dump(content))
+   full_redis.expire unique_id, @options[:cache].to_i
+ end
  end
  rescue RedirectError => e
  raise e if @options[:raise_exceptions]
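
Together with the cache-read change earlier in this diff, :cache_type now decides which store is written to and expired: :crawl_based uses the crawl's redis, :full uses full_redis. A hedged sketch of the two modes, assuming :cache is the expiry in seconds (default 300 per the options above; the values here are illustrative):

    # cache shared across crawls, expiring after 10 minutes
    full_cache_crawler  = Cobweb.new(:cache => 600, :cache_type => :full)

    # cache scoped to the current crawl only
    crawl_cache_crawler = Cobweb.new(:cache => 600, :cache_type => :crawl_based)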
@@ -96,14 +96,12 @@ class CobwebCrawler
  @redis.sadd "crawled", url.to_s
  @redis.incr "crawl-counter"
  
- internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
+ document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
  
  # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
  cobweb_links = CobwebLinks.new(@options)
  
- internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
-
- all_internal_links = internal_links
+ internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
  
  # reject the link if we've crawled it or queued it
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
@@ -120,12 +118,13 @@ class CobwebCrawler
  @queue_counter += 1
  end
  
- if @options[:store_refered_url]
-   all_internal_links.each do |link|
-     @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
+ if @options[:store_inbound_links]
+   document_links.each do |target_link|
+     target_uri = UriHelper.parse(target_link)
+     @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
  end
  end
-
+
  @crawl_counter = @redis.scard("crawled").to_i
  @queue_counter = @redis.scard("queued").to_i
  
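With :store_inbound_links (the renamed :store_refered_url) enabled, every link found on a page is recorded in a Redis set keyed by the MD5 of the normalised target URL. A sketch of reading those sets back, assuming direct access to the Redis instance the crawler was given; note the crawler writes through NamespacedRedis, so the key must be prefixed with the crawl's namespace in your setup:

    require 'redis'
    require 'digest'

    redis  = Redis.new
    target = "http://example.com/page"   # hypothetical crawled URL
    key    = "inbound_links_#{Digest::MD5.hexdigest(target)}"

    redis.smembers(key)   # => URLs of pages linking to target, once the namespace prefix is applied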
@@ -6,7 +6,7 @@ class CobwebProcessJob
  
  # Resque perform method
  def self.perform(content)
- content = HashHelper.symbolize_keys(content)
+ content = HashUtil.deep_symbolize_keys(content)
  puts "Dummy Processing for #{content[:url]}"
  
  #ap content.keys
@@ -3,7 +3,7 @@ class CobwebVersion
  
  # Returns a string of the current version
  def self.version
- "1.0.11"
+ "1.0.12"
  end
  
  end
data/lib/crawl.rb CHANGED
@@ -88,11 +88,12 @@ module CobwebModule
  
  @cobweb_links = CobwebLinks.new(@options)
  if within_queue_limits?
- internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
+ document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
  #get rid of duplicate links in the same page.
- internal_links.uniq!
+ document_links.uniq!
+
  # select the link if its internal
- internal_links.select! { |link| @cobweb_links.internal?(link) }
+ internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
  
  # reject the link if we've crawled it or queued it
  internal_links.reject! { |link| @redis.sismember("crawled", link) }
@@ -111,6 +112,13 @@ module CobwebModule
  end
  end
  end
+
+ if @options[:store_inbound_links]
+   document_links.each do |link|
+     uri = URI.parse(link)
+     @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
+   end
+ end
  end
  end
  
@@ -180,7 +188,6 @@ module CobwebModule
  
  def finished
  set_first_to_finish
- debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
  @stats.end_crawl(@options)
  end
  
@@ -0,0 +1,27 @@
+ require 'sidekiq'
+ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+ # If your client is single-threaded, we just need a single connection in our Redis connection pool
+ #Sidekiq.configure_client do |config|
+ #  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+ #end
+
+ # Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+ #Sidekiq.configure_server do |config|
+ #  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+ #end
+
+ class CrawlFinishedWorker
+
+   include Sidekiq::Worker
+
+   sidekiq_options queue: "crawl_finished_worker"
+
+
+   def perform(statistics)
+     puts "Dummy Finished Job"
+
+     ap statistics
+
+   end
+ end
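
The Sidekiq defaults set earlier in this diff point at CrawlProcessWorker and CrawlFinishedWorker, and CrawlHelper (below) looks the configured queue class up by name and calls perform_async on it, so a processing worker follows the same shape as the class above. A minimal sketch of a custom processing worker; the class name, queue name, and option hash are illustrative, not part of this release:

    require 'sidekiq'

    class MyProcessWorker
      include Sidekiq::Worker
      sidekiq_options queue: "my_process_worker"

      def perform(content)
        # Sidekiq serialises arguments as JSON, so hash keys arrive as strings
        puts "Processing #{content['url']}"
      end
    end

    # Cobweb.new(:queue_system => :sidekiq, :processing_queue => "MyProcessWorker")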
@@ -0,0 +1,250 @@
+ class CrawlHelper
+
+   require "net/https"
+   require "uri"
+   require "redis"
+   require 'namespaced_redis'
+
+   def self.crawl_page(content_request)
+     # change all hash keys to symbols
+     content_request = HashUtil.deep_symbolize_keys(content_request)
+     @content_request = content_request
+
+     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+     content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+     content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+     content_request[:queue_system] = content_request[:queue_system].to_sym
+
+     @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
+     @stats = Stats.new(content_request)
+
+     @debug = content_request[:debug]
+
+     decrement_queue_counter
+
+     # check we haven't crawled this url before
+     unless @redis.sismember "crawled", content_request[:url]
+       # if there is no limit or we're still under it lets get the url
+       if within_crawl_limits?(content_request[:crawl_limit])
+         content = Cobweb.new(content_request).get(content_request[:url], content_request)
+         if content_request[:url] == @redis.get("original_base_url")
+           @redis.set("crawled_base_url", content[:base_url])
+         end
+         if is_permitted_type(content)
+           begin
+             # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+             @redis.srem "queued", content_request[:url]
+             @redis.sadd "crawled", content_request[:url]
+             @redis.srem "queued", content[:url]
+             @redis.sadd "crawled", content[:url]
+             # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+             if content_request[:crawl_limit_by_page]
+               if content[:mime_type].match("text/html")
+                 increment_crawl_started_counter
+               end
+             else
+               increment_crawl_started_counter
+             end
+
+             ## update statistics
+             @stats.update_status("Crawling #{content_request[:url]}...")
+             @stats.update_statistics(content)
+
+             # set the base url if this is the first page
+             set_base_url @redis, content, content_request
+
+             @cobweb_links = CobwebLinks.new(content_request)
+             if within_queue_limits?(content_request[:crawl_limit])
+               internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
+
+               # select the link if its internal
+               internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+               # reject the link if we've crawled it or queued it
+               internal_links.reject! { |link| @redis.sismember("crawled", link) }
+               internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+               internal_links.each do |link|
+                 enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+               end
+             end
+
+             # enqueue to processing queue
+             send_to_processing_queue(content, content_request)
+
+             #if the enqueue counter has been requested update that
+             if content_request.has_key? :enqueue_counter_key
+               enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+               current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+               enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+             end
+
+           ensure
+             #update the queued and crawled lists if we are within the crawl limits.
+
+             # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+             # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+             if content_request[:crawl_limit_by_page]
+               if content[:mime_type].match("text/html")
+                 increment_crawl_counter
+               end
+             else
+               increment_crawl_counter
+             end
+             puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
+           end
+         else
+           puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+         end
+       else
+         puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
+       end
+
+     else
+       @redis.srem "queued", content_request[:url]
+       puts "Already crawled #{content_request[:url]}" if content_request[:debug]
+     end
+
+     # if there's nothing left queued or the crawled limit has been reached
+     refresh_counters
+     if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+       if @queue_counter+@crawl_started_counter-@crawl_counter == 0
+         finished(content_request)
+       end
+     elsif (@queue_counter +@crawl_started_counter-@crawl_counter)== 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+       finished(content_request)
+     end
+
+   end
+
+   # Sets the crawl status to 'Crawl Finished' and enqueues the crawl finished job
+   def self.finished(content_request)
+     # finished
+     if @redis.hget("statistics", "current_status")!= "Crawl Finished"
+       ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
+       @stats.end_crawl(content_request)
+
+       additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
+       additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+       additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+       if content_request[:queue_system] == :resque
+         Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
+       elsif content_request[:queue_system] == :sidekiq
+         puts "Queueing Finished on Sidekiq"
+         const_get(content_request[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
+       else
+         raise "Unknown queue system: #{content_request[:queue_system]}"
+       end
+     else
+       # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
+     end
+   end
+
+   # Enqueues the content to the processing queue setup in options
+   def self.send_to_processing_queue(content, content_request)
+     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+     if content_request[:direct_call_process_job]
+       clazz = const_get(content_request[:processing_queue])
+       clazz.perform(content_to_send)
+     elsif content_request[:use_encoding_safe_process_job]
+       content_to_send[:body] = Base64.encode64(content[:body])
+       content_to_send[:processing_queue] = content_request[:processing_queue]
+       Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+     else
+       if content_request[:queue_system] == :resque
+         Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
+       elsif content_request[:queue_system] == :sidekiq
+         puts "Queueing on Sidekiq"
+         const_get(content_request[:processing_queue]).perform_async(content_to_send)
+       else
+         raise "Unknown queue system: #{content_request[:queue_system]}"
+       end
+     end
+     puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
+   end
+
+   private
+
+   # Helper method to determine if this content is to be processed or not
+   def self.is_permitted_type(content)
+     @content_request[:valid_mime_types].each do |mime_type|
+       return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+     end
+     false
+   end
+
+   # Returns true if the crawl count is within limits
+   def self.within_crawl_limits?(crawl_limit)
+     refresh_counters
+     crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
+   end
+
+   # Returns true if the queue count is calculated to be still within limits when complete
+   def self.within_queue_limits?(crawl_limit)
+     refresh_counters
+     (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
+   end
+
+   # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+   def self.set_base_url(redis, content, content_request)
+     if redis.get("base_url").nil?
+       unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+         uri = Addressable::URI.parse(content[:redirect_through].last)
+         redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+       end
+       redis.set("base_url", content[:url])
+     end
+   end
+
+   # Enqueues content to the crawl_job queue
+   def self.enqueue_content(content_request, link)
+     new_request = content_request.clone
+     new_request[:url] = link
+     new_request[:parent] = content_request[:url]
+     if content_request[:queue_system] == :resque
+       Resque.enqueue(CrawlJob, new_request)
+     elsif content_request[:queue_system] == :sidekiq
+       puts "Queueing content on Sidekiq"
+       CrawlWorker.perform_async(new_request)
+     else
+       raise "Unknown queue system: #{content_request[:queue_system]}"
+     end
+     @redis.sadd "queued", link
+     increment_queue_counter
+   end
+
+   # Increments the queue counter and refreshes crawl counters
+   def self.increment_queue_counter
+     @redis.incr "queue-counter"
+     refresh_counters
+   end
+   # Increments the crawl counter and refreshes crawl counters
+   def self.increment_crawl_counter
+     @redis.incr "crawl-counter"
+     refresh_counters
+   end
+   def self.increment_crawl_started_counter
+     @redis.incr "crawl-started-counter"
+     refresh_counters
+   end
+   # Decrements the queue counter and refreshes crawl counters
+   def self.decrement_queue_counter
+     @redis.decr "queue-counter"
+     refresh_counters
+   end
+   # Refreshes the crawl counters
+   def self.refresh_counters
+     @crawl_counter = @redis.get("crawl-counter").to_i
+     @crawl_started_counter = @redis.get("crawl-started-counter").to_i
+     @queue_counter = @redis.get("queue-counter").to_i
+   end
+
+   def self.print_counters
+     puts counters
+   end
+
+   def self.counters
+     "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+   end
+ end
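
The Sidekiq crawl worker itself (CrawlWorker, referenced via perform_async earlier in this diff) is not shown here, but CrawlHelper.crawl_page takes the same request hash that gets enqueued when a crawl starts. A hedged sketch of how such a worker could delegate to the helper; the class body and queue name are assumptions for illustration, not taken from this release:

    require 'sidekiq'

    class CrawlWorker
      include Sidekiq::Worker
      sidekiq_options queue: "crawl_worker"

      def perform(content_request)
        # hand the page-fetching and link-queueing logic to CrawlHelper
        CrawlHelper.crawl_page(content_request)
      end
    end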