cobweb 0.0.59 → 0.0.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.59
2
+ h1. Cobweb v0.0.60
3
3
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
4
4
 
5
5
  h2. Intro
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.59"
6
+ "0.0.60"
7
7
  end
8
8
 
9
9
  end
@@ -34,57 +34,67 @@ class CrawlJob
34
34
  if is_permitted_type(content)
35
35
  # if there is no limit or we're still under it lets get the url
36
36
  if within_crawl_limits?(content_request[:crawl_limit])
37
- #update the queued and crawled lists if we are within the crawl limits.
38
- @redis.srem "queued", content_request[:url]
39
- @redis.sadd "crawled", content_request[:url]
40
- # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
41
- if content_request[:crawl_limit_by_page]
42
- if content[:mime_type].match("text/html")
43
- increment_crawl_counter
37
+ begin
38
+ @redis.srem "queued", content_request[:url]
39
+ @redis.sadd "crawled", content_request[:url]
40
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
41
+ if content_request[:crawl_limit_by_page]
42
+ if content[:mime_type].match("text/html")
43
+ increment_crawl_started_counter
44
+ end
45
+ else
44
46
  increment_crawl_started_counter
45
47
  end
46
- else
47
- increment_crawl_counter
48
- increment_crawl_started_counter
49
- end
50
-
51
- ## update statistics
52
- @stats.update_status("Crawling #{content_request[:url]}...")
53
- @stats.update_statistics(content)
54
-
55
- # set the base url if this is the first page
56
- set_base_url @redis, content, content_request
57
-
58
- @cobweb_links = CobwebLinks.new(content_request)
59
- if within_queue_limits?(content_request[:crawl_limit])
60
- internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
61
-
62
- # select the link if its internal
63
- internal_links.select!{|link| @cobweb_links.internal?(link)}
64
-
65
- # reject the link if we've crawled it or queued it
66
- internal_links.reject!{|link| @redis.sismember("crawled", link)}
67
- internal_links.reject!{|link| @redis.sismember("queued", link)}
68
-
69
- internal_links.each do |link|
70
- enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
48
+
49
+ ## update statistics
50
+ @stats.update_status("Crawling #{content_request[:url]}...")
51
+ @stats.update_statistics(content)
52
+
53
+ # set the base url if this is the first page
54
+ set_base_url @redis, content, content_request
55
+
56
+ @cobweb_links = CobwebLinks.new(content_request)
57
+ if within_queue_limits?(content_request[:crawl_limit])
58
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
59
+
60
+ # select the link if its internal
61
+ internal_links.select! { |link| @cobweb_links.internal?(link) }
62
+
63
+ # reject the link if we've crawled it or queued it
64
+ internal_links.reject! { |link| @redis.sismember("crawled", link) }
65
+ internal_links.reject! { |link| @redis.sismember("queued", link) }
66
+
67
+ internal_links.each do |link|
68
+ enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
69
+ end
71
70
  end
72
- end
73
-
74
- # enqueue to processing queue
75
- send_to_processing_queue(content, content_request)
76
-
77
- #if the enqueue counter has been requested update that
78
- if content_request.has_key? :enqueue_counter_key
79
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
80
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
81
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
82
- end
83
71
 
84
- # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
85
- # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
86
- #increment_crawl_counter
87
- puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
72
+ # enqueue to processing queue
73
+ send_to_processing_queue(content, content_request)
74
+
75
+ #if the enqueue counter has been requested update that
76
+ if content_request.has_key? :enqueue_counter_key
77
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
78
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
79
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
80
+ end
81
+
82
+ ensure
83
+ #update the queued and crawled lists if we are within the crawl limits.
84
+
85
+ # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
86
+ # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
87
+ if content_request[:crawl_limit_by_page]
88
+ if content[:mime_type].match("text/html")
89
+ increment_crawl_counter
90
+ end
91
+ else
92
+ increment_crawl_counter
93
+ end
94
+ puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
95
+ end
96
+ else
97
+ puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
88
98
  end
89
99
  else
90
100
  puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
@@ -96,11 +106,12 @@ class CrawlJob
96
106
  end
97
107
 
98
108
  # if there's nothing left queued or the crawled limit has been reached
109
+ refresh_counters
99
110
  if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
100
- if @queue_counter == 0
111
+ if @queue_counter+@crawl_started_counter-@crawl_counter == 0
101
112
  finished(content_request)
102
113
  end
103
- elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
114
+ elsif (@queue_counter +@crawl_started_counter-@crawl_counter)== 0 || @crawl_counter >= content_request[:crawl_limit].to_i
104
115
  finished(content_request)
105
116
  end
106
117
 
@@ -109,9 +120,13 @@ class CrawlJob
109
120
  # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
110
121
  def self.finished(content_request)
111
122
  # finished
112
- ap "FINISHED"
113
- @stats.end_crawl(content_request)
114
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
123
+ if @redis.hget("statistics", "current_status")!= "Crawl Stopped"
124
+ ap "CRAWL FINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
125
+ @stats.end_crawl(content_request)
126
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
127
+ else
128
+ ap "CRAWL REFINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
129
+ end
115
130
  end
116
131
 
117
132
  # Enqueues the content to the processing queue setup in options
@@ -143,13 +158,13 @@ class CrawlJob
143
158
  # Returns true if the crawl count is within limits
144
159
  def self.within_crawl_limits?(crawl_limit)
145
160
  refresh_counters
146
- crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
147
161
  crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
148
162
  end
149
163
 
150
164
  # Returns true if the queue count is calculated to be still within limits when complete
151
165
  def self.within_queue_limits?(crawl_limit)
152
- @content_request[:crawl_limit_by_page] || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_started_counter) < crawl_limit.to_i)
166
+ refresh_counters
167
+ (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
153
168
  end
154
169
 
155
170
  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
@@ -200,17 +215,11 @@ class CrawlJob
200
215
  end
201
216
 
202
217
  def self.print_counters
203
- puts "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
218
+ puts counters
204
219
  end
205
-
206
- # Sets the crawl counters based on the crawled and queued queues
207
- def self.reset_counters
208
- @redis.set("crawl-started-counter", @redis.smembers("crawled").count)
209
- @redis.set("crawl-counter", @redis.smembers("crawled").count)
210
- @redis.set("queue-counter", @redis.smembers("queued").count)
211
- @crawl_started_counter = @redis.get("crawl-started-counter").to_i
212
- @crawl_counter = @redis.get("crawl-counter").to_i
213
- @queue_counter = @redis.get("queue-counter").to_i
220
+
221
+ def self.counters
222
+ "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
214
223
  end
215
-
224
+
216
225
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.59
4
+ version: 0.0.60
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000 Z
12
+ date: 2012-07-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70204470213880 !ruby/object:Gem::Requirement
16
+ requirement: &70111532832640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70204470213880
24
+ version_requirements: *70111532832640
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70204470212220 !ruby/object:Gem::Requirement
27
+ requirement: &70111532832160 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70204470212220
35
+ version_requirements: *70111532832160
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70204470211500 !ruby/object:Gem::Requirement
38
+ requirement: &70111532831300 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70204470211500
46
+ version_requirements: *70111532831300
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70204470210300 !ruby/object:Gem::Requirement
49
+ requirement: &70111532830260 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70204470210300
57
+ version_requirements: *70111532830260
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70204470208860 !ruby/object:Gem::Requirement
60
+ requirement: &70111532829180 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70204470208860
68
+ version_requirements: *70111532829180
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70204470223880 !ruby/object:Gem::Requirement
71
+ requirement: &70111532844220 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70204470223880
79
+ version_requirements: *70111532844220
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70204470223280 !ruby/object:Gem::Requirement
82
+ requirement: &70111532843480 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70204470223280
90
+ version_requirements: *70111532843480
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70204470222720 !ruby/object:Gem::Requirement
93
+ requirement: &70111532842880 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70204470222720
101
+ version_requirements: *70111532842880
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70204470222160 !ruby/object:Gem::Requirement
104
+ requirement: &70111532842240 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70204470222160
112
+ version_requirements: *70111532842240
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70204470221480 !ruby/object:Gem::Requirement
115
+ requirement: &70111532841500 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70204470221480
123
+ version_requirements: *70111532841500
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface