cobweb 0.0.59 → 0.0.60

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.59
2
+ h1. Cobweb v0.0.60
3
3
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
4
4
 
5
5
  h2. Intro
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.59"
6
+ "0.0.60"
7
7
  end
8
8
 
9
9
  end
@@ -34,57 +34,67 @@ class CrawlJob
34
34
  if is_permitted_type(content)
35
35
  # if there is no limit or we're still under it lets get the url
36
36
  if within_crawl_limits?(content_request[:crawl_limit])
37
- #update the queued and crawled lists if we are within the crawl limits.
38
- @redis.srem "queued", content_request[:url]
39
- @redis.sadd "crawled", content_request[:url]
40
- # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
41
- if content_request[:crawl_limit_by_page]
42
- if content[:mime_type].match("text/html")
43
- increment_crawl_counter
37
+ begin
38
+ @redis.srem "queued", content_request[:url]
39
+ @redis.sadd "crawled", content_request[:url]
40
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
41
+ if content_request[:crawl_limit_by_page]
42
+ if content[:mime_type].match("text/html")
43
+ increment_crawl_started_counter
44
+ end
45
+ else
44
46
  increment_crawl_started_counter
45
47
  end
46
- else
47
- increment_crawl_counter
48
- increment_crawl_started_counter
49
- end
50
-
51
- ## update statistics
52
- @stats.update_status("Crawling #{content_request[:url]}...")
53
- @stats.update_statistics(content)
54
-
55
- # set the base url if this is the first page
56
- set_base_url @redis, content, content_request
57
-
58
- @cobweb_links = CobwebLinks.new(content_request)
59
- if within_queue_limits?(content_request[:crawl_limit])
60
- internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
61
-
62
- # select the link if its internal
63
- internal_links.select!{|link| @cobweb_links.internal?(link)}
64
-
65
- # reject the link if we've crawled it or queued it
66
- internal_links.reject!{|link| @redis.sismember("crawled", link)}
67
- internal_links.reject!{|link| @redis.sismember("queued", link)}
68
-
69
- internal_links.each do |link|
70
- enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
48
+
49
+ ## update statistics
50
+ @stats.update_status("Crawling #{content_request[:url]}...")
51
+ @stats.update_statistics(content)
52
+
53
+ # set the base url if this is the first page
54
+ set_base_url @redis, content, content_request
55
+
56
+ @cobweb_links = CobwebLinks.new(content_request)
57
+ if within_queue_limits?(content_request[:crawl_limit])
58
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
59
+
60
+ # select the link if its internal
61
+ internal_links.select! { |link| @cobweb_links.internal?(link) }
62
+
63
+ # reject the link if we've crawled it or queued it
64
+ internal_links.reject! { |link| @redis.sismember("crawled", link) }
65
+ internal_links.reject! { |link| @redis.sismember("queued", link) }
66
+
67
+ internal_links.each do |link|
68
+ enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
69
+ end
71
70
  end
72
- end
73
-
74
- # enqueue to processing queue
75
- send_to_processing_queue(content, content_request)
76
-
77
- #if the enqueue counter has been requested update that
78
- if content_request.has_key? :enqueue_counter_key
79
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
80
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
81
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
82
- end
83
71
 
84
- # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
85
- # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
86
- #increment_crawl_counter
87
- puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
72
+ # enqueue to processing queue
73
+ send_to_processing_queue(content, content_request)
74
+
75
+ #if the enqueue counter has been requested update that
76
+ if content_request.has_key? :enqueue_counter_key
77
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
78
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
79
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
80
+ end
81
+
82
+ ensure
83
+ #update the queued and crawled lists if we are within the crawl limits.
84
+
85
+ # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
86
+ # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
87
+ if content_request[:crawl_limit_by_page]
88
+ if content[:mime_type].match("text/html")
89
+ increment_crawl_counter
90
+ end
91
+ else
92
+ increment_crawl_counter
93
+ end
94
+ puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
95
+ end
96
+ else
97
+ puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
88
98
  end
89
99
  else
90
100
  puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
@@ -96,11 +106,12 @@ class CrawlJob
96
106
  end
97
107
 
98
108
  # if there's nothing left queued or the crawled limit has been reached
109
+ refresh_counters
99
110
  if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
100
- if @queue_counter == 0
111
+ if @queue_counter+@crawl_started_counter-@crawl_counter == 0
101
112
  finished(content_request)
102
113
  end
103
- elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
114
+ elsif (@queue_counter +@crawl_started_counter-@crawl_counter)== 0 || @crawl_counter >= content_request[:crawl_limit].to_i
104
115
  finished(content_request)
105
116
  end
106
117
 
@@ -109,9 +120,13 @@ class CrawlJob
109
120
  # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
110
121
  def self.finished(content_request)
111
122
  # finished
112
- ap "FINISHED"
113
- @stats.end_crawl(content_request)
114
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
123
+ if @redis.hget("statistics", "current_status")!= "Crawl Stopped"
124
+ ap "CRAWL FINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
125
+ @stats.end_crawl(content_request)
126
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
127
+ else
128
+ ap "CRAWL REFINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
129
+ end
115
130
  end
116
131
 
117
132
  # Enqueues the content to the processing queue setup in options
@@ -143,13 +158,13 @@ class CrawlJob
143
158
  # Returns true if the crawl count is within limits
144
159
  def self.within_crawl_limits?(crawl_limit)
145
160
  refresh_counters
146
- crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
147
161
  crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
148
162
  end
149
163
 
150
164
  # Returns true if the queue count is calculated to be still within limits when complete
151
165
  def self.within_queue_limits?(crawl_limit)
152
- @content_request[:crawl_limit_by_page] || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_started_counter) < crawl_limit.to_i)
166
+ refresh_counters
167
+ (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
153
168
  end
154
169
 
155
170
  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
@@ -200,17 +215,11 @@ class CrawlJob
200
215
  end
201
216
 
202
217
  def self.print_counters
203
- puts "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
218
+ puts counters
204
219
  end
205
-
206
- # Sets the crawl counters based on the crawled and queued queues
207
- def self.reset_counters
208
- @redis.set("crawl-started-counter", @redis.smembers("crawled").count)
209
- @redis.set("crawl-counter", @redis.smembers("crawled").count)
210
- @redis.set("queue-counter", @redis.smembers("queued").count)
211
- @crawl_started_counter = @redis.get("crawl-started-counter").to_i
212
- @crawl_counter = @redis.get("crawl-counter").to_i
213
- @queue_counter = @redis.get("queue-counter").to_i
220
+
221
+ def self.counters
222
+ "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
214
223
  end
215
-
224
+
216
225
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.59
4
+ version: 0.0.60
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000 Z
12
+ date: 2012-07-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70204470213880 !ruby/object:Gem::Requirement
16
+ requirement: &70111532832640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70204470213880
24
+ version_requirements: *70111532832640
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70204470212220 !ruby/object:Gem::Requirement
27
+ requirement: &70111532832160 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70204470212220
35
+ version_requirements: *70111532832160
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70204470211500 !ruby/object:Gem::Requirement
38
+ requirement: &70111532831300 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70204470211500
46
+ version_requirements: *70111532831300
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70204470210300 !ruby/object:Gem::Requirement
49
+ requirement: &70111532830260 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70204470210300
57
+ version_requirements: *70111532830260
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70204470208860 !ruby/object:Gem::Requirement
60
+ requirement: &70111532829180 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70204470208860
68
+ version_requirements: *70111532829180
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70204470223880 !ruby/object:Gem::Requirement
71
+ requirement: &70111532844220 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70204470223880
79
+ version_requirements: *70111532844220
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70204470223280 !ruby/object:Gem::Requirement
82
+ requirement: &70111532843480 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70204470223280
90
+ version_requirements: *70111532843480
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70204470222720 !ruby/object:Gem::Requirement
93
+ requirement: &70111532842880 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70204470222720
101
+ version_requirements: *70111532842880
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70204470222160 !ruby/object:Gem::Requirement
104
+ requirement: &70111532842240 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70204470222160
112
+ version_requirements: *70111532842240
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70204470221480 !ruby/object:Gem::Requirement
115
+ requirement: &70111532841500 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70204470221480
123
+ version_requirements: *70111532841500
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface