cobweb 0.0.72 → 0.0.73

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.72
2
+ h1. Cobweb v0.0.73
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -22,9 +22,8 @@ class CobwebCrawlHelper
22
22
 
23
23
  # set status as cancelled now so that we don't enqueue any further pages
24
24
  self.statistics.end_crawl(@data, true)
25
- puts "end_crawl: #{self.statistics.get_status}"
25
+
26
26
  if options[:finished_resque_queue]
27
- puts "enqueueing finished job..."
28
27
 
29
28
  additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
30
29
  additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
@@ -33,18 +32,23 @@ class CobwebCrawlHelper
33
32
  Resque.enqueue(options[:finished_resque_queue], @stats.get_statistics.merge(additional_stats))
34
33
  end
35
34
 
36
- position = 0
37
- job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
38
- until job_items.empty?
39
- puts "Batch: #{position} : #{job_items.count}"
35
+ counter = 0
36
+ while(counter < 200) do
37
+ break if self.statistics.get_status == CANCELLED
38
+ sleep 1
39
+ counter += 1
40
+ end
41
+ position = Resque.size(options[:queue_name])
42
+ until position == 0
43
+ position-=BATCH_SIZE
44
+ position = 0 if position < 0
45
+ job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
40
46
  job_items.each do |item|
41
47
  if item["args"][0]["crawl_id"] == id
42
48
  # remove this job from the queue
43
49
  Resque.dequeue(CrawlJob, item["args"][0])
44
50
  end
45
51
  end
46
-
47
- job_items = Resque.peek(options[:queue_name], position+=BATCH_SIZE, BATCH_SIZE)
48
52
  end
49
53
 
50
54
  end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.72"
6
+ "0.0.73"
7
7
  end
8
8
 
9
9
  end
data/lib/crawl_job.rb CHANGED
@@ -28,84 +28,92 @@ class CrawlJob
28
28
  # check we haven't crawled this url before
29
29
  unless @redis.sismember "crawled", content_request[:url]
30
30
  # if there is no limit or we're still under it lets get the url
31
- if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
32
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
33
- if content_request[:url] == @redis.get("original_base_url")
34
- @redis.set("crawled_base_url", content[:base_url])
35
- end
36
- if is_permitted_type(content)
37
- begin
38
- @redis.incr "inprogress"
39
- # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
40
- @redis.srem "queued", content_request[:url]
41
- @redis.sadd "crawled", content_request[:url]
42
- @redis.srem "queued", content[:url]
43
- @redis.sadd "crawled", content[:url]
44
- # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
45
- if content_request[:crawl_limit_by_page]
46
- if content[:mime_type].match("text/html")
31
+ if within_crawl_limits?(content_request[:crawl_limit])
32
+ if @crawl.status != CobwebCrawlHelper::CANCELLED
33
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
34
+ if content_request[:url] == @redis.get("original_base_url")
35
+ @redis.set("crawled_base_url", content[:base_url])
36
+ end
37
+ if is_permitted_type(content)
38
+ begin
39
+ @redis.incr "inprogress"
40
+ # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
41
+ @redis.srem "queued", content_request[:url]
42
+ @redis.sadd "crawled", content_request[:url]
43
+ @redis.srem "queued", content[:url]
44
+ @redis.sadd "crawled", content[:url]
45
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
46
+ if content_request[:crawl_limit_by_page]
47
+ if content[:mime_type].match("text/html")
48
+ increment_crawl_started_counter
49
+ end
50
+ else
47
51
  increment_crawl_started_counter
48
52
  end
49
- else
50
- increment_crawl_started_counter
51
- end
52
53
 
53
- ## update statistics
54
- @stats.update_status("Crawling #{content_request[:url]}...")
55
- @stats.update_statistics(content)
54
+ ## update statistics
55
+ @stats.update_status("Crawling #{content_request[:url]}...")
56
+ @stats.update_statistics(content)
56
57
 
57
- # set the base url if this is the first page
58
- set_base_url @redis, content, content_request
58
+ # set the base url if this is the first page
59
+ set_base_url @redis, content, content_request
59
60
 
60
- @cobweb_links = CobwebLinks.new(content_request)
61
- if within_queue_limits?(content_request[:crawl_limit])
62
- internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
63
- #get rid of duplicate links in the same page.
64
- internal_links.uniq!
65
- # select the link if its internal
66
- internal_links.select! { |link| @cobweb_links.internal?(link) }
61
+ @cobweb_links = CobwebLinks.new(content_request)
62
+ if within_queue_limits?(content_request[:crawl_limit])
63
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
64
+ #get rid of duplicate links in the same page.
65
+ internal_links.uniq!
66
+ # select the link if its internal
67
+ internal_links.select! { |link| @cobweb_links.internal?(link) }
67
68
 
68
- # reject the link if we've crawled it or queued it
69
- internal_links.reject! { |link| @redis.sismember("crawled", link) }
70
- internal_links.reject! { |link| @redis.sismember("queued", link) }
69
+ # reject the link if we've crawled it or queued it
70
+ internal_links.reject! { |link| @redis.sismember("crawled", link) }
71
+ internal_links.reject! { |link| @redis.sismember("queued", link) }
71
72
 
72
- internal_links.each do |link|
73
- puts link
74
- puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
75
- if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
76
- enqueue_content(content_request, link)
73
+ internal_links.each do |link|
74
+ puts link
75
+ puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
76
+ if within_queue_limits?(content_request[:crawl_limit])
77
+ if @crawl.status != CobwebCrawlHelper::CANCELLED
78
+ enqueue_content(content_request, link)
79
+ else
80
+ puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
81
+ end
82
+ end
77
83
  end
78
84
  end
79
- end
80
85
 
81
- # enqueue to processing queue
82
- send_to_processing_queue(content, content_request)
86
+ # enqueue to processing queue
87
+ send_to_processing_queue(content, content_request)
83
88
 
84
- #if the enqueue counter has been requested update that
85
- if content_request.has_key?(:enqueue_counter_key)
86
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
87
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
88
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
89
- end
89
+ #if the enqueue counter has been requested update that
90
+ if content_request.has_key?(:enqueue_counter_key)
91
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
92
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
93
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
94
+ end
90
95
 
91
- ensure
92
- @redis.decr "inprogress"
93
- #update the queued and crawled lists if we are within the crawl limits.
96
+ ensure
97
+ @redis.decr "inprogress"
98
+ #update the queued and crawled lists if we are within the crawl limits.
94
99
 
95
- # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
96
- # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
97
- # stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
98
- if content_request[:crawl_limit_by_page]
99
- if content[:mime_type].match("text/html")
100
+ # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
101
+ # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
102
+ # stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
103
+ if content_request[:crawl_limit_by_page]
104
+ if content[:mime_type].match("text/html")
105
+ increment_crawl_counter
106
+ end
107
+ else
100
108
  increment_crawl_counter
101
109
  end
102
- else
103
- increment_crawl_counter
110
+ puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
104
111
  end
105
- puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
112
+ else
113
+ puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
106
114
  end
107
115
  else
108
- puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
116
+ puts "ignoring #{content_request[:url]} as crawl has been cancelled." if content_request[:debug]
109
117
  end
110
118
  else
111
119
  puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.72
4
+ version: 0.0.73
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-09-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70308627743380 !ruby/object:Gem::Requirement
16
+ requirement: &70226914018080 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70308627743380
24
+ version_requirements: *70226914018080
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70308627741840 !ruby/object:Gem::Requirement
27
+ requirement: &70226914017080 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70308627741840
35
+ version_requirements: *70226914017080
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70308627741060 !ruby/object:Gem::Requirement
38
+ requirement: &70226914016400 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70308627741060
46
+ version_requirements: *70226914016400
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70308627740200 !ruby/object:Gem::Requirement
49
+ requirement: &70226914015220 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70308627740200
57
+ version_requirements: *70226914015220
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70308627739500 !ruby/object:Gem::Requirement
60
+ requirement: &70226914014640 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70308627739500
68
+ version_requirements: *70226914014640
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70308627739020 !ruby/object:Gem::Requirement
71
+ requirement: &70226914013860 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70308627739020
79
+ version_requirements: *70226914013860
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70308627738100 !ruby/object:Gem::Requirement
82
+ requirement: &70226914013140 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70308627738100
90
+ version_requirements: *70226914013140
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70308627737580 !ruby/object:Gem::Requirement
93
+ requirement: &70226914012280 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70308627737580
101
+ version_requirements: *70226914012280
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70308627737040 !ruby/object:Gem::Requirement
104
+ requirement: &70226914011460 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70308627737040
112
+ version_requirements: *70226914011460
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70308627736400 !ruby/object:Gem::Requirement
115
+ requirement: &70226914010720 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70308627736400
123
+ version_requirements: *70226914010720
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70308627735860 !ruby/object:Gem::Requirement
126
+ requirement: &70226914010260 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70308627735860
134
+ version_requirements: *70226914010260
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface