cobweb 0.0.72 → 0.0.73

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.72
+h1. Cobweb v0.0.73
 
 "@cobweb_gem":https://twitter.com/cobweb_gem
 
data/lib/cobweb_crawl_helper.rb CHANGED
@@ -22,9 +22,8 @@ class CobwebCrawlHelper
 
     # set status as cancelled now so that we don't enqueue any further pages
     self.statistics.end_crawl(@data, true)
-    puts "end_crawl: #{self.statistics.get_status}"
+
     if options[:finished_resque_queue]
-      puts "enqueueing finished job..."
 
       additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
       additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
@@ -33,18 +32,23 @@ class CobwebCrawlHelper
       Resque.enqueue(options[:finished_resque_queue], @stats.get_statistics.merge(additional_stats))
     end
 
-    position = 0
-    job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
-    until job_items.empty?
-      puts "Batch: #{position} : #{job_items.count}"
+    counter = 0
+    while(counter < 200) do
+      break if self.statistics.get_status == CANCELLED
+      sleep 1
+      counter += 1
+    end
+    position = Resque.size(options[:queue_name])
+    until position == 0
+      position -= BATCH_SIZE
+      position = 0 if position < 0
+      job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
       job_items.each do |item|
        if item["args"][0]["crawl_id"] == id
          # remove this job from the queue
          Resque.dequeue(CrawlJob, item["args"][0])
        end
      end
-
-      job_items = Resque.peek(options[:queue_name], position+=BATCH_SIZE, BATCH_SIZE)
    end
 
  end
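
The change above does two things when a crawl is destroyed: it first waits up to 200 seconds (in one-second sleeps) for the crawl status to flip to CANCELLED, then drains the crawl's jobs from the Resque queue by peeking from the tail (Resque.size) back towards position 0 instead of scanning forward. A minimal sketch of that drain pattern, outside the gem's classes (the helper name drain_crawl_jobs and the BATCH_SIZE value here are illustrative assumptions, not the gem's API):

require 'resque'

BATCH_SIZE = 200  # assumed value; the real constant lives on CobwebCrawlHelper

# Remove every queued CrawlJob belonging to crawl_id from queue_name.
# Scanning from the tail means that when a dequeue shifts later jobs
# down, they only move towards positions that will still be peeked, so
# nothing is skipped; the old forward position += BATCH_SIZE scan could
# jump over jobs that shifted into already-visited positions.
def drain_crawl_jobs(queue_name, crawl_id)
  position = Resque.size(queue_name)
  until position == 0
    position -= BATCH_SIZE
    position = 0 if position < 0
    Resque.peek(queue_name, position, BATCH_SIZE).each do |item|
      Resque.dequeue(CrawlJob, item["args"][0]) if item["args"][0]["crawl_id"] == crawl_id
    end
  end
end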
data/lib/cobweb_version.rb CHANGED
@@ -3,7 +3,7 @@ class CobwebVersion
 
   # Returns a string of the current version
   def self.version
-    "0.0.72"
+    "0.0.73"
   end
 
 end
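
With the bump in place, the version helper reports the new release:

CobwebVersion.version  # => "0.0.73"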
data/lib/crawl_job.rb CHANGED
@@ -28,84 +28,92 @@ class CrawlJob
     # check we haven't crawled this url before
     unless @redis.sismember "crawled", content_request[:url]
       # if there is no limit or we're still under it lets get the url
-      if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
-        content = Cobweb.new(content_request).get(content_request[:url], content_request)
-        if content_request[:url] == @redis.get("original_base_url")
-          @redis.set("crawled_base_url", content[:base_url])
-        end
-        if is_permitted_type(content)
-          begin
-            @redis.incr "inprogress"
-            # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
-            @redis.srem "queued", content_request[:url]
-            @redis.sadd "crawled", content_request[:url]
-            @redis.srem "queued", content[:url]
-            @redis.sadd "crawled", content[:url]
-            # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
-            if content_request[:crawl_limit_by_page]
-              if content[:mime_type].match("text/html")
+      if within_crawl_limits?(content_request[:crawl_limit])
+        if @crawl.status != CobwebCrawlHelper::CANCELLED
+          content = Cobweb.new(content_request).get(content_request[:url], content_request)
+          if content_request[:url] == @redis.get("original_base_url")
+            @redis.set("crawled_base_url", content[:base_url])
+          end
+          if is_permitted_type(content)
+            begin
+              @redis.incr "inprogress"
+              # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+              @redis.srem "queued", content_request[:url]
+              @redis.sadd "crawled", content_request[:url]
+              @redis.srem "queued", content[:url]
+              @redis.sadd "crawled", content[:url]
+              # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+              if content_request[:crawl_limit_by_page]
+                if content[:mime_type].match("text/html")
+                  increment_crawl_started_counter
+                end
+              else
                 increment_crawl_started_counter
               end
-            else
-              increment_crawl_started_counter
-            end
 
-            ## update statistics
-            @stats.update_status("Crawling #{content_request[:url]}...")
-            @stats.update_statistics(content)
+              ## update statistics
+              @stats.update_status("Crawling #{content_request[:url]}...")
+              @stats.update_statistics(content)
 
-            # set the base url if this is the first page
-            set_base_url @redis, content, content_request
+              # set the base url if this is the first page
+              set_base_url @redis, content, content_request
 
-            @cobweb_links = CobwebLinks.new(content_request)
-            if within_queue_limits?(content_request[:crawl_limit])
-              internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
-              #get rid of duplicate links in the same page.
-              internal_links.uniq!
-              # select the link if its internal
-              internal_links.select! { |link| @cobweb_links.internal?(link) }
+              @cobweb_links = CobwebLinks.new(content_request)
+              if within_queue_limits?(content_request[:crawl_limit])
+                internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
+                #get rid of duplicate links in the same page.
+                internal_links.uniq!
+                # select the link if its internal
+                internal_links.select! { |link| @cobweb_links.internal?(link) }
 
-              # reject the link if we've crawled it or queued it
-              internal_links.reject! { |link| @redis.sismember("crawled", link) }
-              internal_links.reject! { |link| @redis.sismember("queued", link) }
+                # reject the link if we've crawled it or queued it
+                internal_links.reject! { |link| @redis.sismember("crawled", link) }
+                internal_links.reject! { |link| @redis.sismember("queued", link) }
 
-              internal_links.each do |link|
-                puts link
-                puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
-                if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
-                  enqueue_content(content_request, link)
+                internal_links.each do |link|
+                  puts link
+                  puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
+                  if within_queue_limits?(content_request[:crawl_limit])
+                    if @crawl.status != CobwebCrawlHelper::CANCELLED
+                      enqueue_content(content_request, link)
+                    else
+                      puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
+                    end
+                  end
                 end
               end
-            end
 
-            # enqueue to processing queue
-            send_to_processing_queue(content, content_request)
+              # enqueue to processing queue
+              send_to_processing_queue(content, content_request)
 
-            #if the enqueue counter has been requested update that
-            if content_request.has_key?(:enqueue_counter_key)
-              enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
-              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
-              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
-            end
+              #if the enqueue counter has been requested update that
+              if content_request.has_key?(:enqueue_counter_key)
+                enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+                current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+                enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+              end
 
-          ensure
-            @redis.decr "inprogress"
-            #update the queued and crawled lists if we are within the crawl limits.
+            ensure
+              @redis.decr "inprogress"
+              #update the queued and crawled lists if we are within the crawl limits.
 
-            # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
-            # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
-            # stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
-            if content_request[:crawl_limit_by_page]
-              if content[:mime_type].match("text/html")
+              # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+              # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+              # stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
+              if content_request[:crawl_limit_by_page]
+                if content[:mime_type].match("text/html")
+                  increment_crawl_counter
+                end
+              else
                 increment_crawl_counter
               end
-            else
-              increment_crawl_counter
+              puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
             end
-            puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
+          else
+            puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
           end
         else
-          puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+          puts "ignoring #{content_request[:url]} as crawl has been cancelled." if content_request[:debug]
         end
       else
         puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.72
+  version: 0.0.73
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-09-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &70308627743380 !ruby/object:Gem::Requirement
+  requirement: &70226914018080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627743380
+  version_requirements: *70226914018080
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &70308627741840 !ruby/object:Gem::Requirement
+  requirement: &70226914017080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627741840
+  version_requirements: *70226914017080
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70308627741060 !ruby/object:Gem::Requirement
+  requirement: &70226914016400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627741060
+  version_requirements: *70226914016400
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &70308627740200 !ruby/object:Gem::Requirement
+  requirement: &70226914015220 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627740200
+  version_requirements: *70226914015220
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70308627739500 !ruby/object:Gem::Requirement
+  requirement: &70226914014640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627739500
+  version_requirements: *70226914014640
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &70308627739020 !ruby/object:Gem::Requirement
+  requirement: &70226914013860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627739020
+  version_requirements: *70226914013860
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &70308627738100 !ruby/object:Gem::Requirement
+  requirement: &70226914013140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627738100
+  version_requirements: *70226914013140
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &70308627737580 !ruby/object:Gem::Requirement
+  requirement: &70226914012280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627737580
+  version_requirements: *70226914012280
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &70308627737040 !ruby/object:Gem::Requirement
+  requirement: &70226914011460 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627737040
+  version_requirements: *70226914011460
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &70308627736400 !ruby/object:Gem::Requirement
+  requirement: &70226914010720 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *70308627736400
+  version_requirements: *70226914010720
 - !ruby/object:Gem::Dependency
   name: json
-  requirement: &70308627735860 !ruby/object:Gem::Requirement
+  requirement: &70226914010260 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70308627735860
+  version_requirements: *70226914010260
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
   is also a standalone crawler that has a sophisticated statistics monitoring interface