cobweb 0.0.24 → 0.0.25

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. data/README.textile +4 -4
  2. data/lib/cobweb.rb +1 -1
  3. data/lib/crawl_job.rb +26 -26
  4. metadata +30 -29
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.23
2
+ h1. Cobweb v0.0.25
3
3
 
4
4
  h2. Intro
5
5
 
@@ -61,9 +61,9 @@ Creates a new crawler object based on a base_url
61
61
  ** :quiet - hides default output (Default: false)
62
62
  ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
63
63
  ** :timeout - http timeout for requests (Default: 10)
64
- ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
65
- ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
66
- ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com)
64
+ ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
65
+ ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
66
+ ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
67
67
 
68
68
  bq. crawler = CobWeb.new(:follow_redirects => false)
69
69
 
data/lib/cobweb.rb CHANGED
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.24"
22
+ "0.0.25"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
data/lib/crawl_job.rb CHANGED
@@ -20,6 +20,10 @@ class CrawlJob
20
20
 
21
21
  # check we haven't crawled this url before
22
22
  unless @redis.sismember "crawled", content_request[:url]
23
+ @redis.srem "queued", content_request[:url]
24
+ decrement_queue_counter
25
+ @redis.sadd "crawled", content_request[:url]
26
+ increment_crawl_counter
23
27
 
24
28
  # if there is no limit or we're still under it lets get the url
25
29
  if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
@@ -44,16 +48,31 @@ class CrawlJob
44
48
  enqueue_content(content_request, link)
45
49
  end
46
50
 
47
- # now that we're done, lets update the queues
48
- @redis.srem "queued", content_request[:url]
49
- decrement_queue_counter
50
- @redis.sadd "crawled", content_request[:url]
51
- increment_crawl_counter
52
-
53
51
  # enqueue to processing queue
54
- Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
52
+ Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
55
53
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
56
54
  puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
55
+
56
+
57
+ # if there's nothing left queued or the crawled limit has been reached
58
+ if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
59
+
60
+ puts "queue_counter: #{@queue_counter}"
61
+ puts "crawl_counter: #{@crawl_counter}"
62
+ puts "crawl_limit: #{content_request[:crawl_limit]}"
63
+
64
+ # finished
65
+ puts "FINISHED"
66
+ stats = @redis.hgetall "statistics"
67
+ stats[:total_pages] = @redis.get("total_pages").to_i
68
+ stats[:total_assets] = @redis.get("total_assets").to_i
69
+ stats[:crawl_counter] = @crawl_counter
70
+ stats[:queue_counter] = @queue_counter
71
+ stats[:crawled] = @redis.smembers "crawled"
72
+
73
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
74
+
75
+ end
57
76
 
58
77
  else
59
78
  puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
@@ -62,25 +81,6 @@ class CrawlJob
62
81
  puts "Already crawled #{content_request[:url]}" if content_request[:debug]
63
82
  end
64
83
 
65
- # if there's nothing left queued or the crawled limit has been reached
66
- if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
67
-
68
- puts "queue_counter: #{@queue_counter}"
69
- puts "crawl_counter: #{@crawl_counter}"
70
- puts "crawl_limit: #{content_request[:crawl_limit]}"
71
-
72
- # finished
73
- puts "FINISHED"
74
- stats = @redis.hgetall "statistics"
75
- stats[:total_pages] = @redis.get "total_pages"
76
- stats[:total_assets] = @redis.get "total_assets"
77
- stats[:crawl_counter] = @redis.get "crawl_counter"
78
- stats[:queue_counter] = @redis.get "queue_counter"
79
- stats[:crawled] = @redis.smembers "crawled"
80
-
81
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
82
-
83
- end
84
84
  end
85
85
 
86
86
  private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.24
4
+ version: 0.0.25
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70268501331520 !ruby/object:Gem::Requirement
16
+ requirement: &70349719636940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70268501331520
24
+ version_requirements: *70349719636940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70268501331100 !ruby/object:Gem::Requirement
27
+ requirement: &70349719636520 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70268501331100
35
+ version_requirements: *70349719636520
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70268501330680 !ruby/object:Gem::Requirement
38
+ requirement: &70349719636100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70268501330680
46
+ version_requirements: *70349719636100
47
47
  - !ruby/object:Gem::Dependency
48
- name: addressable
49
- requirement: &70268501330240 !ruby/object:Gem::Requirement
48
+ name: absolutize
49
+ requirement: &70349719635660 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70268501330240
57
+ version_requirements: *70349719635660
58
58
  - !ruby/object:Gem::Dependency
59
- name: rspec
60
- requirement: &70268501329820 !ruby/object:Gem::Requirement
59
+ name: addressable
60
+ requirement: &70349719635240 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70268501329820
68
+ version_requirements: *70349719635240
69
69
  - !ruby/object:Gem::Dependency
70
- name: awesome_print
71
- requirement: &70268501329400 !ruby/object:Gem::Requirement
70
+ name: rspec
71
+ requirement: &70349719634820 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70268501329400
79
+ version_requirements: *70349719634820
80
80
  - !ruby/object:Gem::Dependency
81
- name: sinatra
82
- requirement: &70268501328980 !ruby/object:Gem::Requirement
81
+ name: awesome_print
82
+ requirement: &70349719634400 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70268501328980
90
+ version_requirements: *70349719634400
91
91
  - !ruby/object:Gem::Dependency
92
- name: thin
93
- requirement: &70268501328560 !ruby/object:Gem::Requirement
92
+ name: sinatra
93
+ requirement: &70349719633980 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70268501328560
101
+ version_requirements: *70349719633980
102
102
  - !ruby/object:Gem::Dependency
103
- name: haml
104
- requirement: &70268501328140 !ruby/object:Gem::Requirement
103
+ name: thin
104
+ requirement: &70349719633560 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70268501328140
112
+ version_requirements: *70349719633560
113
113
  - !ruby/object:Gem::Dependency
114
- name: hashie
115
- requirement: &70268501344080 !ruby/object:Gem::Requirement
114
+ name: haml
115
+ requirement: &70349719633140 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,8 +120,9 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70268501344080
124
- description:
123
+ version_requirements: *70349719633140
124
+ description: Web Crawler that uses resque background job engine to allow you to cluster
125
+ your crawl.
125
126
  email: stewart@rockwellcottage.com
126
127
  executables: []
127
128
  extensions: []