cobweb 0.0.24 → 0.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/README.textile +4 -4
  2. data/lib/cobweb.rb +1 -1
  3. data/lib/crawl_job.rb +26 -26
  4. metadata +30 -29
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.23
2
+ h1. Cobweb v0.0.25
3
3
 
4
4
  h2. Intro
5
5
 
@@ -61,9 +61,9 @@ Creates a new crawler object based on a base_url
61
61
  ** :quiet - hides default output (Default: false)
62
62
  ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
63
63
  ** :timeout - http timeout for requests (Default: 10)
64
- ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
65
- ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
66
- ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com)
64
+ ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
65
+ ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
66
+ ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
67
67
 
68
68
  bq. crawler = Cobweb.new(:follow_redirects => false)
69
69
 
data/lib/cobweb.rb CHANGED
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.24"
22
+ "0.0.25"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
data/lib/crawl_job.rb CHANGED
@@ -20,6 +20,10 @@ class CrawlJob
20
20
 
21
21
  # check we haven't crawled this url before
22
22
  unless @redis.sismember "crawled", content_request[:url]
23
+ @redis.srem "queued", content_request[:url]
24
+ decrement_queue_counter
25
+ @redis.sadd "crawled", content_request[:url]
26
+ increment_crawl_counter
23
27
 
24
28
  # if there is no limit or we're still under it lets get the url
25
29
  if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
@@ -44,16 +48,31 @@ class CrawlJob
44
48
  enqueue_content(content_request, link)
45
49
  end
46
50
 
47
- # now that we're done, lets update the queues
48
- @redis.srem "queued", content_request[:url]
49
- decrement_queue_counter
50
- @redis.sadd "crawled", content_request[:url]
51
- increment_crawl_counter
52
-
53
51
  # enqueue to processing queue
54
- Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
52
+ Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
55
53
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
56
54
  puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
55
+
56
+
57
+ # if there's nothing left queued or the crawl limit has been reached
58
+ if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
59
+
60
+ puts "queue_counter: #{@queue_counter}"
61
+ puts "crawl_counter: #{@crawl_counter}"
62
+ puts "crawl_limit: #{content_request[:crawl_limit]}"
63
+
64
+ # finished
65
+ puts "FINISHED"
66
+ stats = @redis.hgetall "statistics"
67
+ stats[:total_pages] = @redis.get("total_pages").to_i
68
+ stats[:total_assets] = @redis.get("total_assets").to_i
69
+ stats[:crawl_counter] = @crawl_counter
70
+ stats[:queue_counter] = @queue_counter
71
+ stats[:crawled] = @redis.smembers "crawled"
72
+
73
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
74
+
75
+ end
57
76
 
58
77
  else
59
78
  puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
@@ -62,25 +81,6 @@ class CrawlJob
62
81
  puts "Already crawled #{content_request[:url]}" if content_request[:debug]
63
82
  end
64
83
 
65
- # if the'res nothing left queued or the crawled limit has been reached
66
- if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
67
-
68
- puts "queue_counter: #{@queue_counter}"
69
- puts "crawl_counter: #{@crawl_counter}"
70
- puts "crawl_limit: #{content_request[:crawl_limit]}"
71
-
72
- # finished
73
- puts "FINISHED"
74
- stats = @redis.hgetall "statistics"
75
- stats[:total_pages] = @redis.get "total_pages"
76
- stats[:total_assets] = @redis.get "total_assets"
77
- stats[:crawl_counter] = @redis.get "crawl_counter"
78
- stats[:queue_counter] = @redis.get "queue_counter"
79
- stats[:crawled] = @redis.smembers "crawled"
80
-
81
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
82
-
83
- end
84
84
  end
85
85
 
86
86
  private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.24
4
+ version: 0.0.25
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70268501331520 !ruby/object:Gem::Requirement
16
+ requirement: &70349719636940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70268501331520
24
+ version_requirements: *70349719636940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70268501331100 !ruby/object:Gem::Requirement
27
+ requirement: &70349719636520 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70268501331100
35
+ version_requirements: *70349719636520
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70268501330680 !ruby/object:Gem::Requirement
38
+ requirement: &70349719636100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70268501330680
46
+ version_requirements: *70349719636100
47
47
  - !ruby/object:Gem::Dependency
48
- name: addressable
49
- requirement: &70268501330240 !ruby/object:Gem::Requirement
48
+ name: absolutize
49
+ requirement: &70349719635660 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70268501330240
57
+ version_requirements: *70349719635660
58
58
  - !ruby/object:Gem::Dependency
59
- name: rspec
60
- requirement: &70268501329820 !ruby/object:Gem::Requirement
59
+ name: addressable
60
+ requirement: &70349719635240 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70268501329820
68
+ version_requirements: *70349719635240
69
69
  - !ruby/object:Gem::Dependency
70
- name: awesome_print
71
- requirement: &70268501329400 !ruby/object:Gem::Requirement
70
+ name: rspec
71
+ requirement: &70349719634820 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70268501329400
79
+ version_requirements: *70349719634820
80
80
  - !ruby/object:Gem::Dependency
81
- name: sinatra
82
- requirement: &70268501328980 !ruby/object:Gem::Requirement
81
+ name: awesome_print
82
+ requirement: &70349719634400 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70268501328980
90
+ version_requirements: *70349719634400
91
91
  - !ruby/object:Gem::Dependency
92
- name: thin
93
- requirement: &70268501328560 !ruby/object:Gem::Requirement
92
+ name: sinatra
93
+ requirement: &70349719633980 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70268501328560
101
+ version_requirements: *70349719633980
102
102
  - !ruby/object:Gem::Dependency
103
- name: haml
104
- requirement: &70268501328140 !ruby/object:Gem::Requirement
103
+ name: thin
104
+ requirement: &70349719633560 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70268501328140
112
+ version_requirements: *70349719633560
113
113
  - !ruby/object:Gem::Dependency
114
- name: hashie
115
- requirement: &70268501344080 !ruby/object:Gem::Requirement
114
+ name: haml
115
+ requirement: &70349719633140 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,8 +120,9 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70268501344080
124
- description:
123
+ version_requirements: *70349719633140
124
+ description: Web Crawler that uses resque background job engine to allow you to cluster
125
+ your crawl.
125
126
  email: stewart@rockwellcottage.com
126
127
  executables: []
127
128
  extensions: []