cobweb 0.0.24 → 0.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +4 -4
- data/lib/cobweb.rb +1 -1
- data/lib/crawl_job.rb +26 -26
- metadata +30 -29
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.24
|
2
|
+
h1. Cobweb v0.0.25
|
3
3
|
|
4
4
|
h2. Intro
|
5
5
|
|
@@ -61,9 +61,9 @@ Creates a new crawler object based on a base_url
|
|
61
61
|
** :quiet - hides default output (Default: false)
|
62
62
|
** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
|
63
63
|
** :timeout - http timeout for requests (Default: 10)
|
64
|
-
** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
|
65
|
-
** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
|
66
|
-
** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com)
|
64
|
+
** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
|
65
|
+
** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
|
66
|
+
** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
|
67
67
|
|
68
68
|
bq. crawler = CobWeb.new(:follow_redirects => false)
|
69
69
|
|
data/lib/cobweb.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -20,6 +20,10 @@ class CrawlJob
|
|
20
20
|
|
21
21
|
# check we haven't crawled this url before
|
22
22
|
unless @redis.sismember "crawled", content_request[:url]
|
23
|
+
@redis.srem "queued", content_request[:url]
|
24
|
+
decrement_queue_counter
|
25
|
+
@redis.sadd "crawled", content_request[:url]
|
26
|
+
increment_crawl_counter
|
23
27
|
|
24
28
|
# if there is no limit or we're still under it lets get the url
|
25
29
|
if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
|
@@ -44,16 +48,31 @@ class CrawlJob
|
|
44
48
|
enqueue_content(content_request, link)
|
45
49
|
end
|
46
50
|
|
47
|
-
# now that we're done, lets update the queues
|
48
|
-
@redis.srem "queued", content_request[:url]
|
49
|
-
decrement_queue_counter
|
50
|
-
@redis.sadd "crawled", content_request[:url]
|
51
|
-
increment_crawl_counter
|
52
|
-
|
53
51
|
# enqueue to processing queue
|
54
|
-
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
|
52
|
+
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
|
55
53
|
puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
|
56
54
|
puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
|
55
|
+
|
56
|
+
|
57
|
+
# if there's nothing left queued or the crawled limit has been reached
|
58
|
+
if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
|
59
|
+
|
60
|
+
puts "queue_counter: #{@queue_counter}"
|
61
|
+
puts "crawl_counter: #{@crawl_counter}"
|
62
|
+
puts "crawl_limit: #{content_request[:crawl_limit]}"
|
63
|
+
|
64
|
+
# finished
|
65
|
+
puts "FINISHED"
|
66
|
+
stats = @redis.hgetall "statistics"
|
67
|
+
stats[:total_pages] = @redis.get("total_pages").to_i
|
68
|
+
stats[:total_assets] = @redis.get("total_assets").to_i
|
69
|
+
stats[:crawl_counter] = @crawl_counter
|
70
|
+
stats[:queue_counter] = @queue_counter
|
71
|
+
stats[:crawled] = @redis.smembers "crawled"
|
72
|
+
|
73
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
74
|
+
|
75
|
+
end
|
57
76
|
|
58
77
|
else
|
59
78
|
puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
|
@@ -62,25 +81,6 @@ class CrawlJob
|
|
62
81
|
puts "Already crawled #{content_request[:url]}" if content_request[:debug]
|
63
82
|
end
|
64
83
|
|
65
|
-
# if there's nothing left queued or the crawled limit has been reached
|
66
|
-
if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
|
67
|
-
|
68
|
-
puts "queue_counter: #{@queue_counter}"
|
69
|
-
puts "crawl_counter: #{@crawl_counter}"
|
70
|
-
puts "crawl_limit: #{content_request[:crawl_limit]}"
|
71
|
-
|
72
|
-
# finished
|
73
|
-
puts "FINISHED"
|
74
|
-
stats = @redis.hgetall "statistics"
|
75
|
-
stats[:total_pages] = @redis.get "total_pages"
|
76
|
-
stats[:total_assets] = @redis.get "total_assets"
|
77
|
-
stats[:crawl_counter] = @redis.get "crawl_counter"
|
78
|
-
stats[:queue_counter] = @redis.get "queue_counter"
|
79
|
-
stats[:crawled] = @redis.smembers "crawled"
|
80
|
-
|
81
|
-
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
82
|
-
|
83
|
-
end
|
84
84
|
end
|
85
85
|
|
86
86
|
private
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.24
|
4
|
+
version: 0.0.25
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70349719636940 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70349719636940
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70349719636520 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70349719636520
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70349719636100 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70349719636100
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
49
|
-
requirement: &
|
48
|
+
name: absolutize
|
49
|
+
requirement: &70349719635660 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70349719635660
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
|
-
name:
|
60
|
-
requirement: &
|
59
|
+
name: addressable
|
60
|
+
requirement: &70349719635240 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70349719635240
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
71
|
-
requirement: &
|
70
|
+
name: rspec
|
71
|
+
requirement: &70349719634820 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70349719634820
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
|
-
name:
|
82
|
-
requirement: &
|
81
|
+
name: awesome_print
|
82
|
+
requirement: &70349719634400 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70349719634400
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
|
-
name:
|
93
|
-
requirement: &
|
92
|
+
name: sinatra
|
93
|
+
requirement: &70349719633980 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70349719633980
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
|
-
name:
|
104
|
-
requirement: &
|
103
|
+
name: thin
|
104
|
+
requirement: &70349719633560 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70349719633560
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
|
-
name:
|
115
|
-
requirement: &
|
114
|
+
name: haml
|
115
|
+
requirement: &70349719633140 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,8 +120,9 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
124
|
-
description:
|
123
|
+
version_requirements: *70349719633140
|
124
|
+
description: Web Crawler that uses resque background job engine to allow you to cluster
|
125
|
+
your crawl.
|
125
126
|
email: stewart@rockwellcottage.com
|
126
127
|
executables: []
|
127
128
|
extensions: []
|