cobweb 0.0.37 → 0.0.38
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +3 -1
- data/lib/cobweb.rb +1 -1
- data/lib/crawl_job.rb +27 -13
- data/lib/stats.rb +20 -0
- data/spec/cobweb/cobweb_job_spec.rb +41 -0
- metadata +23 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.37
|
2
|
+
h1. Cobweb v0.0.38
|
3
3
|
|
4
4
|
h2. Intro
|
5
5
|
|
@@ -98,6 +98,8 @@ bq. crawler.head("http://www.google.com/")
|
|
98
98
|
h3. Contributing/Testing
|
99
99
|
|
100
100
|
Feel free to contribute small or large bits of code, just please make sure that there are rspec tests for the features you're submitting. We also test on travis at http://travis-ci.org/#!/stewartmckee/cobweb if you want to see the state of the project.
|
101
|
+
|
102
|
+
Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
|
101
103
|
|
102
104
|
h2. License
|
103
105
|
|
data/lib/cobweb.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -10,8 +10,9 @@ class CrawlJob
|
|
10
10
|
def self.perform(content_request)
|
11
11
|
|
12
12
|
# change all hash keys to symbols
|
13
|
-
content_request =
|
13
|
+
content_request = self.deep_symbolize_keys(content_request)
|
14
14
|
|
15
|
+
content_request[:redis_options] = {} unless content_request.has_key? :redis_options
|
15
16
|
@redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
|
16
17
|
|
17
18
|
@debug = content_request[:debug]
|
@@ -62,18 +63,12 @@ class CrawlJob
|
|
62
63
|
end
|
63
64
|
|
64
65
|
# if there's nothing left queued or the crawled limit has been reached
|
65
|
-
if
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
stats[:crawl_counter] = @crawl_counter
|
72
|
-
stats[:queue_counter] = @queue_counter
|
73
|
-
stats[:crawled] = @redis.smembers "crawled"
|
74
|
-
|
75
|
-
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
76
|
-
|
66
|
+
if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
|
67
|
+
if @redis.scard("queued") == 0
|
68
|
+
finished(content_request)
|
69
|
+
end
|
70
|
+
elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
|
71
|
+
finished(content_request)
|
77
72
|
end
|
78
73
|
end
|
79
74
|
else
|
@@ -84,6 +79,14 @@ class CrawlJob
|
|
84
79
|
|
85
80
|
end
|
86
81
|
|
82
|
+
def self.finished(content_request)
|
83
|
+
# finished
|
84
|
+
|
85
|
+
Stats.set_totals
|
86
|
+
|
87
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
88
|
+
end
|
89
|
+
|
87
90
|
def self.send_to_processing_queue(content, content_request)
|
88
91
|
content_to_send = content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
|
89
92
|
if content_request[:use_encoding_safe_process_job]
|
@@ -175,4 +178,15 @@ class CrawlJob
|
|
175
178
|
@crawl_counter = @redis.get("crawl-counter").to_i
|
176
179
|
@queue_counter = @redis.get("queue-counter").to_i
|
177
180
|
end
|
181
|
+
def self.deep_symbolize_keys(hash)
|
182
|
+
hash.keys.each do |key|
|
183
|
+
value = hash[key]
|
184
|
+
hash.delete(key)
|
185
|
+
hash[key.to_sym] = value
|
186
|
+
if hash[key.to_sym].instance_of? Hash
|
187
|
+
hash[key.to_sym] = self.deep_symbolize_keys(hash[key.to_sym])
|
188
|
+
end
|
189
|
+
end
|
190
|
+
hash
|
191
|
+
end
|
178
192
|
end
|
data/lib/stats.rb
CHANGED
@@ -5,13 +5,32 @@ class Stats < Sinatra::Base
|
|
5
5
|
|
6
6
|
def self.update_statistics(statistics)
|
7
7
|
@@statistics = statistics
|
8
|
+
@@statistics
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.get_statistics
|
12
|
+
@@statistics ||= {}
|
8
13
|
end
|
9
14
|
|
10
15
|
def self.update_status(status)
|
11
16
|
@@status = status
|
12
17
|
end
|
13
18
|
|
19
|
+
def self.set_totals
|
20
|
+
stats = @redis.hgetall "statistics"
|
21
|
+
stats[:total_pages] = @redis.get("total_pages").to_i
|
22
|
+
stats[:total_assets] = @redis.get("total_assets").to_i
|
23
|
+
stats[:crawl_counter] = @crawl_counter
|
24
|
+
stats[:queue_counter] = @queue_counter
|
25
|
+
stats[:crawled] = @redis.smembers "crawled"
|
26
|
+
|
27
|
+
Stats.update_statistics(stats)
|
28
|
+
end
|
29
|
+
|
14
30
|
def self.set_statistics_in_redis(redis, content)
|
31
|
+
|
32
|
+
@redis = redis
|
33
|
+
|
15
34
|
crawl_counter = redis.get("crawl-counter").to_i
|
16
35
|
queue_counter = redis.get("queue-counter").to_i
|
17
36
|
|
@@ -62,6 +81,7 @@ class Stats < Sinatra::Base
|
|
62
81
|
end
|
63
82
|
redis.hset "statistics", "status_counts", status_counts.to_json
|
64
83
|
|
84
|
+
@@statistics = @redis.hgetall "statistics"
|
65
85
|
end
|
66
86
|
|
67
87
|
set :views, settings.root + '/../views'
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe Cobweb do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@base_url = "http://www.baseurl.com/"
|
7
|
+
@cobweb = Cobweb.new :quiet => true, :cache => nil
|
8
|
+
end
|
9
|
+
|
10
|
+
describe "detect finish of crawl" do
|
11
|
+
|
12
|
+
describe "with no crawl limit" do
|
13
|
+
before(:each) do
|
14
|
+
@request = {
|
15
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
16
|
+
:url => @base_url,
|
17
|
+
:crawl_limit => nil
|
18
|
+
}
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should not limit crawl"
|
22
|
+
it "should detect the end or crawl"
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "with a crawl limit" do
|
26
|
+
@request = {
|
27
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
28
|
+
:url => @base_url,
|
29
|
+
:crawl_limit => 3
|
30
|
+
}
|
31
|
+
|
32
|
+
it "should limit crawl"
|
33
|
+
it "should detect the end or crawl based on limit"
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.37
|
4
|
+
version: 0.0.38
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70264627738060 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70264627738060
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70264627737640 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70264627737640
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70264627737220 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70264627737220
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70264627736800 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70264627736800
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70264627736380 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70264627736380
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70264627735960 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70264627735960
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70264627735540 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70264627735540
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70264627735120 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70264627735120
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70264627734700 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70264627734700
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70264627734280 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70264627734280
|
124
124
|
description: Web Crawler that uses resque background job engine to allow you to cluster
|
125
125
|
your crawl.
|
126
126
|
email: stewart@rockwellcottage.com
|
@@ -130,6 +130,7 @@ extra_rdoc_files:
|
|
130
130
|
- README.textile
|
131
131
|
files:
|
132
132
|
- spec/cobweb/cobweb_crawler_spec.rb
|
133
|
+
- spec/cobweb/cobweb_job_spec.rb
|
133
134
|
- spec/cobweb/cobweb_spec.rb
|
134
135
|
- spec/cobweb/content_link_parser_spec.rb
|
135
136
|
- spec/samples/sample_html_links.html
|