cobweb 0.0.68 → 0.0.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +5 -7
- data/lib/{crawl.rb → cobweb_crawl_helper.rb} +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl_job.rb +10 -7
- data/lib/stats.rb +4 -4
- data/spec/cobweb/cobweb_job_spec.rb +3 -3
- data/spec/cobweb/crawl_spec.rb +5 -5
- metadata +23 -23
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.
+h1. Cobweb v0.0.70
 
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
 
@@ -138,15 +138,13 @@ You can also run within a block and get access to each page as it is being crawl
 bc. statistics = CobwebCrawler.new(:cache => 600).crawl("http://www.pepsico.com") do |page|
 puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
 end
-puts "Finished Crawl
+puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets."
 
+h3. CobwebCrawlHelper
 
+The CobwebCrawlHelper class is a helper class to assist in getting information about a crawl and to perform functions against the crawl
 
-
-
-The crawl class is a helper class to assist in getting information about a crawl and to perform functions against the crawl
-
-bc. crawl = Crawl.new(options)
+bc. crawl = CobwebCrawlHelper.new(options)
 
 * options - the hash of options passed into Cobweb.new (must include a :crawl_id)
 
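The spec changes further down exercise the renamed helper exactly as the README now documents it. A minimal Ruby sketch of that usage, assuming cobweb 0.0.70 is installed and a crawl with the given id exists; the id value itself is a placeholder, not from this diff:

require 'cobweb'

# Build the helper from an options hash that includes :crawl_id
# ("example_crawl_id" is a made-up value).
helper = CobwebCrawlHelper.new(:crawl_id => "example_crawl_id")

puts helper.id       # the crawl's identifier
puts helper.status   # e.g. CobwebCrawlHelper::STARTING, FINISHED or CANCELLED

# destroy cancels the crawl; CrawlJob then stops enqueueing links for it
helper.destroy unless helper.status == CobwebCrawlHelper::FINISHED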
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -14,7 +14,7 @@ class CrawlJob
 # change all hash keys to symbols
 content_request = HashUtil.deep_symbolize_keys(content_request)
 @content_request = content_request
-@crawl =
+@crawl = CobwebCrawlHelper.new(content_request)
 
 content_request[:redis_options] = {} unless content_request.has_key? :redis_options
 content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
@@ -28,13 +28,14 @@ class CrawlJob
 # check we haven't crawled this url before
 unless @redis.sismember "crawled", content_request[:url]
 # if there is no limit or we're still under it lets get the url
-if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status !=
+if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
 content = Cobweb.new(content_request).get(content_request[:url], content_request)
 if content_request[:url] == @redis.get("original_base_url")
 @redis.set("crawled_base_url", content[:base_url])
 end
 if is_permitted_type(content)
 begin
+@redis.incr "inprogress"
 # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
 @redis.srem "queued", content_request[:url]
 @redis.sadd "crawled", content_request[:url]
@@ -70,8 +71,8 @@ class CrawlJob
 
 internal_links.each do |link|
 puts link
-puts "Not enqueuing due to cancelled crawl" if @crawl.status ==
-if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status !=
+puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
+if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
 enqueue_content(content_request, link)
 end
 end
@@ -88,6 +89,7 @@ class CrawlJob
 end
 
 ensure
+@redis.decr "inprogress"
 #update the queued and crawled lists if we are within the crawl limits.
 
 # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
@@ -126,10 +128,10 @@ class CrawlJob
 
 end
 
-# Sets the crawl status to
+# Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
 def self.finished(content_request)
 # finished
-if @crawl.status !=
+if @crawl.status != CobwebCrawlHelper::FINISHED and @crawl.status != CobwebCrawlHelper::CANCELLED && @redis.get("inprogress").to_i==0
 ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
 @stats.end_crawl(content_request)
 
@@ -195,8 +197,9 @@ class CrawlJob
 new_request = content_request.clone
 new_request[:url] = link
 new_request[:parent] = content_request[:url]
-Resque.enqueue
+#to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
 @redis.sadd "queued", link
+Resque.enqueue(CrawlJob, new_request)
 increment_queue_counter
 end
 
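The new "inprogress" counter above is what lets finished detection wait for in-flight jobs: each job increments the counter before processing and decrements it in its ensure block, and the crawl is only treated as finished once the counter is back at zero. A minimal sketch of the same pattern outside the gem, using the plain redis gem; only the key name comes from this diff, the rest is illustrative:

require 'redis'

def process_page(redis, url)
  redis.incr "inprogress"            # mark this job as in flight
  begin
    # ... fetch the page and enqueue its links ...
  ensure
    redis.decr "inprogress"          # always undo the mark, even on error
    if redis.get("inprogress").to_i == 0
      puts "no jobs in flight after #{url}; safe to treat the crawl as finished"
    end
  end
end

process_page(Redis.new, "http://www.pepsico.com/index.html")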
data/lib/stats.rb
CHANGED
@@ -16,16 +16,16 @@ class Stats
 @redis.hset "crawl_details", key, options[key].to_s
 end
 end
-@redis.hset "statistics", "current_status",
+@redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
 end
 
 # Removes the crawl from the running crawls and updates status
 def end_crawl(options, cancelled=false)
 @full_redis.srem "cobweb_crawls", options[:crawl_id]
 if cancelled
-@redis.hset "statistics", "current_status",
+@redis.hset "statistics", "current_status", CobwebCrawlHelper::CANCELLED
 else
-@redis.hset "statistics", "current_status",
+@redis.hset "statistics", "current_status", CobwebCrawlHelper::FINISHED
 end
 @redis.del "crawl_details"
 end
@@ -158,7 +158,7 @@ class Stats
 
 # Sets the current status of the crawl
 def update_status(status)
-#@redis.hset("statistics", "current_status", status) unless status ==
+#@redis.hset("statistics", "current_status", status) unless status == CobwebCrawlHelper::CANCELLED
 end
 
 # Returns the current status of the crawl
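The status values written by Stats now come from CobwebCrawlHelper rather than the old Crawl class. A minimal sketch of the bookkeeping shown above, assuming cobweb is loaded so the constants resolve; the plain redis handle stands in for the gem's namespaced @redis:

require 'redis'
require 'cobweb'

redis = Redis.new

redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
# ... the crawl runs ...
redis.hset "statistics", "current_status", CobwebCrawlHelper::FINISHED

# reading the status back is a plain hget on the same hash
puts redis.hget("statistics", "current_status")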
data/spec/cobweb/cobweb_job_spec.rb
CHANGED
@@ -33,7 +33,7 @@ describe Cobweb, :local_only => true do
 end
 it "should not crawl anything if nothing has started" do
 crawl = @cobweb.start(@base_url)
-crawl_obj =
+crawl_obj = CobwebCrawlHelper.new(crawl)
 crawl_obj.destroy
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
 wait_for_crawl_finished crawl[:crawl_id]
@@ -42,7 +42,7 @@ describe Cobweb, :local_only => true do
 
 it "should not complete the crawl when cancelled" do
 crawl = @cobweb.start(@base_url)
-crawl_obj =
+crawl_obj = CobwebCrawlHelper.new(crawl)
 sleep 6
 crawl_obj.destroy
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -231,7 +231,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
 end
 
 def running?(crawl_id)
-@stat.get_status !=
+@stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
 end
 
 def clear_queues
data/spec/cobweb/crawl_spec.rb
CHANGED
@@ -1,23 +1,23 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
-describe
+describe CobwebCrawlHelper do
 
 # this spec tests the crawl object
 
 describe "initialize" do
 describe "without data" do
 it "should raise an exception" do
-lambda {
+lambda {CobwebCrawlHelper.new}.should raise_exception
 end
 end
 
 describe "with data" do
 before(:each) do
 data = {:crawl_id => "asdf"}
-@crawl =
+@crawl = CobwebCrawlHelper.new(data)
 end
 it "should create a crawl object" do
-@crawl.should be_an_instance_of
+@crawl.should be_an_instance_of CobwebCrawlHelper
 end
 it "should return an id" do
 @crawl.should respond_to "id"
@@ -46,7 +46,7 @@ describe Crawl do
 end
 describe "after called" do
 before(:each) do
-@crawl =
+@crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
 @crawl.destroy
 end
 it "should delete only the crawl specified" do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-version: 0.0.
+version: 0.0.70
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-09-
+date: 2012-09-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: resque
-requirement: &
+requirement: &70248368307060 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368307060
 - !ruby/object:Gem::Dependency
 name: redis
-requirement: &
+requirement: &70248368306020 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368306020
 - !ruby/object:Gem::Dependency
 name: nokogiri
-requirement: &
+requirement: &70248368305360 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368305360
 - !ruby/object:Gem::Dependency
 name: addressable
-requirement: &
+requirement: &70248368304140 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368304140
 - !ruby/object:Gem::Dependency
 name: rspec
-requirement: &
+requirement: &70248368303560 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368303560
 - !ruby/object:Gem::Dependency
 name: awesome_print
-requirement: &
+requirement: &70248368302820 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368302820
 - !ruby/object:Gem::Dependency
 name: sinatra
-requirement: &
+requirement: &70248368302080 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368302080
 - !ruby/object:Gem::Dependency
 name: thin
-requirement: &
+requirement: &70248368301260 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368301260
 - !ruby/object:Gem::Dependency
 name: haml
-requirement: &
+requirement: &70248368300400 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368300400
 - !ruby/object:Gem::Dependency
 name: namespaced_redis
-requirement: &
+requirement: &70248368299680 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
 version: 1.0.2
 type: :runtime
 prerelease: false
-version_requirements: *
+version_requirements: *70248368299680
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
 crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
 is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -310,13 +310,13 @@ files:
 - spec/spec.opts
 - spec/spec_helper.rb
 - lib/cobweb.rb
+- lib/cobweb_crawl_helper.rb
 - lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_links.rb
 - lib/cobweb_process_job.rb
 - lib/cobweb_version.rb
 - lib/content_link_parser.rb
-- lib/crawl.rb
 - lib/crawl_job.rb
 - lib/encoding_safe_process_job.rb
 - lib/hash_util.rb