cobweb 0.0.68 → 0.0.70

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.68
2
+ h1. Cobweb v0.0.70
3
3
 
4
4
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
5
5
 
@@ -138,15 +138,13 @@ You can also run within a block and get access to each page as it is being crawl
138
138
  bc. statistics = CobwebCrawler.new(:cache => 600).crawl("http://www.pepsico.com") do |page|
139
139
  puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
140
140
  end
141
- puts "Finished Crawl in "
141
+ puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets."
142
142
 
143
+ h3. CobwebCrawlHelper
143
144
 
145
+ The CobwebCrawlHelper class is a helper class to assist in getting information about a crawl and to perform functions against the crawl.
144
146
 
145
- h3. Crawl
146
-
147
- The crawl class is a helper class to assist in getting information about a crawl and to perform functions against the crawl
148
-
149
- bc. crawl = Crawl.new(options)
147
+ bc. crawl = CobwebCrawlHelper.new(options)
150
148
 
151
149
  * options - the hash of options passed into Cobweb.new (must include a :crawl_id)
152
150
 
@@ -1,5 +1,5 @@
1
1
  # The crawl class gives easy access to information about the crawl, and gives the ability to stop a crawl
2
- class Crawl
2
+ class CobwebCrawlHelper
3
3
 
4
4
  attr_accessor :id
5
5
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.68"
6
+ "0.0.70"
7
7
  end
8
8
 
9
9
  end
@@ -14,7 +14,7 @@ class CrawlJob
14
14
  # change all hash keys to symbols
15
15
  content_request = HashUtil.deep_symbolize_keys(content_request)
16
16
  @content_request = content_request
17
- @crawl = Crawl.new(content_request)
17
+ @crawl = CobwebCrawlHelper.new(content_request)
18
18
 
19
19
  content_request[:redis_options] = {} unless content_request.has_key? :redis_options
20
20
  content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
@@ -28,13 +28,14 @@ class CrawlJob
28
28
  # check we haven't crawled this url before
29
29
  unless @redis.sismember "crawled", content_request[:url]
30
30
  # if there is no limit or we're still under it lets get the url
31
- if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != Crawl::CANCELLED
31
+ if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
32
32
  content = Cobweb.new(content_request).get(content_request[:url], content_request)
33
33
  if content_request[:url] == @redis.get("original_base_url")
34
34
  @redis.set("crawled_base_url", content[:base_url])
35
35
  end
36
36
  if is_permitted_type(content)
37
37
  begin
38
+ @redis.incr "inprogress"
38
39
  # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
39
40
  @redis.srem "queued", content_request[:url]
40
41
  @redis.sadd "crawled", content_request[:url]
@@ -70,8 +71,8 @@ class CrawlJob
70
71
 
71
72
  internal_links.each do |link|
72
73
  puts link
73
- puts "Not enqueuing due to cancelled crawl" if @crawl.status == Crawl::CANCELLED
74
- if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != Crawl::CANCELLED
74
+ puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
75
+ if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
75
76
  enqueue_content(content_request, link)
76
77
  end
77
78
  end
@@ -88,6 +89,7 @@ class CrawlJob
88
89
  end
89
90
 
90
91
  ensure
92
+ @redis.decr "inprogress"
91
93
  #update the queued and crawled lists if we are within the crawl limits.
92
94
 
93
95
  # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
@@ -126,10 +128,10 @@ class CrawlJob
126
128
 
127
129
  end
128
130
 
129
- # Sets the crawl status to Crawl::FINISHED and enqueues the crawl finished job
131
+ # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
130
132
  def self.finished(content_request)
131
133
  # finished
132
- if @crawl.status != Crawl::FINISHED and @crawl.status != Crawl::CANCELLED
134
+ if @crawl.status != CobwebCrawlHelper::FINISHED and @crawl.status != CobwebCrawlHelper::CANCELLED && @redis.get("inprogress").to_i==0
133
135
  ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
134
136
  @stats.end_crawl(content_request)
135
137
 
@@ -195,8 +197,9 @@ class CrawlJob
195
197
  new_request = content_request.clone
196
198
  new_request[:url] = link
197
199
  new_request[:parent] = content_request[:url]
198
- Resque.enqueue(CrawlJob, new_request)
200
+ #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
199
201
  @redis.sadd "queued", link
202
+ Resque.enqueue(CrawlJob, new_request)
200
203
  increment_queue_counter
201
204
  end
202
205
 
@@ -16,16 +16,16 @@ class Stats
16
16
  @redis.hset "crawl_details", key, options[key].to_s
17
17
  end
18
18
  end
19
- @redis.hset "statistics", "current_status", Crawl::STARTING
19
+ @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
20
20
  end
21
21
 
22
22
  # Removes the crawl from the running crawls and updates status
23
23
  def end_crawl(options, cancelled=false)
24
24
  @full_redis.srem "cobweb_crawls", options[:crawl_id]
25
25
  if cancelled
26
- @redis.hset "statistics", "current_status", Crawl::CANCELLED
26
+ @redis.hset "statistics", "current_status", CobwebCrawlHelper::CANCELLED
27
27
  else
28
- @redis.hset "statistics", "current_status", Crawl::FINISHED
28
+ @redis.hset "statistics", "current_status", CobwebCrawlHelper::FINISHED
29
29
  end
30
30
  @redis.del "crawl_details"
31
31
  end
@@ -158,7 +158,7 @@ class Stats
158
158
 
159
159
  # Sets the current status of the crawl
160
160
  def update_status(status)
161
- #@redis.hset("statistics", "current_status", status) unless status == Crawl::CANCELLED
161
+ #@redis.hset("statistics", "current_status", status) unless status == CobwebCrawlHelper::CANCELLED
162
162
  end
163
163
 
164
164
  # Returns the current status of the crawl
@@ -33,7 +33,7 @@ describe Cobweb, :local_only => true do
33
33
  end
34
34
  it "should not crawl anything if nothing has started" do
35
35
  crawl = @cobweb.start(@base_url)
36
- crawl_obj = Crawl.new(crawl)
36
+ crawl_obj = CobwebCrawlHelper.new(crawl)
37
37
  crawl_obj.destroy
38
38
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
39
39
  wait_for_crawl_finished crawl[:crawl_id]
@@ -42,7 +42,7 @@ describe Cobweb, :local_only => true do
42
42
 
43
43
  it "should not complete the crawl when cancelled" do
44
44
  crawl = @cobweb.start(@base_url)
45
- crawl_obj = Crawl.new(crawl)
45
+ crawl_obj = CobwebCrawlHelper.new(crawl)
46
46
  sleep 6
47
47
  crawl_obj.destroy
48
48
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -231,7 +231,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
231
231
  end
232
232
 
233
233
  def running?(crawl_id)
234
- @stat.get_status != Crawl::FINISHED and @stat.get_status != Crawl::CANCELLED
234
+ @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
235
235
  end
236
236
 
237
237
  def clear_queues
@@ -1,23 +1,23 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
2
 
3
- describe Crawl do
3
+ describe CobwebCrawlHelper do
4
4
 
5
5
  # this spec tests the crawl object
6
6
 
7
7
  describe "initialize" do
8
8
  describe "without data" do
9
9
  it "should raise an exception" do
10
- lambda {Crawl.new}.should raise_exception
10
+ lambda {CobwebCrawlHelper.new}.should raise_exception
11
11
  end
12
12
  end
13
13
 
14
14
  describe "with data" do
15
15
  before(:each) do
16
16
  data = {:crawl_id => "asdf"}
17
- @crawl = Crawl.new(data)
17
+ @crawl = CobwebCrawlHelper.new(data)
18
18
  end
19
19
  it "should create a crawl object" do
20
- @crawl.should be_an_instance_of Crawl
20
+ @crawl.should be_an_instance_of CobwebCrawlHelper
21
21
  end
22
22
  it "should return an id" do
23
23
  @crawl.should respond_to "id"
@@ -46,7 +46,7 @@ describe Crawl do
46
46
  end
47
47
  describe "after called" do
48
48
  before(:each) do
49
- @crawl = Crawl.new({:crawl_id => "crawl_0_id"})
49
+ @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
50
50
  @crawl.destroy
51
51
  end
52
52
  it "should delete only the crawl specified" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.68
4
+ version: 0.0.70
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-10 00:00:00.000000000 Z
12
+ date: 2012-09-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70324863540700 !ruby/object:Gem::Requirement
16
+ requirement: &70248368307060 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70324863540700
24
+ version_requirements: *70248368307060
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70324863539560 !ruby/object:Gem::Requirement
27
+ requirement: &70248368306020 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70324863539560
35
+ version_requirements: *70248368306020
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70324863538960 !ruby/object:Gem::Requirement
38
+ requirement: &70248368305360 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70324863538960
46
+ version_requirements: *70248368305360
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70324863537700 !ruby/object:Gem::Requirement
49
+ requirement: &70248368304140 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70324863537700
57
+ version_requirements: *70248368304140
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70324863537120 !ruby/object:Gem::Requirement
60
+ requirement: &70248368303560 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70324863537120
68
+ version_requirements: *70248368303560
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70324863536500 !ruby/object:Gem::Requirement
71
+ requirement: &70248368302820 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70324863536500
79
+ version_requirements: *70248368302820
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70324863535620 !ruby/object:Gem::Requirement
82
+ requirement: &70248368302080 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70324863535620
90
+ version_requirements: *70248368302080
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70324863534860 !ruby/object:Gem::Requirement
93
+ requirement: &70248368301260 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70324863534860
101
+ version_requirements: *70248368301260
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70324863534000 !ruby/object:Gem::Requirement
104
+ requirement: &70248368300400 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70324863534000
112
+ version_requirements: *70248368300400
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70324863533220 !ruby/object:Gem::Requirement
115
+ requirement: &70248368299680 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70324863533220
123
+ version_requirements: *70248368299680
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -310,13 +310,13 @@ files:
310
310
  - spec/spec.opts
311
311
  - spec/spec_helper.rb
312
312
  - lib/cobweb.rb
313
+ - lib/cobweb_crawl_helper.rb
313
314
  - lib/cobweb_crawler.rb
314
315
  - lib/cobweb_finished_job.rb
315
316
  - lib/cobweb_links.rb
316
317
  - lib/cobweb_process_job.rb
317
318
  - lib/cobweb_version.rb
318
319
  - lib/content_link_parser.rb
319
- - lib/crawl.rb
320
320
  - lib/crawl_job.rb
321
321
  - lib/encoding_safe_process_job.rb
322
322
  - lib/hash_util.rb