cobweb 0.0.68 → 0.0.70

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.68
2
+ h1. Cobweb v0.0.70
3
3
 
4
4
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
5
5
 
@@ -138,15 +138,13 @@ You can also run within a block and get access to each page as it is being crawl
138
138
  bc. statistics = CobwebCrawler.new(:cache => 600).crawl("http://www.pepsico.com") do |page|
139
139
  puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
140
140
  end
141
- puts "Finished Crawl in "
141
+ puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets."
142
142
 
143
+ h3. CobwebCrawlHelper
143
144
 
145
+ The CobwebCrawlHelper class is a helper class that assists in getting information about a crawl and in performing functions against the crawl.
144
146
 
145
- h3. Crawl
146
-
147
- The crawl class is a helper class to assist in getting information about a crawl and to perform functions against the crawl
148
-
149
- bc. crawl = Crawl.new(options)
147
+ bc. crawl = CobwebCrawlHelper.new(options)
150
148
 
151
149
  * options - the hash of options passed into Cobweb.new (must include a :crawl_id)
152
150
 
@@ -1,5 +1,5 @@
1
1
  # The CobwebCrawlHelper class gives easy access to information about the crawl, and gives the ability to stop a crawl
2
- class Crawl
2
+ class CobwebCrawlHelper
3
3
 
4
4
  attr_accessor :id
5
5
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.68"
6
+ "0.0.70"
7
7
  end
8
8
 
9
9
  end
@@ -14,7 +14,7 @@ class CrawlJob
14
14
  # change all hash keys to symbols
15
15
  content_request = HashUtil.deep_symbolize_keys(content_request)
16
16
  @content_request = content_request
17
- @crawl = Crawl.new(content_request)
17
+ @crawl = CobwebCrawlHelper.new(content_request)
18
18
 
19
19
  content_request[:redis_options] = {} unless content_request.has_key? :redis_options
20
20
  content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
@@ -28,13 +28,14 @@ class CrawlJob
28
28
  # check we haven't crawled this url before
29
29
  unless @redis.sismember "crawled", content_request[:url]
30
30
  # if there is no limit or we're still under it lets get the url
31
- if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != Crawl::CANCELLED
31
+ if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
32
32
  content = Cobweb.new(content_request).get(content_request[:url], content_request)
33
33
  if content_request[:url] == @redis.get("original_base_url")
34
34
  @redis.set("crawled_base_url", content[:base_url])
35
35
  end
36
36
  if is_permitted_type(content)
37
37
  begin
38
+ @redis.incr "inprogress"
38
39
  # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
39
40
  @redis.srem "queued", content_request[:url]
40
41
  @redis.sadd "crawled", content_request[:url]
@@ -70,8 +71,8 @@ class CrawlJob
70
71
 
71
72
  internal_links.each do |link|
72
73
  puts link
73
- puts "Not enqueuing due to cancelled crawl" if @crawl.status == Crawl::CANCELLED
74
- if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != Crawl::CANCELLED
74
+ puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
75
+ if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
75
76
  enqueue_content(content_request, link)
76
77
  end
77
78
  end
@@ -88,6 +89,7 @@ class CrawlJob
88
89
  end
89
90
 
90
91
  ensure
92
+ @redis.decr "inprogress"
91
93
  #update the queued and crawled lists if we are within the crawl limits.
92
94
 
93
95
  # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
@@ -126,10 +128,10 @@ class CrawlJob
126
128
 
127
129
  end
128
130
 
129
- # Sets the crawl status to Crawl::FINISHED and enqueues the crawl finished job
131
+ # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
130
132
  def self.finished(content_request)
131
133
  # finished
132
- if @crawl.status != Crawl::FINISHED and @crawl.status != Crawl::CANCELLED
134
+ if @crawl.status != CobwebCrawlHelper::FINISHED and @crawl.status != CobwebCrawlHelper::CANCELLED && @redis.get("inprogress").to_i==0
133
135
  ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
134
136
  @stats.end_crawl(content_request)
135
137
 
@@ -195,8 +197,9 @@ class CrawlJob
195
197
  new_request = content_request.clone
196
198
  new_request[:url] = link
197
199
  new_request[:parent] = content_request[:url]
198
- Resque.enqueue(CrawlJob, new_request)
200
+ #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
199
201
  @redis.sadd "queued", link
202
+ Resque.enqueue(CrawlJob, new_request)
200
203
  increment_queue_counter
201
204
  end
202
205
 
@@ -16,16 +16,16 @@ class Stats
16
16
  @redis.hset "crawl_details", key, options[key].to_s
17
17
  end
18
18
  end
19
- @redis.hset "statistics", "current_status", Crawl::STARTING
19
+ @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
20
20
  end
21
21
 
22
22
  # Removes the crawl from the running crawls and updates status
23
23
  def end_crawl(options, cancelled=false)
24
24
  @full_redis.srem "cobweb_crawls", options[:crawl_id]
25
25
  if cancelled
26
- @redis.hset "statistics", "current_status", Crawl::CANCELLED
26
+ @redis.hset "statistics", "current_status", CobwebCrawlHelper::CANCELLED
27
27
  else
28
- @redis.hset "statistics", "current_status", Crawl::FINISHED
28
+ @redis.hset "statistics", "current_status", CobwebCrawlHelper::FINISHED
29
29
  end
30
30
  @redis.del "crawl_details"
31
31
  end
@@ -158,7 +158,7 @@ class Stats
158
158
 
159
159
  # Sets the current status of the crawl
160
160
  def update_status(status)
161
- #@redis.hset("statistics", "current_status", status) unless status == Crawl::CANCELLED
161
+ #@redis.hset("statistics", "current_status", status) unless status == CobwebCrawlHelper::CANCELLED
162
162
  end
163
163
 
164
164
  # Returns the current status of the crawl
@@ -33,7 +33,7 @@ describe Cobweb, :local_only => true do
33
33
  end
34
34
  it "should not crawl anything if nothing has started" do
35
35
  crawl = @cobweb.start(@base_url)
36
- crawl_obj = Crawl.new(crawl)
36
+ crawl_obj = CobwebCrawlHelper.new(crawl)
37
37
  crawl_obj.destroy
38
38
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
39
39
  wait_for_crawl_finished crawl[:crawl_id]
@@ -42,7 +42,7 @@ describe Cobweb, :local_only => true do
42
42
 
43
43
  it "should not complete the crawl when cancelled" do
44
44
  crawl = @cobweb.start(@base_url)
45
- crawl_obj = Crawl.new(crawl)
45
+ crawl_obj = CobwebCrawlHelper.new(crawl)
46
46
  sleep 6
47
47
  crawl_obj.destroy
48
48
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -231,7 +231,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
231
231
  end
232
232
 
233
233
  def running?(crawl_id)
234
- @stat.get_status != Crawl::FINISHED and @stat.get_status != Crawl::CANCELLED
234
+ @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
235
235
  end
236
236
 
237
237
  def clear_queues
@@ -1,23 +1,23 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
2
 
3
- describe Crawl do
3
+ describe CobwebCrawlHelper do
4
4
 
5
5
  # this spec tests the crawl object
6
6
 
7
7
  describe "initialize" do
8
8
  describe "without data" do
9
9
  it "should raise an exception" do
10
- lambda {Crawl.new}.should raise_exception
10
+ lambda {CobwebCrawlHelper.new}.should raise_exception
11
11
  end
12
12
  end
13
13
 
14
14
  describe "with data" do
15
15
  before(:each) do
16
16
  data = {:crawl_id => "asdf"}
17
- @crawl = Crawl.new(data)
17
+ @crawl = CobwebCrawlHelper.new(data)
18
18
  end
19
19
  it "should create a crawl object" do
20
- @crawl.should be_an_instance_of Crawl
20
+ @crawl.should be_an_instance_of CobwebCrawlHelper
21
21
  end
22
22
  it "should return an id" do
23
23
  @crawl.should respond_to "id"
@@ -46,7 +46,7 @@ describe Crawl do
46
46
  end
47
47
  describe "after called" do
48
48
  before(:each) do
49
- @crawl = Crawl.new({:crawl_id => "crawl_0_id"})
49
+ @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
50
50
  @crawl.destroy
51
51
  end
52
52
  it "should delete only the crawl specified" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.68
4
+ version: 0.0.70
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-10 00:00:00.000000000 Z
12
+ date: 2012-09-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70324863540700 !ruby/object:Gem::Requirement
16
+ requirement: &70248368307060 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70324863540700
24
+ version_requirements: *70248368307060
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70324863539560 !ruby/object:Gem::Requirement
27
+ requirement: &70248368306020 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70324863539560
35
+ version_requirements: *70248368306020
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70324863538960 !ruby/object:Gem::Requirement
38
+ requirement: &70248368305360 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70324863538960
46
+ version_requirements: *70248368305360
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70324863537700 !ruby/object:Gem::Requirement
49
+ requirement: &70248368304140 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70324863537700
57
+ version_requirements: *70248368304140
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70324863537120 !ruby/object:Gem::Requirement
60
+ requirement: &70248368303560 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70324863537120
68
+ version_requirements: *70248368303560
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70324863536500 !ruby/object:Gem::Requirement
71
+ requirement: &70248368302820 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70324863536500
79
+ version_requirements: *70248368302820
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70324863535620 !ruby/object:Gem::Requirement
82
+ requirement: &70248368302080 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70324863535620
90
+ version_requirements: *70248368302080
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70324863534860 !ruby/object:Gem::Requirement
93
+ requirement: &70248368301260 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70324863534860
101
+ version_requirements: *70248368301260
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70324863534000 !ruby/object:Gem::Requirement
104
+ requirement: &70248368300400 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70324863534000
112
+ version_requirements: *70248368300400
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70324863533220 !ruby/object:Gem::Requirement
115
+ requirement: &70248368299680 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70324863533220
123
+ version_requirements: *70248368299680
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -310,13 +310,13 @@ files:
310
310
  - spec/spec.opts
311
311
  - spec/spec_helper.rb
312
312
  - lib/cobweb.rb
313
+ - lib/cobweb_crawl_helper.rb
313
314
  - lib/cobweb_crawler.rb
314
315
  - lib/cobweb_finished_job.rb
315
316
  - lib/cobweb_links.rb
316
317
  - lib/cobweb_process_job.rb
317
318
  - lib/cobweb_version.rb
318
319
  - lib/content_link_parser.rb
319
- - lib/crawl.rb
320
320
  - lib/crawl_job.rb
321
321
  - lib/encoding_safe_process_job.rb
322
322
  - lib/hash_util.rb