cobweb 0.0.37 → 0.0.38

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.37
2
+ h1. Cobweb v0.0.38
3
3
 
4
4
  h2. Intro
5
5
 
@@ -98,6 +98,8 @@ bq. crawler.head("http://www.google.com/")
98
98
  h3. Contributing/Testing
99
99
 
100
100
  Feel free to contribute small or large bits of code, just please make sure that there are rspec test for the features your submitting. We also test on travis at http://travis-ci.org/#!/stewartmckee/cobweb if you want to see the state of the project.
101
+
102
+ Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
101
103
 
102
104
  h2. License
103
105
 
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.37"
23
+ "0.0.38"
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -10,8 +10,9 @@ class CrawlJob
10
10
  def self.perform(content_request)
11
11
 
12
12
  # change all hash keys to symbols
13
- content_request = content_request.deep_symbolize_keys
13
+ content_request = self.deep_symbolize_keys(content_request)
14
14
 
15
+ content_request[:redis_options] = {} unless content_request.has_key? :redis_options
15
16
  @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
16
17
 
17
18
  @debug = content_request[:debug]
@@ -62,18 +63,12 @@ class CrawlJob
62
63
  end
63
64
 
64
65
  # if there's nothing left queued or the crawled limit has been reached
65
- if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
66
-
67
- # finished
68
- stats = @redis.hgetall "statistics"
69
- stats[:total_pages] = @redis.get("total_pages").to_i
70
- stats[:total_assets] = @redis.get("total_assets").to_i
71
- stats[:crawl_counter] = @crawl_counter
72
- stats[:queue_counter] = @queue_counter
73
- stats[:crawled] = @redis.smembers "crawled"
74
-
75
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
76
-
66
+ if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
67
+ if @redis.scard("queued") == 0
68
+ finished(content_request)
69
+ end
70
+ elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
71
+ finished(content_request)
77
72
  end
78
73
  end
79
74
  else
@@ -84,6 +79,14 @@ class CrawlJob
84
79
 
85
80
  end
86
81
 
82
+ def self.finished(content_request)
83
+ # finished
84
+
85
+ Stats.set_totals
86
+
87
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
88
+ end
89
+
87
90
  def self.send_to_processing_queue(content, content_request)
88
91
  content_to_send = content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
89
92
  if content_request[:use_encoding_safe_process_job]
@@ -175,4 +178,15 @@ class CrawlJob
175
178
  @crawl_counter = @redis.get("crawl-counter").to_i
176
179
  @queue_counter = @redis.get("queue-counter").to_i
177
180
  end
181
+ def self.deep_symbolize_keys(hash)
182
+ hash.keys.each do |key|
183
+ value = hash[key]
184
+ hash.delete(key)
185
+ hash[key.to_sym] = value
186
+ if hash[key.to_sym].instance_of? Hash
187
+ hash[key.to_sym] = self.deep_symbolize_keys(hash[key.to_sym])
188
+ end
189
+ end
190
+ hash
191
+ end
178
192
  end
@@ -5,13 +5,32 @@ class Stats < Sinatra::Base
5
5
 
6
6
  def self.update_statistics(statistics)
7
7
  @@statistics = statistics
8
+ @@statistics
9
+ end
10
+
11
+ def self.get_statistics
12
+ @@statistics ||= {}
8
13
  end
9
14
 
10
15
  def self.update_status(status)
11
16
  @@status = status
12
17
  end
13
18
 
19
+ def self.set_totals
20
+ stats = @redis.hgetall "statistics"
21
+ stats[:total_pages] = @redis.get("total_pages").to_i
22
+ stats[:total_assets] = @redis.get("total_assets").to_i
23
+ stats[:crawl_counter] = @crawl_counter
24
+ stats[:queue_counter] = @queue_counter
25
+ stats[:crawled] = @redis.smembers "crawled"
26
+
27
+ Stats.update_statistics(stats)
28
+ end
29
+
14
30
  def self.set_statistics_in_redis(redis, content)
31
+
32
+ @redis = redis
33
+
15
34
  crawl_counter = redis.get("crawl-counter").to_i
16
35
  queue_counter = redis.get("queue-counter").to_i
17
36
 
@@ -62,6 +81,7 @@ class Stats < Sinatra::Base
62
81
  end
63
82
  redis.hset "statistics", "status_counts", status_counts.to_json
64
83
 
84
+ @@statistics = @redis.hgetall "statistics"
65
85
  end
66
86
 
67
87
  set :views, settings.root + '/../views'
@@ -0,0 +1,41 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe Cobweb do
4
+
5
+ before(:each) do
6
+ @base_url = "http://www.baseurl.com/"
7
+ @cobweb = Cobweb.new :quiet => true, :cache => nil
8
+ end
9
+
10
+ describe "detect finish of crawl" do
11
+
12
+ describe "with no crawl limit" do
13
+ before(:each) do
14
+ @request = {
15
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
16
+ :url => @base_url,
17
+ :crawl_limit => nil
18
+ }
19
+ end
20
+
21
+ it "should not limit crawl"
22
+ it "should detect the end or crawl"
23
+ end
24
+
25
+ describe "with a crawl limit" do
26
+ @request = {
27
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
28
+ :url => @base_url,
29
+ :crawl_limit => 3
30
+ }
31
+
32
+ it "should limit crawl"
33
+ it "should detect the end or crawl based on limit"
34
+
35
+ end
36
+
37
+
38
+ end
39
+
40
+
41
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.37
4
+ version: 0.0.38
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-24 00:00:00.000000000 Z
12
+ date: 2012-04-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70275220162200 !ruby/object:Gem::Requirement
16
+ requirement: &70264627738060 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70275220162200
24
+ version_requirements: *70264627738060
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70275220161740 !ruby/object:Gem::Requirement
27
+ requirement: &70264627737640 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70275220161740
35
+ version_requirements: *70264627737640
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70275220177640 !ruby/object:Gem::Requirement
38
+ requirement: &70264627737220 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70275220177640
46
+ version_requirements: *70264627737220
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70275220177220 !ruby/object:Gem::Requirement
49
+ requirement: &70264627736800 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70275220177220
57
+ version_requirements: *70264627736800
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70275220176640 !ruby/object:Gem::Requirement
60
+ requirement: &70264627736380 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70275220176640
68
+ version_requirements: *70264627736380
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70275220176220 !ruby/object:Gem::Requirement
71
+ requirement: &70264627735960 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70275220176220
79
+ version_requirements: *70264627735960
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70275220175780 !ruby/object:Gem::Requirement
82
+ requirement: &70264627735540 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70275220175780
90
+ version_requirements: *70264627735540
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70275220175360 !ruby/object:Gem::Requirement
93
+ requirement: &70264627735120 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70275220175360
101
+ version_requirements: *70264627735120
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70275220174940 !ruby/object:Gem::Requirement
104
+ requirement: &70264627734700 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70275220174940
112
+ version_requirements: *70264627734700
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70275220174520 !ruby/object:Gem::Requirement
115
+ requirement: &70264627734280 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70275220174520
123
+ version_requirements: *70264627734280
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com
@@ -130,6 +130,7 @@ extra_rdoc_files:
130
130
  - README.textile
131
131
  files:
132
132
  - spec/cobweb/cobweb_crawler_spec.rb
133
+ - spec/cobweb/cobweb_job_spec.rb
133
134
  - spec/cobweb/cobweb_spec.rb
134
135
  - spec/cobweb/content_link_parser_spec.rb
135
136
  - spec/samples/sample_html_links.html