cobweb 0.0.19 → 0.0.20

Sign up to get free protection for your applications and to get access to all the features.
@@ -29,8 +29,7 @@ class Cobweb
29
29
  @options[:debug] = false unless @options.has_key?(:debug)
30
30
  @options[:cache] = 300 unless @options.has_key?(:cache)
31
31
  @options[:timeout] = 10 unless @options.has_key?(:timeout)
32
- @options[:redis_options] = {} unless @options.has_key?(:redis_options)
33
-
32
+ @options[:redis_options] = {} unless @options.has_key?(:redis_options)
34
33
  end
35
34
 
36
35
  def start(base_url)
@@ -41,33 +40,40 @@ class Cobweb
41
40
  }
42
41
 
43
42
  request.merge!(@options)
44
- redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{request[:crawl_id]}")
45
- redis.hset "statistics", "queued_at", DateTime.now
43
+ @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{VERSION}-#{request[:crawl_id]}")
44
+ @redis.hset "statistics", "queued_at", DateTime.now
46
45
 
47
46
  Resque.enqueue(CrawlJob, request)
48
47
  end
49
48
 
50
- def get(url, redirect_limit = @options[:redirect_limit])
49
+ def get(url, options = @options)
51
50
 
52
- raise "url cannot be nil" if url.nil?
51
+ raise "url cannot be nil" if url.nil?
53
52
 
54
53
  absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
55
54
 
56
55
  # get the unique id for this request
57
56
  unique_id = Digest::SHA1.hexdigest(url.to_s)
57
+ redirect_limit = options[:redirect_limit]
58
58
 
59
59
  # connect to redis
60
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
60
+ ap options
61
+ if options.has_key? :crawl_id
62
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
63
+ else
64
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
65
+ end
61
66
 
67
+ ap "===== HEAD NAMESPACE ====="
68
+ ap redis.namespace
69
+ ap "===== HEAD NAMESPACE ====="
70
+
62
71
  content = {}
63
72
 
64
73
  # check if it has already been cached
65
74
  if redis.get(unique_id) and @options[:cache]
66
75
  puts "Cache hit for #{url}" unless @options[:quiet]
67
- content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
68
- content[:body] = Base64.decode64(content[:body])
69
-
70
- content
76
+ content = Marshal.load(redis.get(unique_id)).deep_symbolize_keys
71
77
  else
72
78
  # this url is valid for processing so lets get on with it
73
79
  uri = Addressable::URI.parse(url.strip)
@@ -143,8 +149,7 @@ class Cobweb
143
149
  end
144
150
  # add content to cache if required
145
151
  if @options[:cache]
146
- content[:body] = Base64.encode64(content[:body])
147
- redis.set(unique_id, content.to_json)
152
+ redis.set(unique_id, Marshal.dump(content))
148
153
  redis.expire unique_id, @options[:cache].to_i
149
154
  end
150
155
  rescue RedirectError => e
@@ -196,28 +201,32 @@ class Cobweb
196
201
  content
197
202
  end
198
203
 
199
- def head(url, redirect_limit = @options[:redirect_limit])
204
+ def head(url, options = @options)
200
205
  raise "url cannot be nil" if url.nil?
201
206
 
202
207
  absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
203
208
 
204
209
  # get the unique id for this request
205
210
  unique_id = Digest::SHA1.hexdigest(url)
211
+ redirect_limit = options[:redirect_limit]
206
212
 
207
213
  # connect to redis
208
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
214
+ if options.has_key? :crawl_id
215
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
216
+ else
217
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
218
+ end
209
219
 
220
+ ap "===== HEAD NAMESPACE ====="
221
+ ap redis.namespace
222
+ ap "===== HEAD NAMESPACE ====="
223
+
210
224
  content = {}
211
225
 
212
226
  # check if it has already been cached
213
- if (redis.get(unique_id) or redis.get("head-#{unique_id}")) and @options[:cache]
227
+ if redis.get("head-#{unique_id}") and @options[:cache]
214
228
  puts "Cache hit for #{url}" unless @options[:quiet]
215
- if redis.get(unique_id)
216
- content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
217
- else
218
- content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
219
- end
220
- content
229
+ Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
221
230
  else
222
231
  print "Retrieving #{url }... " unless @options[:quiet]
223
232
  uri = Addressable::URI.parse(url.strip)
@@ -259,7 +268,7 @@ class Cobweb
259
268
  # add content to cache if required
260
269
  if @options[:cache]
261
270
  puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
262
- redis.set("head-#{unique_id}", content.to_json)
271
+ redis.set("head-#{unique_id}", Marshal.dump(content))
263
272
  redis.expire "head-#{unique_id}", @options[:cache].to_i
264
273
  else
265
274
  puts "Not storing in cache as cache disabled" if @options[:debug]
@@ -0,0 +1 @@
1
+ VERSION = "0.0.20"
@@ -28,7 +28,8 @@ class CrawlJob
28
28
  def self.perform(content_request)
29
29
  # change all hash keys to symbols
30
30
  content_request.deep_symbolize_keys
31
- redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
31
+ redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{VERSION}-#{content_request[:crawl_id]}")
32
+ ap redis.namespace
32
33
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
33
34
 
34
35
  # check we haven't crawled this url before
@@ -40,7 +41,7 @@ class CrawlJob
40
41
  redis.incr "crawl-counter"
41
42
  crawl_counter += 1
42
43
  if crawl_counter <= content_request[:crawl_limit].to_i
43
- content = Cobweb.new(content_request).get(content_request[:url])
44
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
44
45
 
45
46
  ## update statistics
46
47
  if redis.hexists "statistics", "average_response_time"
@@ -0,0 +1,9 @@
1
+ class Robots
2
+
3
+ def initialize(url, file_name="robots.txt")
4
+ uri = URI.parse(url)
5
+ [uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
6
+ Cobweb.new(:cache => 6000).get([uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join)
7
+
8
+ end
9
+ end
@@ -20,12 +20,15 @@ class Stats < Sinatra::Base
20
20
  haml :statistics
21
21
  end
22
22
 
23
- end
24
-
25
- thread = Thread.new do
26
- Stats.run!
27
23
 
28
- ## we need to manually kill the main thread as sinatra traps the interrupts
29
- Thread.main.kill
24
+ def self.start
25
+ thread = Thread.new do
26
+ Stats.run!
27
+
28
+ ## we need to manually kill the main thread as sinatra traps the interrupts
29
+ Thread.main.kill
30
+ end
31
+ end
30
32
  end
31
33
 
34
+
@@ -169,19 +169,68 @@ describe Cobweb do
169
169
 
170
170
  end
171
171
  end
172
+
173
+ describe "with cache" do
174
+
175
+ before(:each) do
176
+ @cobweb = Cobweb.new :quiet => true, :cache => 200
177
+ end
178
+
179
+ describe "content object" do
180
+ it "should return the url" do
181
+ @cobweb.get(@base_url)[:url].should == @base_url
182
+ @cobweb.get(@base_url)[:url].should == @base_url
183
+ end
184
+ it "should return correct content-type" do
185
+ @mock_http_response.stub!(:content_type).and_return("image/jpeg")
186
+ @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
187
+ @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
188
+ end
189
+ it "should return correct status-code" do
190
+ @mock_http_response.stub!(:code).and_return(404)
191
+ @cobweb.get(@base_url)[:status_code].should == 404
192
+ @cobweb.get(@base_url)[:status_code].should == 404
193
+ end
194
+ it "should return correct status-code" do
195
+ @mock_http_response.stub!(:code).and_return(404)
196
+ @cobweb.get(@base_url)[:status_code].should == 404
197
+ @cobweb.get(@base_url)[:status_code].should == 404
198
+ end
199
+ it "should return correct character_set" do
200
+ @cobweb.get(@base_url)[:character_set].should == "UTF-8"
201
+ @cobweb.get(@base_url)[:character_set].should == "UTF-8"
202
+ end
203
+ it "should return correct content_length" do
204
+ @cobweb.get(@base_url)[:length].should == 1024
205
+ @cobweb.get(@base_url)[:length].should == 1024
206
+ end
207
+ it "should return correct content_body" do
208
+ @cobweb.get(@base_url)[:body].should == "asdf"
209
+ @cobweb.get(@base_url)[:body].should == "asdf"
210
+ end
211
+ it "should return correct location" do
212
+ @cobweb.get(@base_url)[:location].should == nil
213
+ @cobweb.get(@base_url)[:location].should == nil
214
+
215
+ @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
216
+ @cobweb.get(@base_url)[:location].should == "http://google.com/"
217
+ @cobweb.get(@base_url)[:location].should == "http://google.com/"
218
+ end
219
+ it "should return correct headers" do
220
+ @cobweb.get(@base_url)[:headers].should == @default_headers
221
+ @cobweb.get(@base_url)[:headers].should == @default_headers
222
+ end
223
+ it "should return correct a hash of links" do
224
+ @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
225
+ @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
226
+ end
227
+ it "should return the response time for the url" do
228
+ @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
229
+ @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
230
+ end
231
+ end
232
+
233
+ end
172
234
  end
173
235
  end
174
-
175
- describe "without mock" do
176
-
177
- it "should throw exception when server is unavailable" #do
178
- # lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
179
- #end
180
-
181
- it "should return a valid content hash when url doesn't exist on a live server" do
182
- status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
183
- status_code.should == 404
184
- end
185
-
186
- end
187
236
  end
@@ -1 +1,13 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
2
+ require 'mock_redis'
3
+
4
+ RSpec.configure do |config|
5
+ config.before(:each) {
6
+ #redis_mock = double("redis")
7
+ #ap redis_mock
8
+ #redis_mock.stub(:new).and_return(MockRedis.new)
9
+
10
+ Redis.new.flushdb
11
+ }
12
+
13
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.20
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-04 00:00:00.000000000 Z
12
+ date: 2012-03-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70137319788100 !ruby/object:Gem::Requirement
16
+ requirement: &70234753393720 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70137319788100
24
+ version_requirements: *70234753393720
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70137319787220 !ruby/object:Gem::Requirement
27
+ requirement: &70234753392840 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70137319787220
35
+ version_requirements: *70234753392840
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: absolutize
38
- requirement: &70137319786300 !ruby/object:Gem::Requirement
38
+ requirement: &70234753392180 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70137319786300
46
+ version_requirements: *70234753392180
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70137319785420 !ruby/object:Gem::Requirement
49
+ requirement: &70234753391520 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70137319785420
57
+ version_requirements: *70234753391520
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: addressable
60
- requirement: &70137319784540 !ruby/object:Gem::Requirement
60
+ requirement: &70234753390840 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70137319784540
68
+ version_requirements: *70234753390840
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
- requirement: &70137319783560 !ruby/object:Gem::Requirement
71
+ requirement: &70234753390200 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70137319783560
79
+ version_requirements: *70234753390200
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: awesome_print
82
- requirement: &70137319782700 !ruby/object:Gem::Requirement
82
+ requirement: &70234753389280 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70137319782700
90
+ version_requirements: *70234753389280
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: sinatra
93
- requirement: &70137319781560 !ruby/object:Gem::Requirement
93
+ requirement: &70234753388360 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70137319781560
101
+ version_requirements: *70234753388360
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: thin
104
- requirement: &70137319780360 !ruby/object:Gem::Requirement
104
+ requirement: &70234753387580 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70137319780360
112
+ version_requirements: *70234753387580
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: haml
115
- requirement: &70137319779820 !ruby/object:Gem::Requirement
115
+ requirement: &70234753386980 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70137319779820
123
+ version_requirements: *70234753386980
124
124
  description:
125
125
  email: stewart@rockwellcottage.com
126
126
  executables: []
@@ -134,6 +134,7 @@ files:
134
134
  - spec/samples/sample_html_links.html
135
135
  - spec/spec.opts
136
136
  - spec/spec_helper.rb
137
+ - lib/cobweb/version.rb
137
138
  - lib/cobweb.rb
138
139
  - lib/cobweb_crawler.rb
139
140
  - lib/cobweb_finished_job.rb
@@ -143,6 +144,7 @@ files:
143
144
  - lib/hash.rb
144
145
  - lib/namespaced_redis.rb
145
146
  - lib/redirect_error.rb
147
+ - lib/robots.rb
146
148
  - lib/stats.rb
147
149
  - views/statistics.haml
148
150
  - README.textile