cobweb 0.0.19 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,8 +29,7 @@ class Cobweb
29
29
  @options[:debug] = false unless @options.has_key?(:debug)
30
30
  @options[:cache] = 300 unless @options.has_key?(:cache)
31
31
  @options[:timeout] = 10 unless @options.has_key?(:timeout)
32
- @options[:redis_options] = {} unless @options.has_key?(:redis_options)
33
-
32
+ @options[:redis_options] = {} unless @options.has_key?(:redis_options)
34
33
  end
35
34
 
36
35
  def start(base_url)
@@ -41,33 +40,40 @@ class Cobweb
41
40
  }
42
41
 
43
42
  request.merge!(@options)
44
- redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{request[:crawl_id]}")
45
- redis.hset "statistics", "queued_at", DateTime.now
43
+ @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{VERSION}-#{request[:crawl_id]}")
44
+ @redis.hset "statistics", "queued_at", DateTime.now
46
45
 
47
46
  Resque.enqueue(CrawlJob, request)
48
47
  end
49
48
 
50
- def get(url, redirect_limit = @options[:redirect_limit])
49
+ def get(url, options = @options)
51
50
 
52
- raise "url cannot be nil" if url.nil?
51
+ raise "url cannot be nil" if url.nil?
53
52
 
54
53
  absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
55
54
 
56
55
  # get the unique id for this request
57
56
  unique_id = Digest::SHA1.hexdigest(url.to_s)
57
+ redirect_limit = options[:redirect_limit]
58
58
 
59
59
  # connect to redis
60
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
60
+ ap options
61
+ if options.has_key? :crawl_id
62
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
63
+ else
64
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
65
+ end
61
66
 
67
+ ap "===== HEAD NAMESPACE ====="
68
+ ap redis.namespace
69
+ ap "===== HEAD NAMESPACE ====="
70
+
62
71
  content = {}
63
72
 
64
73
  # check if it has already been cached
65
74
  if redis.get(unique_id) and @options[:cache]
66
75
  puts "Cache hit for #{url}" unless @options[:quiet]
67
- content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
68
- content[:body] = Base64.decode64(content[:body])
69
-
70
- content
76
+ content = Marshal.load(redis.get(unique_id)).deep_symbolize_keys
71
77
  else
72
78
  # this url is valid for processing so lets get on with it
73
79
  uri = Addressable::URI.parse(url.strip)
@@ -143,8 +149,7 @@ class Cobweb
143
149
  end
144
150
  # add content to cache if required
145
151
  if @options[:cache]
146
- content[:body] = Base64.encode64(content[:body])
147
- redis.set(unique_id, content.to_json)
152
+ redis.set(unique_id, Marshal.dump(content))
148
153
  redis.expire unique_id, @options[:cache].to_i
149
154
  end
150
155
  rescue RedirectError => e
@@ -196,28 +201,32 @@ class Cobweb
196
201
  content
197
202
  end
198
203
 
199
- def head(url, redirect_limit = @options[:redirect_limit])
204
+ def head(url, options = @options)
200
205
  raise "url cannot be nil" if url.nil?
201
206
 
202
207
  absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
203
208
 
204
209
  # get the unique id for this request
205
210
  unique_id = Digest::SHA1.hexdigest(url)
211
+ redirect_limit = options[:redirect_limit]
206
212
 
207
213
  # connect to redis
208
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
214
+ if options.has_key? :crawl_id
215
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
216
+ else
217
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
218
+ end
209
219
 
220
+ ap "===== HEAD NAMESPACE ====="
221
+ ap redis.namespace
222
+ ap "===== HEAD NAMESPACE ====="
223
+
210
224
  content = {}
211
225
 
212
226
  # check if it has already been cached
213
- if (redis.get(unique_id) or redis.get("head-#{unique_id}")) and @options[:cache]
227
+ if redis.get("head-#{unique_id}") and @options[:cache]
214
228
  puts "Cache hit for #{url}" unless @options[:quiet]
215
- if redis.get(unique_id)
216
- content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
217
- else
218
- content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
219
- end
220
- content
229
+ Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
221
230
  else
222
231
  print "Retrieving #{url }... " unless @options[:quiet]
223
232
  uri = Addressable::URI.parse(url.strip)
@@ -259,7 +268,7 @@ class Cobweb
259
268
  # add content to cache if required
260
269
  if @options[:cache]
261
270
  puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
262
- redis.set("head-#{unique_id}", content.to_json)
271
+ redis.set("head-#{unique_id}", Marshal.dump(content))
263
272
  redis.expire "head-#{unique_id}", @options[:cache].to_i
264
273
  else
265
274
  puts "Not storing in cache as cache disabled" if @options[:debug]
@@ -0,0 +1 @@
1
# Version of the cobweb gem. It is interpolated into the Redis namespaces
# ("cobweb-#{VERSION}-..."), so cache entries written by one release are not
# read back by another. Frozen so the shared string cannot be mutated in place.
VERSION = "0.0.20".freeze
@@ -28,7 +28,8 @@ class CrawlJob
28
28
  def self.perform(content_request)
29
29
  # change all hash keys to symbols
30
30
  content_request.deep_symbolize_keys
31
- redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
31
+ redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{VERSION}-#{content_request[:crawl_id]}")
32
+ ap redis.namespace
32
33
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
33
34
 
34
35
  # check we haven't crawled this url before
@@ -40,7 +41,7 @@ class CrawlJob
40
41
  redis.incr "crawl-counter"
41
42
  crawl_counter += 1
42
43
  if crawl_counter <= content_request[:crawl_limit].to_i
43
- content = Cobweb.new(content_request).get(content_request[:url])
44
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
44
45
 
45
46
  ## update statistics
46
47
  if redis.hexists "statistics", "average_response_time"
@@ -0,0 +1,9 @@
1
# Fetches the robots file for the site that hosts a given URL.
#
# The robots URL is assembled from the scheme, host and port of the supplied
# URL plus the file name, and is requested through Cobweb with :cache => 6000.
class Robots

  # Full URL of the robots file that was requested.
  attr_reader :url

  # Whatever Cobweb#get returned for the robots file.
  attr_reader :content

  # url       - any URL on the target site; parsed with URI.parse, which
  #             raises URI::InvalidURIError for malformed input.
  # file_name - name of the robots file at the site root.
  def initialize(url, file_name="robots.txt")
    uri = URI.parse(url)

    # Build the location once and keep it; previously it was built twice and
    # both the URL and the fetched content were thrown away.
    @url = "#{uri.scheme}://#{uri.host}:#{uri.port}/#{file_name}"
    @content = Cobweb.new(:cache => 6000).get(@url)
  end
end
@@ -20,12 +20,15 @@ class Stats < Sinatra::Base
20
20
  haml :statistics
21
21
  end
22
22
 
23
- end
24
-
25
- thread = Thread.new do
26
- Stats.run!
27
23
 
28
- ## we need to manually kill the main thread as sinatra traps the interrupts
29
- Thread.main.kill
24
# Boots the Stats Sinatra application on a background thread.
#
# Returns the Thread that runs the server. Once Stats.run! returns, that
# thread kills the main thread.
def self.start
  Thread.new do
    Stats.run!

    ## we need to manually kill the main thread as sinatra traps the interrupts
    Thread.main.kill
  end
end
30
32
  end
31
33
 
34
+
@@ -169,19 +169,68 @@ describe Cobweb do
169
169
 
170
170
  end
171
171
  end
172
+
173
# Every example calls #get twice with the same URL: with :cache enabled the
# first call is expected to populate the cache and the second to be answered
# from it (assuming Redis starts empty for each example), and both must
# report the same content.
describe "with cache" do

  before(:each) do
    @cobweb = Cobweb.new :quiet => true, :cache => 200
  end

  describe "content object" do
    it "should return the url" do
      @cobweb.get(@base_url)[:url].should == @base_url
      @cobweb.get(@base_url)[:url].should == @base_url
    end
    it "should return correct content-type" do
      @mock_http_response.stub!(:content_type).and_return("image/jpeg")
      @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
      @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
    end
    # A second, byte-identical copy of this example was removed.
    it "should return correct status-code" do
      @mock_http_response.stub!(:code).and_return(404)
      @cobweb.get(@base_url)[:status_code].should == 404
      @cobweb.get(@base_url)[:status_code].should == 404
    end
    it "should return correct character_set" do
      @cobweb.get(@base_url)[:character_set].should == "UTF-8"
      @cobweb.get(@base_url)[:character_set].should == "UTF-8"
    end
    it "should return correct content_length" do
      @cobweb.get(@base_url)[:length].should == 1024
      @cobweb.get(@base_url)[:length].should == 1024
    end
    it "should return correct content_body" do
      @cobweb.get(@base_url)[:body].should == "asdf"
      @cobweb.get(@base_url)[:body].should == "asdf"
    end
    it "should return correct location" do
      @cobweb.get(@base_url)[:location].should == nil
      @cobweb.get(@base_url)[:location].should == nil

      # NOTE(review): the response for @base_url was presumably cached by the
      # calls above, so the re-stubbed location may never be observed here -
      # confirm whether this example needs a cache flush or a different URL.
      @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
      @cobweb.get(@base_url)[:location].should == "http://google.com/"
      @cobweb.get(@base_url)[:location].should == "http://google.com/"
    end
    it "should return correct headers" do
      @cobweb.get(@base_url)[:headers].should == @default_headers
      @cobweb.get(@base_url)[:headers].should == @default_headers
    end
    it "should return correct a hash of links" do
      @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
      @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
    end
    it "should return the response time for the url" do
      @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
      @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
    end
  end

end
172
234
  end
173
235
  end
174
-
175
- describe "without mock" do
176
-
177
- it "should throw exception when server is unavailable" #do
178
- # lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
179
- #end
180
-
181
- it "should return a valid content hash when url doesn't exist on a live server" do
182
- status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
183
- status_code.should == 404
184
- end
185
-
186
- end
187
236
  end
@@ -1 +1,13 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
2
+ require 'mock_redis'
3
+
4
RSpec.configure do |config|
  # Empty the Redis database before every example so that entries written by
  # one example cannot be read by the next.
  #
  # NOTE(review): this calls flushdb on whatever server Redis.new connects to
  # with its default settings, i.e. a real Redis rather than a mock - confirm
  # the specs only ever run against a disposable instance.
  # TODO: stub Redis.new to return a MockRedis so no live server is touched.
  config.before(:each) do
    Redis.new.flushdb
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.20
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-04 00:00:00.000000000 Z
12
+ date: 2012-03-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70137319788100 !ruby/object:Gem::Requirement
16
+ requirement: &70234753393720 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70137319788100
24
+ version_requirements: *70234753393720
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70137319787220 !ruby/object:Gem::Requirement
27
+ requirement: &70234753392840 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70137319787220
35
+ version_requirements: *70234753392840
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: absolutize
38
- requirement: &70137319786300 !ruby/object:Gem::Requirement
38
+ requirement: &70234753392180 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70137319786300
46
+ version_requirements: *70234753392180
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70137319785420 !ruby/object:Gem::Requirement
49
+ requirement: &70234753391520 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70137319785420
57
+ version_requirements: *70234753391520
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: addressable
60
- requirement: &70137319784540 !ruby/object:Gem::Requirement
60
+ requirement: &70234753390840 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70137319784540
68
+ version_requirements: *70234753390840
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
- requirement: &70137319783560 !ruby/object:Gem::Requirement
71
+ requirement: &70234753390200 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70137319783560
79
+ version_requirements: *70234753390200
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: awesome_print
82
- requirement: &70137319782700 !ruby/object:Gem::Requirement
82
+ requirement: &70234753389280 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70137319782700
90
+ version_requirements: *70234753389280
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: sinatra
93
- requirement: &70137319781560 !ruby/object:Gem::Requirement
93
+ requirement: &70234753388360 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70137319781560
101
+ version_requirements: *70234753388360
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: thin
104
- requirement: &70137319780360 !ruby/object:Gem::Requirement
104
+ requirement: &70234753387580 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70137319780360
112
+ version_requirements: *70234753387580
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: haml
115
- requirement: &70137319779820 !ruby/object:Gem::Requirement
115
+ requirement: &70234753386980 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70137319779820
123
+ version_requirements: *70234753386980
124
124
  description:
125
125
  email: stewart@rockwellcottage.com
126
126
  executables: []
@@ -134,6 +134,7 @@ files:
134
134
  - spec/samples/sample_html_links.html
135
135
  - spec/spec.opts
136
136
  - spec/spec_helper.rb
137
+ - lib/cobweb/version.rb
137
138
  - lib/cobweb.rb
138
139
  - lib/cobweb_crawler.rb
139
140
  - lib/cobweb_finished_job.rb
@@ -143,6 +144,7 @@ files:
143
144
  - lib/hash.rb
144
145
  - lib/namespaced_redis.rb
145
146
  - lib/redirect_error.rb
147
+ - lib/robots.rb
146
148
  - lib/stats.rb
147
149
  - views/statistics.haml
148
150
  - README.textile