cobweb 0.0.33 → 0.0.34

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.33
2
+ h1. Cobweb v0.0.34
3
3
 
4
4
  h2. Intro
5
5
 
@@ -4,6 +4,7 @@ require 'resque'
4
4
  require "addressable/uri"
5
5
  require 'digest/sha1'
6
6
  require 'base64'
7
+ require 'namespaced_redis'
7
8
 
8
9
  Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
9
10
  require file
@@ -19,7 +20,7 @@ class Cobweb
19
20
  # investigate using event machine for single threaded crawling
20
21
 
21
22
  def self.version
22
- "0.0.33"
23
+ "0.0.34"
23
24
  end
24
25
 
25
26
  def method_missing(method_sym, *arguments, &block)
@@ -61,7 +62,7 @@ class Cobweb
61
62
  end
62
63
 
63
64
  request.merge!(@options)
64
- @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
65
+ @redis = NamespacedRedis.new(request[:redis_options], "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
65
66
  @redis.hset "statistics", "queued_at", DateTime.now
66
67
  @redis.set("crawl-counter", 0)
67
68
  @redis.set("queue-counter", 1)
@@ -74,9 +75,11 @@ class Cobweb
74
75
  end
75
76
 
76
77
  def get(url, options = @options)
77
-
78
78
  raise "url cannot be nil" if url.nil?
79
-
79
+ uri = Addressable::URI.parse(url)
80
+ uri.fragment=nil
81
+ url = uri.to_s
82
+
80
83
  # get the unique id for this request
81
84
  unique_id = Digest::SHA1.hexdigest(url.to_s)
82
85
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -87,9 +90,9 @@ class Cobweb
87
90
 
88
91
  # connect to redis
89
92
  if options.has_key? :crawl_id
90
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
93
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
91
94
  else
92
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
95
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
93
96
  end
94
97
 
95
98
  content = {:base_url => url}
@@ -100,8 +103,8 @@ class Cobweb
100
103
  content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
101
104
  else
102
105
  # this url is valid for processing so lets get on with it
103
- uri = Addressable::URI.parse(url.strip)
104
-
106
+ #TODO the @http here is different from in head. Should it be? - in head we are using a method-scoped variable.
107
+
105
108
  # retrieve data
106
109
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
107
110
  puts "Creating connection to #{uri.host}..." unless @options[:quiet]
@@ -125,7 +128,7 @@ class Cobweb
125
128
  puts "redirected... " unless @options[:quiet]
126
129
 
127
130
  # get location to redirect to
128
- url = Addressable::URI.join(uri, response['location']).to_s
131
+ url = UriHelper.join_no_fragment(uri, response['location'])
129
132
 
130
133
  # decrement redirect limit
131
134
  redirect_limit = redirect_limit - 1
@@ -231,7 +234,10 @@ class Cobweb
231
234
 
232
235
  def head(url, options = @options)
233
236
  raise "url cannot be nil" if url.nil?
234
-
237
+ uri = Addressable::URI.parse(url.strip)
238
+ uri.fragment=nil
239
+ url = uri.to_s
240
+
235
241
  # get the unique id for this request
236
242
  unique_id = Digest::SHA1.hexdigest(url)
237
243
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -242,9 +248,9 @@ class Cobweb
242
248
 
243
249
  # connect to redis
244
250
  if options.has_key? :crawl_id
245
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
251
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
246
252
  else
247
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
253
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
248
254
  end
249
255
 
250
256
  content = {}
@@ -255,8 +261,7 @@ class Cobweb
255
261
  content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
256
262
  else
257
263
  print "Retrieving #{url }... " unless @options[:quiet]
258
- uri = Addressable::URI.parse(url.strip)
259
-
264
+
260
265
  # retrieve data
261
266
  http = Net::HTTP.new(uri.host, uri.inferred_port)
262
267
  if uri.scheme == "https"
@@ -269,11 +274,12 @@ class Cobweb
269
274
  http.open_timeout = @options[:timeout].to_i
270
275
 
271
276
  begin
272
- response = http.head(uri.to_s)
273
-
277
+ request = Net::HTTP::Head.new uri.request_uri
278
+ response = http.request request
279
+
274
280
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
275
281
  puts "redirected... " unless @options[:quiet]
276
- url = Addressable::URI.parse(response['location']).to_s
282
+ url = UriHelper.join_no_fragment(uri, response['location'])
277
283
  redirect_limit = redirect_limit - 1
278
284
  options = options.clone
279
285
  options[:redirect_limit]=redirect_limit
@@ -1,6 +1,7 @@
1
1
  require 'digest/md5'
2
2
  require 'date'
3
3
  require 'ap'
4
+ #require 'namespaced_redis'
4
5
 
5
6
  class CobwebCrawler
6
7
 
@@ -11,8 +12,8 @@ class CobwebCrawler
11
12
 
12
13
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
13
14
  crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
14
-
15
- @redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{crawl_id}")
15
+
16
+ @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{crawl_id}")
16
17
 
17
18
  @cobweb = Cobweb.new(@options)
18
19
  end
@@ -3,6 +3,7 @@ class CrawlJob
3
3
  require "net/https"
4
4
  require "uri"
5
5
  require "redis"
6
+ require 'namespaced_redis'
6
7
 
7
8
  @queue = :cobweb_crawl_job
8
9
 
@@ -11,7 +12,7 @@ class CrawlJob
11
12
  # change all hash keys to symbols
12
13
  content_request = content_request.deep_symbolize_keys
13
14
 
14
- @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
15
+ @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
15
16
 
16
17
  @debug = content_request[:debug]
17
18
 
@@ -57,7 +58,7 @@ class CrawlJob
57
58
 
58
59
  #if the enqueue counter has been requested update that
59
60
  if content_request.has_key? :enqueue_counter_key
60
- enqueue_redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), content_request[:enqueue_counter_namespace].to_s)
61
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
61
62
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
62
63
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
63
64
  end
@@ -126,11 +127,12 @@ class CrawlJob
126
127
  def self.all_links_from_content(content)
127
128
  links = content[:links].keys.map{|key| content[:links][key]}.flatten
128
129
  links.reject!{|link| link.starts_with?("javascript:")}
129
- links = links.map{|link| Addressable::URI.join(content[:url], link)}
130
+ links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
130
131
  links.select!{|link| link.scheme == "http" || link.scheme == "https"}
132
+ links.uniq
131
133
  links
132
134
  end
133
-
135
+
134
136
  def self.enqueue_content(content_request, link)
135
137
  new_request = content_request.clone
136
138
  new_request[:url] = link
@@ -0,0 +1,8 @@
1
+ class UriHelper
2
+ def self.join_no_fragment(content, link)
3
+ new_link = Addressable::URI.join(content, link)
4
+ new_link.fragment=nil
5
+ new_link
6
+ end
7
+
8
+ end
@@ -117,7 +117,7 @@ describe Cobweb do
117
117
 
118
118
  #end
119
119
  it "should not follow with redirect disabled" do
120
- @cobweb = Cobweb.new(:follow_redirects => false, :cache => nil)
120
+ @cobweb = Cobweb.new(:follow_redirects => false, :cache => 3)
121
121
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
122
122
 
123
123
  content = @cobweb.get(@base_url)
@@ -133,7 +133,8 @@ describe Cobweb do
133
133
  describe "with cache" do
134
134
 
135
135
  before(:each) do
136
- @cobweb = Cobweb.new :quiet => true, :cache => 200
136
+ @cobweb = Cobweb.new :quiet => true, :cache => 1
137
+ Redis.new.flushdb
137
138
  end
138
139
 
139
140
  describe "content object" do
@@ -169,8 +170,8 @@ describe Cobweb do
169
170
  @cobweb.get(@base_url)[:body].should == "asdf"
170
171
  end
171
172
  it "should return correct headers" do
172
- @cobweb.get(@base_url)[:headers].should == @default_headers
173
- @cobweb.get(@base_url)[:headers].should == @default_headers
173
+ @cobweb.get(@base_url)[:headers].should == @symbolized_default_headers
174
+ @cobweb.get(@base_url)[:headers].should == @symbolized_default_headers
174
175
  end
175
176
  it "should return correct a hash of links" do
176
177
  @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
@@ -181,7 +182,29 @@ describe Cobweb do
181
182
  @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
182
183
  end
183
184
  end
184
-
185
185
  end
186
+ describe "location setting" do
187
+ it "Get should strip fragments" do
188
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
+ Net::HTTP::Get.should_receive(:new).with("/")
190
+ @cobweb.get("http://www.google.com/#ignore")
191
+ end
192
+ it "head should strip fragments" do
193
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
194
+ Net::HTTP::Head.should_receive(:new).with("/").and_return(@mock_http_request)
195
+ @cobweb.head("http://www.google.com/#ignore")
196
+ end
197
+ it "get should not strip path" do
198
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff")
200
+ @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
+ end
202
+ it "get should not strip query string" do
203
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string")
205
+ @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
+ end
207
+ end
208
+
186
209
  end
187
210
  end
@@ -3,11 +3,10 @@ require 'mock_redis'
3
3
 
4
4
  RSpec.configure do |config|
5
5
  config.before(:each) {
6
- #redis_mock = double("redis")
7
- #ap redis_mock
8
- #redis_mock.stub(:new).and_return(MockRedis.new)
6
+ redis_mock = double("redis")
7
+ redis_mock.stub(:new).and_return(@redis_mock_object)
9
8
 
10
- Redis.new.flushdb
9
+ #redis_mock.flushdb
11
10
 
12
11
 
13
12
  @default_headers = {"Cache-Control" => "private, max-age=0",
@@ -19,6 +18,15 @@ RSpec.configure do |config|
19
18
  "Server" => "gws",
20
19
  "X-XSS-Protection" => "1; mode=block"}
21
20
 
21
+ @symbolized_default_headers = {:"Cache-Control" => "private, max-age=0",
22
+ :"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
23
+ :"Expires" => "-1",
24
+ :"Content-Type" => "text/html; charset=UTF-8",
25
+ :"Content-Encoding" => "",
26
+ :"Transfer-Encoding" => "chunked",
27
+ :"Server" => "gws",
28
+ :"X-XSS-Protection" => "1; mode=block"}
29
+
22
30
  @mock_http_client = mock(Net::HTTP)
23
31
  @mock_http_request = mock(Net::HTTPRequest)
24
32
  @mock_http_redirect_request = mock(Net::HTTPRequest)
@@ -33,7 +41,9 @@ RSpec.configure do |config|
33
41
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
34
42
  Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
35
43
  Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
36
-
44
+
45
+ Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
46
+
37
47
  @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
38
48
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
39
49
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.33
4
+ version: 0.0.34
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-12 00:00:00.000000000 Z
12
+ date: 2012-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70153227362400 !ruby/object:Gem::Requirement
16
+ requirement: &70240600898440 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70153227362400
24
+ version_requirements: *70240600898440
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70153227361980 !ruby/object:Gem::Requirement
27
+ requirement: &70240600898020 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70153227361980
35
+ version_requirements: *70240600898020
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70153227361560 !ruby/object:Gem::Requirement
38
+ requirement: &70240600897600 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70153227361560
46
+ version_requirements: *70240600897600
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70153227361140 !ruby/object:Gem::Requirement
49
+ requirement: &70240600897180 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70153227361140
57
+ version_requirements: *70240600897180
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70153227360720 !ruby/object:Gem::Requirement
60
+ requirement: &70240600913120 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70153227360720
68
+ version_requirements: *70240600913120
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70153227360300 !ruby/object:Gem::Requirement
71
+ requirement: &70240600912700 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70153227360300
79
+ version_requirements: *70240600912700
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70153227359880 !ruby/object:Gem::Requirement
82
+ requirement: &70240600912280 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70153227359880
90
+ version_requirements: *70240600912280
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70153227359460 !ruby/object:Gem::Requirement
93
+ requirement: &70240600911860 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70153227359460
101
+ version_requirements: *70240600911860
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70153227359040 !ruby/object:Gem::Requirement
104
+ requirement: &70240600911440 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,18 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70153227359040
112
+ version_requirements: *70240600911440
113
+ - !ruby/object:Gem::Dependency
114
+ name: namespaced_redis
115
+ requirement: &70240600911020 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ type: :runtime
122
+ prerelease: false
123
+ version_requirements: *70240600911020
113
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
114
125
  your crawl.
115
126
  email: stewart@rockwellcottage.com
@@ -130,10 +141,10 @@ files:
130
141
  - lib/cobweb_process_job.rb
131
142
  - lib/content_link_parser.rb
132
143
  - lib/crawl_job.rb
133
- - lib/namespaced_redis.rb
134
144
  - lib/redirect_error.rb
135
145
  - lib/robots.rb
136
146
  - lib/stats.rb
147
+ - lib/uri_helper.rb
137
148
  - views/statistics.haml
138
149
  - README.textile
139
150
  homepage: http://github.com/stewartmckee/cobweb
@@ -1,88 +0,0 @@
1
- class NamespacedRedis
2
- def initialize(redis, namespace="")
3
- raise "redis must be supplied" if redis.nil?
4
- @redis = redis
5
- @namespace = namespace
6
- end
7
-
8
- def sismember(key, member)
9
- @redis.sismember namespaced(key), member
10
- end
11
-
12
- def sadd(key, value)
13
- @redis.sadd namespaced(key), value
14
- end
15
-
16
- def srem(key, member)
17
- @redis.srem namespaced(key), member
18
- end
19
-
20
- def spop(key)
21
- @redis.spop namespaced(key)
22
- end
23
-
24
- def smembers(key)
25
- @redis.smembers namespaced(key)
26
- end
27
-
28
- def scard(key)
29
- @redis.scard namespaced(key)
30
- end
31
-
32
- def get(key)
33
- @redis.get namespaced(key)
34
- end
35
-
36
- def incr(key)
37
- @redis.incr namespaced(key)
38
- end
39
-
40
- def decr(key)
41
- @redis.decr namespaced(key)
42
- end
43
-
44
- def exist(key)
45
- @redis.exist namespaced(key)
46
- end
47
-
48
- def set(key, value)
49
- @redis.set namespaced(key), value
50
- end
51
-
52
- def hget(key, member)
53
- @redis.hget namespaced(key), member
54
- end
55
-
56
- def hgetall(key)
57
- @redis.hgetall namespaced(key)
58
- end
59
-
60
- def hset(key, member, value)
61
- @redis.hset namespaced(key), member, value
62
- end
63
-
64
- def hexists(key, member)
65
- @redis.hexists namespaced(key), member
66
- end
67
-
68
- def del(key)
69
- @redis.del namespaced(key)
70
- end
71
-
72
- def expire(key, value)
73
- @redis.expire namespaced(key), value
74
- end
75
-
76
- def namespaced(key)
77
- "#{@namespace}-#{key}"
78
- end
79
-
80
- def native
81
- @redis
82
- end
83
-
84
- def namespace
85
- @namespace
86
- end
87
-
88
- end