cobweb 0.0.33 → 0.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.33
2
+ h1. Cobweb v0.0.34
3
3
 
4
4
  h2. Intro
5
5
 
@@ -4,6 +4,7 @@ require 'resque'
4
4
  require "addressable/uri"
5
5
  require 'digest/sha1'
6
6
  require 'base64'
7
+ require 'namespaced_redis'
7
8
 
8
9
  Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
9
10
  require file
@@ -19,7 +20,7 @@ class Cobweb
19
20
  # investigate using event machine for single threaded crawling
20
21
 
21
22
  def self.version
22
- "0.0.33"
23
+ "0.0.34"
23
24
  end
24
25
 
25
26
  def method_missing(method_sym, *arguments, &block)
@@ -61,7 +62,7 @@ class Cobweb
61
62
  end
62
63
 
63
64
  request.merge!(@options)
64
- @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
65
+ @redis = NamespacedRedis.new(request[:redis_options], "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
65
66
  @redis.hset "statistics", "queued_at", DateTime.now
66
67
  @redis.set("crawl-counter", 0)
67
68
  @redis.set("queue-counter", 1)
@@ -74,9 +75,11 @@ class Cobweb
74
75
  end
75
76
 
76
77
  def get(url, options = @options)
77
-
78
78
  raise "url cannot be nil" if url.nil?
79
-
79
+ uri = Addressable::URI.parse(url)
80
+ uri.fragment=nil
81
+ url = uri.to_s
82
+
80
83
  # get the unique id for this request
81
84
  unique_id = Digest::SHA1.hexdigest(url.to_s)
82
85
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -87,9 +90,9 @@ class Cobweb
87
90
 
88
91
  # connect to redis
89
92
  if options.has_key? :crawl_id
90
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
93
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
91
94
  else
92
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
95
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
93
96
  end
94
97
 
95
98
  content = {:base_url => url}
@@ -100,8 +103,8 @@ class Cobweb
100
103
  content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
101
104
  else
102
105
  # this url is valid for processing so lets get on with it
103
- uri = Addressable::URI.parse(url.strip)
104
-
106
+ #TODO the @http here is different from in head. Should it be? - in head we are using a method-scoped variable.
107
+
105
108
  # retrieve data
106
109
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
107
110
  puts "Creating connection to #{uri.host}..." unless @options[:quiet]
@@ -125,7 +128,7 @@ class Cobweb
125
128
  puts "redirected... " unless @options[:quiet]
126
129
 
127
130
  # get location to redirect to
128
- url = Addressable::URI.join(uri, response['location']).to_s
131
+ url = UriHelper.join_no_fragment(uri, response['location'])
129
132
 
130
133
  # decrement redirect limit
131
134
  redirect_limit = redirect_limit - 1
@@ -231,7 +234,10 @@ class Cobweb
231
234
 
232
235
  def head(url, options = @options)
233
236
  raise "url cannot be nil" if url.nil?
234
-
237
+ uri = Addressable::URI.parse(url.strip)
238
+ uri.fragment=nil
239
+ url = uri.to_s
240
+
235
241
  # get the unique id for this request
236
242
  unique_id = Digest::SHA1.hexdigest(url)
237
243
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -242,9 +248,9 @@ class Cobweb
242
248
 
243
249
  # connect to redis
244
250
  if options.has_key? :crawl_id
245
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
251
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
246
252
  else
247
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
253
+ redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
248
254
  end
249
255
 
250
256
  content = {}
@@ -255,8 +261,7 @@ class Cobweb
255
261
  content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
256
262
  else
257
263
  print "Retrieving #{url }... " unless @options[:quiet]
258
- uri = Addressable::URI.parse(url.strip)
259
-
264
+
260
265
  # retrieve data
261
266
  http = Net::HTTP.new(uri.host, uri.inferred_port)
262
267
  if uri.scheme == "https"
@@ -269,11 +274,12 @@ class Cobweb
269
274
  http.open_timeout = @options[:timeout].to_i
270
275
 
271
276
  begin
272
- response = http.head(uri.to_s)
273
-
277
+ request = Net::HTTP::Head.new uri.request_uri
278
+ response = http.request request
279
+
274
280
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
275
281
  puts "redirected... " unless @options[:quiet]
276
- url = Addressable::URI.parse(response['location']).to_s
282
+ url = UriHelper.join_no_fragment(uri, response['location'])
277
283
  redirect_limit = redirect_limit - 1
278
284
  options = options.clone
279
285
  options[:redirect_limit]=redirect_limit
@@ -1,6 +1,7 @@
1
1
  require 'digest/md5'
2
2
  require 'date'
3
3
  require 'ap'
4
+ #require 'namespaced_redis'
4
5
 
5
6
  class CobwebCrawler
6
7
 
@@ -11,8 +12,8 @@ class CobwebCrawler
11
12
 
12
13
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
13
14
  crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
14
-
15
- @redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{crawl_id}")
15
+
16
+ @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{crawl_id}")
16
17
 
17
18
  @cobweb = Cobweb.new(@options)
18
19
  end
@@ -3,6 +3,7 @@ class CrawlJob
3
3
  require "net/https"
4
4
  require "uri"
5
5
  require "redis"
6
+ require 'namespaced_redis'
6
7
 
7
8
  @queue = :cobweb_crawl_job
8
9
 
@@ -11,7 +12,7 @@ class CrawlJob
11
12
  # change all hash keys to symbols
12
13
  content_request = content_request.deep_symbolize_keys
13
14
 
14
- @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
15
+ @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
15
16
 
16
17
  @debug = content_request[:debug]
17
18
 
@@ -57,7 +58,7 @@ class CrawlJob
57
58
 
58
59
  #if the enqueue counter has been requested update that
59
60
  if content_request.has_key? :enqueue_counter_key
60
- enqueue_redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), content_request[:enqueue_counter_namespace].to_s)
61
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
61
62
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
62
63
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
63
64
  end
@@ -126,11 +127,12 @@ class CrawlJob
126
127
  def self.all_links_from_content(content)
127
128
  links = content[:links].keys.map{|key| content[:links][key]}.flatten
128
129
  links.reject!{|link| link.starts_with?("javascript:")}
129
- links = links.map{|link| Addressable::URI.join(content[:url], link)}
130
+ links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
130
131
  links.select!{|link| link.scheme == "http" || link.scheme == "https"}
132
+ links.uniq
131
133
  links
132
134
  end
133
-
135
+
134
136
  def self.enqueue_content(content_request, link)
135
137
  new_request = content_request.clone
136
138
  new_request[:url] = link
@@ -0,0 +1,8 @@
1
# Helper for building URIs that ignore the fragment part.
class UriHelper
  # Resolves +link+ against +content+ using Addressable::URI.join and clears
  # the fragment on the result.
  #
  # content - base the link is resolved against (anything accepted by
  #           Addressable::URI.join)
  # link    - link to resolve
  #
  # Returns the joined URI object with its fragment set to nil.
  def self.join_no_fragment(content, link)
    Addressable::URI.join(content, link).tap { |joined| joined.fragment = nil }
  end
end
@@ -117,7 +117,7 @@ describe Cobweb do
117
117
 
118
118
  #end
119
119
  it "should not follow with redirect disabled" do
120
- @cobweb = Cobweb.new(:follow_redirects => false, :cache => nil)
120
+ @cobweb = Cobweb.new(:follow_redirects => false, :cache => 3)
121
121
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
122
122
 
123
123
  content = @cobweb.get(@base_url)
@@ -133,7 +133,8 @@ describe Cobweb do
133
133
  describe "with cache" do
134
134
 
135
135
  before(:each) do
136
- @cobweb = Cobweb.new :quiet => true, :cache => 200
136
+ @cobweb = Cobweb.new :quiet => true, :cache => 1
137
+ Redis.new.flushdb
137
138
  end
138
139
 
139
140
  describe "content object" do
@@ -169,8 +170,8 @@ describe Cobweb do
169
170
  @cobweb.get(@base_url)[:body].should == "asdf"
170
171
  end
171
172
  it "should return correct headers" do
172
- @cobweb.get(@base_url)[:headers].should == @default_headers
173
- @cobweb.get(@base_url)[:headers].should == @default_headers
173
+ @cobweb.get(@base_url)[:headers].should == @symbolized_default_headers
174
+ @cobweb.get(@base_url)[:headers].should == @symbolized_default_headers
174
175
  end
175
176
  it "should return correct a hash of links" do
176
177
  @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
@@ -181,7 +182,29 @@ describe Cobweb do
181
182
  @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
182
183
  end
183
184
  end
184
-
185
185
  end
186
+ describe "location setting" do
187
+ it "Get should strip fragments" do
188
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
+ Net::HTTP::Get.should_receive(:new).with("/")
190
+ @cobweb.get("http://www.google.com/#ignore")
191
+ end
192
+ it "head should strip fragments" do
193
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
194
+ Net::HTTP::Head.should_receive(:new).with("/").and_return(@mock_http_request)
195
+ @cobweb.head("http://www.google.com/#ignore")
196
+ end
197
+ it "get should not strip path" do
198
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff")
200
+ @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
+ end
202
+ it "get should not strip query string" do
203
+ Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string")
205
+ @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
+ end
207
+ end
208
+
186
209
  end
187
210
  end
@@ -3,11 +3,10 @@ require 'mock_redis'
3
3
 
4
4
  RSpec.configure do |config|
5
5
  config.before(:each) {
6
- #redis_mock = double("redis")
7
- #ap redis_mock
8
- #redis_mock.stub(:new).and_return(MockRedis.new)
6
+ redis_mock = double("redis")
7
+ redis_mock.stub(:new).and_return(@redis_mock_object)
9
8
 
10
- Redis.new.flushdb
9
+ #redis_mock.flushdb
11
10
 
12
11
 
13
12
  @default_headers = {"Cache-Control" => "private, max-age=0",
@@ -19,6 +18,15 @@ RSpec.configure do |config|
19
18
  "Server" => "gws",
20
19
  "X-XSS-Protection" => "1; mode=block"}
21
20
 
21
+ @symbolized_default_headers = {:"Cache-Control" => "private, max-age=0",
22
+ :"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
23
+ :"Expires" => "-1",
24
+ :"Content-Type" => "text/html; charset=UTF-8",
25
+ :"Content-Encoding" => "",
26
+ :"Transfer-Encoding" => "chunked",
27
+ :"Server" => "gws",
28
+ :"X-XSS-Protection" => "1; mode=block"}
29
+
22
30
  @mock_http_client = mock(Net::HTTP)
23
31
  @mock_http_request = mock(Net::HTTPRequest)
24
32
  @mock_http_redirect_request = mock(Net::HTTPRequest)
@@ -33,7 +41,9 @@ RSpec.configure do |config|
33
41
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
34
42
  Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
35
43
  Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
36
-
44
+
45
+ Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
46
+
37
47
  @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
38
48
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
39
49
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.33
4
+ version: 0.0.34
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-12 00:00:00.000000000 Z
12
+ date: 2012-04-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70153227362400 !ruby/object:Gem::Requirement
16
+ requirement: &70240600898440 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70153227362400
24
+ version_requirements: *70240600898440
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70153227361980 !ruby/object:Gem::Requirement
27
+ requirement: &70240600898020 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70153227361980
35
+ version_requirements: *70240600898020
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70153227361560 !ruby/object:Gem::Requirement
38
+ requirement: &70240600897600 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70153227361560
46
+ version_requirements: *70240600897600
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70153227361140 !ruby/object:Gem::Requirement
49
+ requirement: &70240600897180 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70153227361140
57
+ version_requirements: *70240600897180
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70153227360720 !ruby/object:Gem::Requirement
60
+ requirement: &70240600913120 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70153227360720
68
+ version_requirements: *70240600913120
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70153227360300 !ruby/object:Gem::Requirement
71
+ requirement: &70240600912700 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70153227360300
79
+ version_requirements: *70240600912700
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70153227359880 !ruby/object:Gem::Requirement
82
+ requirement: &70240600912280 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70153227359880
90
+ version_requirements: *70240600912280
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70153227359460 !ruby/object:Gem::Requirement
93
+ requirement: &70240600911860 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70153227359460
101
+ version_requirements: *70240600911860
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70153227359040 !ruby/object:Gem::Requirement
104
+ requirement: &70240600911440 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,18 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70153227359040
112
+ version_requirements: *70240600911440
113
+ - !ruby/object:Gem::Dependency
114
+ name: namespaced_redis
115
+ requirement: &70240600911020 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ type: :runtime
122
+ prerelease: false
123
+ version_requirements: *70240600911020
113
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
114
125
  your crawl.
115
126
  email: stewart@rockwellcottage.com
@@ -130,10 +141,10 @@ files:
130
141
  - lib/cobweb_process_job.rb
131
142
  - lib/content_link_parser.rb
132
143
  - lib/crawl_job.rb
133
- - lib/namespaced_redis.rb
134
144
  - lib/redirect_error.rb
135
145
  - lib/robots.rb
136
146
  - lib/stats.rb
147
+ - lib/uri_helper.rb
137
148
  - views/statistics.haml
138
149
  - README.textile
139
150
  homepage: http://github.com/stewartmckee/cobweb
@@ -1,88 +0,0 @@
1
- class NamespacedRedis
2
- def initialize(redis, namespace="")
3
- raise "redis must be supplied" if redis.nil?
4
- @redis = redis
5
- @namespace = namespace
6
- end
7
-
8
- def sismember(key, member)
9
- @redis.sismember namespaced(key), member
10
- end
11
-
12
- def sadd(key, value)
13
- @redis.sadd namespaced(key), value
14
- end
15
-
16
- def srem(key, member)
17
- @redis.srem namespaced(key), member
18
- end
19
-
20
- def spop(key)
21
- @redis.spop namespaced(key)
22
- end
23
-
24
- def smembers(key)
25
- @redis.smembers namespaced(key)
26
- end
27
-
28
- def scard(key)
29
- @redis.scard namespaced(key)
30
- end
31
-
32
- def get(key)
33
- @redis.get namespaced(key)
34
- end
35
-
36
- def incr(key)
37
- @redis.incr namespaced(key)
38
- end
39
-
40
- def decr(key)
41
- @redis.decr namespaced(key)
42
- end
43
-
44
- def exist(key)
45
- @redis.exist namespaced(key)
46
- end
47
-
48
- def set(key, value)
49
- @redis.set namespaced(key), value
50
- end
51
-
52
- def hget(key, member)
53
- @redis.hget namespaced(key), member
54
- end
55
-
56
- def hgetall(key)
57
- @redis.hgetall namespaced(key)
58
- end
59
-
60
- def hset(key, member, value)
61
- @redis.hset namespaced(key), member, value
62
- end
63
-
64
- def hexists(key, member)
65
- @redis.hexists namespaced(key), member
66
- end
67
-
68
- def del(key)
69
- @redis.del namespaced(key)
70
- end
71
-
72
- def expire(key, value)
73
- @redis.expire namespaced(key), value
74
- end
75
-
76
- def namespaced(key)
77
- "#{@namespace}-#{key}"
78
- end
79
-
80
- def native
81
- @redis
82
- end
83
-
84
- def namespace
85
- @namespace
86
- end
87
-
88
- end