cobweb 0.0.32 → 0.0.33

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.32
2
+ h1. Cobweb v0.0.33
3
3
 
4
4
  h2. Intro
5
5
 
@@ -95,13 +95,9 @@ Simple get that obeys the options supplied in new.
95
95
  bq. crawler.head("http://www.google.com/")
96
96
 
97
97
 
98
- h2. Planned Work
98
+ h3. Contributing/Testing
99
99
 
100
- These features are what I'd like to see added to cobweb. If you're really interested in any of these features, give me a ping; it will give me a good excuse to get them done!
101
-
102
- * remove resque requirement - I'd like the crawler to work independently of resque as a normal standalone crawler
103
- * fix crawl finished notification
104
- * improve internal link control
100
+ Feel free to contribute small or large bits of code; just please make sure that there are rspec tests for the features you're submitting. We also test on travis at http://travis-ci.org/#!/stewartmckee/cobweb if you want to see the state of the project.
105
101
 
106
102
  h2. License
107
103
 
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.32"
22
+ "0.0.33"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
@@ -97,7 +97,7 @@ class Cobweb
97
97
  # check if it has already been cached
98
98
  if redis.get(unique_id) and @options[:cache]
99
99
  puts "Cache hit for #{url}" unless @options[:quiet]
100
- content = Marshal.load(redis.get(unique_id)).deep_symbolize_keys
100
+ content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
101
101
  else
102
102
  # this url is valid for processing so lets get on with it
103
103
  uri = Addressable::URI.parse(url.strip)
@@ -169,7 +169,7 @@ class Cobweb
169
169
  content[:body] = Base64.encode64(response.body)
170
170
  end
171
171
  content[:location] = response["location"]
172
- content[:headers] = response.to_hash.deep_symbolize_keys
172
+ content[:headers] = deep_symbolize_keys(response.to_hash)
173
173
  # parse data for links
174
174
  link_parser = ContentLinkParser.new(content[:url], content[:body])
175
175
  content[:links] = link_parser.link_data
@@ -252,7 +252,7 @@ class Cobweb
252
252
  # check if it has already been cached
253
253
  if redis.get("head-#{unique_id}") and @options[:cache]
254
254
  puts "Cache hit for #{url}" unless @options[:quiet]
255
- content = Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
255
+ content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
256
256
  else
257
257
  print "Retrieving #{url }... " unless @options[:quiet]
258
258
  uri = Addressable::URI.parse(url.strip)
@@ -336,9 +336,17 @@ class Cobweb
336
336
  content
337
337
  end
338
338
  end
339
+
340
+ def deep_symbolize_keys(hash)
341
+ hash.keys.each do |key|
342
+ value = hash[key]
343
+ hash.delete(key)
344
+ hash[key.to_sym] = value
345
+ if hash[key.to_sym].instance_of? Hash
346
+ hash[key.to_sym] = deep_symbolize_keys(hash[key.to_sym])
347
+ end
348
+ end
349
+ hash
350
+ end
351
+
339
352
  end
340
-
341
-
342
-
343
-
344
-
@@ -18,7 +18,10 @@ class ContentLinkParser
18
18
  @options[:tags][:images] = [["img[src]", "src"]]
19
19
  @options[:tags][:related] = [["link[rel]", "href"]]
20
20
  @options[:tags][:scripts] = [["script[src]", "src"]]
21
- @options[:tags][:styles] = [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", /url\("?(.*?)"?\)/]]
21
+ @options[:tags][:styles] = [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", lambda{|array,tag|
22
+ first_regex =/url\((['"]?)(.*?)\1\)/
23
+ tag.content.scan(first_regex) {|match| array << Addressable::URI.parse(match[1]).to_s}
24
+ }]]
22
25
 
23
26
  #clear the default tags if required
24
27
  @options[:tags] = {} if @options[:ignore_default_tags]
@@ -67,6 +70,13 @@ class ContentLinkParser
67
70
  rescue
68
71
  end
69
72
  end
73
+ elsif attribute.instance_of? Proc
74
+ @doc.css(selector).each do |tag|
75
+ begin
76
+ attribute.call(array, tag)
77
+ rescue
78
+ end
79
+ end
70
80
  end
71
81
  end
72
82
 
@@ -35,19 +35,21 @@ class CrawlJob
35
35
  # set the base url if this is the first page
36
36
  set_base_url @redis, content, content_request
37
37
 
38
- internal_links = all_links_from_content(content).map{|link| link.to_s}
38
+ if within_queue_limits?(content_request[:crawl_limit])
39
+ internal_links = all_links_from_content(content).map{|link| link.to_s}
39
40
 
40
- # reject the link if we've crawled it or queued it
41
- internal_links.reject!{|link| @redis.sismember("crawled", link)}
42
- internal_links.reject!{|link| @redis.sismember("queued", link)}
41
+ # reject the link if we've crawled it or queued it
42
+ internal_links.reject!{|link| @redis.sismember("crawled", link)}
43
+ internal_links.reject!{|link| @redis.sismember("queued", link)}
43
44
 
44
- # select the link if its internal
45
- internal_links.select!{|link| internal_link?(link)}
45
+ # select the link if its internal
46
+ internal_links.select!{|link| internal_link?(link)}
46
47
 
47
- internal_links.each do |link|
48
- enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
48
+ internal_links.each do |link|
49
+ enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
50
+ end
49
51
  end
50
-
52
+
51
53
  # enqueue to processing queue
52
54
  Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
53
55
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
@@ -74,11 +76,10 @@ class CrawlJob
74
76
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
75
77
 
76
78
  end
77
-
78
- else
79
- puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
80
79
  end
81
80
  else
81
+ @redis.srem "queued", content_request[:url]
82
+ decrement_queue_counter
82
83
  puts "Already crawled #{content_request[:url]}" if content_request[:debug]
83
84
  end
84
85
 
@@ -87,11 +88,11 @@ class CrawlJob
87
88
  private
88
89
 
89
90
  def self.within_crawl_limits?(crawl_limit)
90
- crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
91
+ crawl_limit.nil? or @crawl_counter < crawl_limit.to_i
91
92
  end
92
93
 
93
94
  def self.within_queue_limits?(crawl_limit)
94
- within_crawl_limits?(crawl_limit) and (crawl_limit.nil? or @queue_counter <= crawl_limit.to_i)
95
+ within_crawl_limits?(crawl_limit) and (crawl_limit.nil? or (@queue_counter + @crawl_counter) < crawl_limit.to_i)
95
96
  end
96
97
 
97
98
  def self.set_base_url(redis, content, content_request)
@@ -17,41 +17,40 @@ describe CobwebCrawler do
17
17
 
18
18
  end
19
19
 
20
- describe "with mock" do
21
-
22
-
23
- it "should generate a cobweb_crawler object" do
24
- CobwebCrawler.new.should be_an_instance_of CobwebCrawler
20
+
21
+ it "should generate a cobweb_crawler object" do
22
+ CobwebCrawler.new.should be_an_instance_of CobwebCrawler
23
+ end
24
+
25
+ describe "crawl" do
26
+ it "should crawl a site" do
27
+
28
+ # temporary tests to run crawler - proper specs to follow.. honest
29
+
30
+ crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
31
+
32
+ statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
33
+
34
+ statistics.should_not be_nil
35
+ statistics.should be_an_instance_of Hash
36
+
25
37
  end
26
38
 
27
- describe "crawl" do
28
- it "should crawl a site" do
29
-
30
- # temporary tests to run crawler - proper specs to follow.. honest
31
-
32
- crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
33
-
34
- statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
35
-
36
- ap statistics
37
-
38
- end
39
-
40
- it "should take a block" do
39
+ it "should take a block" do
41
40
 
42
- # temporary tests to run crawler - proper specs to follow.. honest
41
+ # temporary tests to run crawler - proper specs to follow.. honest
43
42
 
44
- crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
45
-
46
- statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
47
- ap content[:url]
48
- ap statistics[:average_length]
49
- end
50
-
51
- ap statistics
52
-
43
+ crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
44
+
45
+ statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
46
+ content[:url].should_not be_nil
47
+ statistics[:average_length].should_not be_nil
53
48
  end
54
- end
49
+
50
+ statistics.should_not be_nil
51
+ statistics.should be_an_instance_of Hash
52
+
53
+ end
55
54
  end
56
55
 
57
56
  end
@@ -3,253 +3,185 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
3
3
  describe Cobweb do
4
4
 
5
5
  before(:each) do
6
-
7
6
  @base_url = "http://www.baseurl.com/"
8
-
9
- @default_headers = {"Cache-Control" => "private, max-age=0",
10
- "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
11
- "Expires" => "-1",
12
- "Content-Type" => "text/html; charset=UTF-8",
13
- "Content-Encoding" => "gzip",
14
- "Transfer-Encoding" => "chunked",
15
- "Server" => "gws",
16
- "X-XSS-Protection" => "1; mode=block"}
17
-
18
7
  @cobweb = Cobweb.new :quiet => true, :cache => nil
19
- end
20
-
21
- describe "with mock" do
22
- before(:each) do
23
- @mock_http_client = mock(Net::HTTP)
24
- @mock_http_request = mock(Net::HTTPRequest)
25
- @mock_http_redirect_request = mock(Net::HTTPRequest)
26
- @mock_http_redirect_request2 = mock(Net::HTTPRequest)
27
-
28
- @mock_http_response = mock(Net::HTTPResponse)
29
- @mock_http_redirect_response = mock(Net::HTTPRedirection)
30
- @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
31
- @mock_http_get = mock(Net::HTTP::Get)
32
-
33
- Net::HTTP.stub!(:new).and_return(@mock_http_client)
34
- Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
35
- Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
36
- Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
37
-
38
- @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
39
- @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
40
- @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
41
- @mock_http_client.stub!(:read_timeout=).and_return(nil)
42
- @mock_http_client.stub!(:open_timeout=).and_return(nil)
43
- @mock_http_client.stub!(:start).and_return(@mock_http_response)
44
- @mock_http_client.stub!(:address).and_return("www.baseurl.com")
45
- @mock_http_client.stub!(:port).and_return("80 ")
46
-
47
- @mock_http_response.stub!(:code).and_return(200)
48
- @mock_http_response.stub!(:content_type).and_return("text/html")
49
- @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
50
- @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
51
- @mock_http_response.stub!(:content_length).and_return(1024)
52
- @mock_http_response.stub!(:body).and_return("asdf")
53
- @mock_http_response.stub!(:to_hash).and_return(@default_headers)
54
-
55
- @mock_http_redirect_response.stub!(:code).and_return(301)
56
- @mock_http_redirect_response.stub!(:content_type).and_return("text/html")
57
- @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
58
- @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
59
- @mock_http_redirect_response.stub!(:content_length).and_return(2048)
60
- @mock_http_redirect_response.stub!(:body).and_return("redirected body")
61
- @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
62
-
63
- @mock_http_redirect_response2.stub!(:code).and_return(301)
64
- @mock_http_redirect_response2.stub!(:content_type).and_return("text/html")
65
- @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
66
- @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
67
- @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
68
- @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
69
- @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
70
-
8
+ end
9
+
10
+ it "should generate a cobweb object" do
11
+ Cobweb.new.should be_an_instance_of Cobweb
12
+ end
13
+
14
+ it "should setup with defaults" do
15
+ cobweb = Cobweb.new
16
+
17
+ options = cobweb.instance_eval("@options")
18
+
19
+ options[:follow_redirects].should == true
20
+ options[:redirect_limit].should == 10
21
+ options[:processing_queue].should == CobwebProcessJob
22
+ options[:crawl_finished_queue].should == CobwebFinishedJob
23
+ options[:quiet].should == true
24
+ options[:debug].should == false
25
+ options[:cache].should == 300
26
+ options[:timeout].should == 10
27
+ options[:redis_options].should == {}
28
+ options[:internal_urls].should == []
29
+
30
+ end
31
+
32
+ describe "get" do
33
+ it "should return a hash with default values" do
34
+ @cobweb.get(@base_url).should be_an_instance_of Hash
71
35
  end
72
36
 
73
- it "should generate a cobweb object" do
74
- Cobweb.new.should be_an_instance_of Cobweb
37
+ it "should return a hash with default values without quiet option" do
38
+ @cobweb.get(@base_url).should be_an_instance_of Hash
75
39
  end
76
40
 
77
- it "should setup with defaults" do
78
- cobweb = Cobweb.new
79
-
80
- options = cobweb.instance_eval("@options")
81
- ap options
82
-
83
- options[:follow_redirects].should == true
84
- options[:redirect_limit].should == 10
85
- options[:processing_queue].should == CobwebProcessJob
86
- options[:crawl_finished_queue].should == CobwebFinishedJob
87
- options[:quiet].should == true
88
- options[:debug].should == false
89
- options[:cache].should == 300
90
- options[:timeout].should == 10
91
- options[:redis_options].should == {}
92
- options[:internal_urls].should == []
93
-
41
+ it "should raise exception if there is no url" do
42
+ lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
94
43
  end
95
44
 
96
- describe "get" do
97
- it "should return a hash with default values" do
98
- @cobweb.get(@base_url).should be_an_instance_of Hash
45
+ describe "content object" do
46
+ it "should return the url" do
47
+ @cobweb.get(@base_url)[:url].should == @base_url
48
+ end
49
+ it "should return correct content-type" do
50
+ @mock_http_response.stub!(:content_type).and_return("image/jpeg")
51
+ @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
52
+ end
53
+ it "should return correct status-code" do
54
+ @mock_http_response.stub!(:code).and_return(404)
55
+ @cobweb.get(@base_url)[:status_code].should == 404
56
+ end
57
+ it "should return correct status-code" do
58
+ @mock_http_response.stub!(:code).and_return(404)
59
+ @cobweb.get(@base_url)[:status_code].should == 404
60
+ end
61
+ it "should return correct character_set" do
62
+ @cobweb.get(@base_url)[:character_set].should == "UTF-8"
63
+ end
64
+ it "should return correct content_length" do
65
+ @cobweb.get(@base_url)[:length].should == 1024
66
+ end
67
+ it "should return correct content_body" do
68
+ @cobweb.get(@base_url)[:body].should == "asdf"
69
+ end
70
+ it "should return correct location" do
71
+ @cobweb.get(@base_url)[:location].should == nil
72
+
73
+ @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
74
+ @cobweb.get(@base_url)[:location].should == "http://google.com/"
75
+ end
76
+ it "should return correct headers" do
77
+ @cobweb.get(@base_url)[:headers].should == @default_headers
78
+ end
79
+ it "should return correct a hash of links" do
80
+ @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
81
+ end
82
+ it "should return the response time for the url" do
83
+ @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
84
+ end
85
+ end
86
+ describe "with redirect" do
87
+
88
+ before(:each) do
89
+ @base_url = "http://redirect-me.com/redirect.html"
90
+ @cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
91
+
92
+ @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
93
+ @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
94
+ @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
95
+
99
96
  end
100
97
 
101
- it "should return a hash with default values without quiet option" do
102
- @cobweb.get(@base_url).should be_an_instance_of Hash
98
+ it "should flow through redirect" #do
99
+
100
+ #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
101
+ #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
102
+ #
103
+ #content = @cobweb.get(@base_url)
104
+ #content.should be_an_instance_of HashHelper
105
+ #ap content
106
+ #content[:url].should == "http://redirect-me.com/redirect.html"
107
+ #content[:redirect_through].length.should == 2
108
+ #content[:mime_type].should == "text/html"
109
+ #content[:body].should == "asdf"
110
+
111
+ #end
112
+ it "should return the path followed" #do
113
+ #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
114
+ #
115
+ #content = @cobweb.get(@base_url)
116
+ #content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
117
+
118
+ #end
119
+ it "should not follow with redirect disabled" do
120
+ @cobweb = Cobweb.new(:follow_redirects => false, :cache => nil)
121
+ @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
122
+
123
+ content = @cobweb.get(@base_url)
124
+ content[:url].should == "http://redirect-me.com/redirect.html"
125
+ content[:redirect_through].should be_nil
126
+ content[:status_code].should == 301
127
+ content[:mime_type].should == "text/html"
128
+ content[:body].should == "redirected body"
129
+
103
130
  end
131
+ end
132
+
133
+ describe "with cache" do
104
134
 
105
- it "should raise exception if there is no url" do
106
- lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
135
+ before(:each) do
136
+ @cobweb = Cobweb.new :quiet => true, :cache => 200
107
137
  end
108
138
 
109
139
  describe "content object" do
110
140
  it "should return the url" do
111
141
  @cobweb.get(@base_url)[:url].should == @base_url
142
+ @cobweb.get(@base_url)[:url].should == @base_url
112
143
  end
113
144
  it "should return correct content-type" do
114
145
  @mock_http_response.stub!(:content_type).and_return("image/jpeg")
115
146
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
147
+ @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
116
148
  end
117
149
  it "should return correct status-code" do
118
150
  @mock_http_response.stub!(:code).and_return(404)
119
151
  @cobweb.get(@base_url)[:status_code].should == 404
152
+ @cobweb.get(@base_url)[:status_code].should == 404
120
153
  end
121
154
  it "should return correct status-code" do
122
155
  @mock_http_response.stub!(:code).and_return(404)
123
156
  @cobweb.get(@base_url)[:status_code].should == 404
157
+ @cobweb.get(@base_url)[:status_code].should == 404
124
158
  end
125
159
  it "should return correct character_set" do
126
160
  @cobweb.get(@base_url)[:character_set].should == "UTF-8"
161
+ @cobweb.get(@base_url)[:character_set].should == "UTF-8"
127
162
  end
128
163
  it "should return correct content_length" do
129
164
  @cobweb.get(@base_url)[:length].should == 1024
165
+ @cobweb.get(@base_url)[:length].should == 1024
130
166
  end
131
167
  it "should return correct content_body" do
132
168
  @cobweb.get(@base_url)[:body].should == "asdf"
133
- end
134
- it "should return correct location" do
135
- @cobweb.get(@base_url)[:location].should == nil
136
-
137
- @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
138
- @cobweb.get(@base_url)[:location].should == "http://google.com/"
169
+ @cobweb.get(@base_url)[:body].should == "asdf"
139
170
  end
140
171
  it "should return correct headers" do
141
172
  @cobweb.get(@base_url)[:headers].should == @default_headers
173
+ @cobweb.get(@base_url)[:headers].should == @default_headers
142
174
  end
143
175
  it "should return correct a hash of links" do
144
176
  @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
177
+ @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
145
178
  end
146
179
  it "should return the response time for the url" do
147
180
  @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
148
- end
149
- end
150
- describe "with redirect" do
151
-
152
- before(:each) do
153
- @base_url = "http://redirect-me.com/redirect.html"
154
- @cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
155
- end
156
-
157
- it "should flow through redirect" #do
158
-
159
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
160
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
161
- #
162
- #content = @cobweb.get(@base_url)
163
- #content.should be_an_instance_of HashHelper
164
- #ap content
165
- #content[:url].should == "http://redirect-me.com/redirect.html"
166
- #content[:redirect_through].length.should == 2
167
- #content[:mime_type].should == "text/html"
168
- #content[:body].should == "asdf"
169
-
170
- #end
171
- it "should return the path followed" #do
172
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
173
- #
174
- #content = @cobweb.get(@base_url)
175
- #content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
176
-
177
- #end
178
- it "should not follow with redirect disabled" do
179
- @cobweb = Cobweb.new(:follow_redirects => false, :cache => nil)
180
- @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
181
-
182
- content = @cobweb.get(@base_url)
183
- content[:url].should == "http://redirect-me.com/redirect.html"
184
- content[:redirect_through].should be_nil
185
- content[:status_code].should == 301
186
- content[:mime_type].should == "text/html"
187
- content[:body].should == "redirected body"
188
-
181
+ @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
189
182
  end
190
183
  end
191
184
 
192
- describe "with cache" do
193
-
194
- before(:each) do
195
- @cobweb = Cobweb.new :quiet => true, :cache => 200
196
- end
197
-
198
- describe "content object" do
199
- it "should return the url" do
200
- @cobweb.get(@base_url)[:url].should == @base_url
201
- @cobweb.get(@base_url)[:url].should == @base_url
202
- end
203
- it "should return correct content-type" do
204
- @mock_http_response.stub!(:content_type).and_return("image/jpeg")
205
- @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
206
- @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
207
- end
208
- it "should return correct status-code" do
209
- @mock_http_response.stub!(:code).and_return(404)
210
- @cobweb.get(@base_url)[:status_code].should == 404
211
- @cobweb.get(@base_url)[:status_code].should == 404
212
- end
213
- it "should return correct status-code" do
214
- @mock_http_response.stub!(:code).and_return(404)
215
- @cobweb.get(@base_url)[:status_code].should == 404
216
- @cobweb.get(@base_url)[:status_code].should == 404
217
- end
218
- it "should return correct character_set" do
219
- @cobweb.get(@base_url)[:character_set].should == "UTF-8"
220
- @cobweb.get(@base_url)[:character_set].should == "UTF-8"
221
- end
222
- it "should return correct content_length" do
223
- @cobweb.get(@base_url)[:length].should == 1024
224
- @cobweb.get(@base_url)[:length].should == 1024
225
- end
226
- it "should return correct content_body" do
227
- @cobweb.get(@base_url)[:body].should == "asdf"
228
- @cobweb.get(@base_url)[:body].should == "asdf"
229
- end
230
- it "should return correct location" do
231
- @cobweb.get(@base_url)[:location].should == nil
232
- @cobweb.get(@base_url)[:location].should == nil
233
-
234
- @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
235
- @cobweb.get(@base_url)[:location].should == "http://google.com/"
236
- @cobweb.get(@base_url)[:location].should == "http://google.com/"
237
- end
238
- it "should return correct headers" do
239
- @cobweb.get(@base_url)[:headers].should == @default_headers
240
- @cobweb.get(@base_url)[:headers].should == @default_headers
241
- end
242
- it "should return correct a hash of links" do
243
- @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
244
- @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
245
- end
246
- it "should return the response time for the url" do
247
- @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
248
- @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
249
- end
250
- end
251
-
252
- end
253
- end
185
+ end
254
186
  end
255
187
  end
@@ -3,102 +3,139 @@ require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parse
3
3
 
4
4
  describe ContentLinkParser do
5
5
 
6
- before(:each) do
7
- @base_url = "http://www.baseurl.com/"
8
- @content = File.read(File.dirname(__FILE__) + "/../samples/sample_html_links.html")
9
- @content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
10
- end
11
-
12
- it "should load the sample document" do
13
- @content.should_not be_nil
14
- @content.should_not be_empty
15
- end
16
-
17
- it "should create a content link parser" do
18
- @content_parser.should_not be_nil
19
- @content_parser.should be_an_instance_of ContentLinkParser
20
- end
21
-
22
- describe "using default tags" do
23
- describe "returning general links" do
24
- it "should return some links from the sample data" do
25
- links = @content_parser.links
26
- links.should_not be_nil
27
- links.should_not be_empty
28
- end
29
- it "should return the correct links" do
30
- links = @content_parser.links
31
- links.length.should == 4
32
- end
6
+ describe "Sample Links Document" do
7
+ before(:each) do
8
+ @base_url = "http://www.baseurl.com/"
9
+ @content = File.read(File.dirname(__FILE__) + "/../samples/sample_html_links.html")
10
+ @content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
33
11
  end
34
- describe "returning image links" do
35
- it "should return some image links from the sample data" do
36
- links = @content_parser.images
37
- links.should_not be_nil
38
- links.should_not be_empty
12
+
13
+ it "should load the sample document" do
14
+ @content.should_not be_nil
15
+ @content.should_not be_empty
16
+ end
17
+
18
+ it "should create a content link parser" do
19
+ @content_parser.should_not be_nil
20
+ @content_parser.should be_an_instance_of ContentLinkParser
21
+ end
22
+
23
+ describe "using default tags" do
24
+ describe "returning general links" do
25
+ it "should return some links from the sample data" do
26
+ links = @content_parser.links
27
+ links.should_not be_nil
28
+ links.should_not be_empty
29
+ end
30
+ it "should return the correct links" do
31
+ links = @content_parser.links
32
+ links.length.should == 4
33
+ end
39
34
  end
40
- it "should return the correct links" do
41
- links = @content_parser.images
42
- links.length.should == 1
35
+ describe "returning image links" do
36
+ it "should return some image links from the sample data" do
37
+ links = @content_parser.images
38
+ links.should_not be_nil
39
+ links.should_not be_empty
40
+ end
41
+ it "should return the correct links" do
42
+ links = @content_parser.images
43
+ links.length.should == 1
44
+ end
43
45
  end
44
- end
45
- describe "returning related links" do
46
- it "should return some related links from the sample data" do
47
- links = @content_parser.related
48
- links.should_not be_nil
49
- links.should_not be_empty
46
+ describe "returning related links" do
47
+ it "should return some related links from the sample data" do
48
+ links = @content_parser.related
49
+ links.should_not be_nil
50
+ links.should_not be_empty
51
+ end
52
+ it "should return the correct links" do
53
+ links = @content_parser.related
54
+ links.length.should == 2
55
+ end
50
56
  end
51
- it "should return the correct links" do
52
- links = @content_parser.related
53
- links.length.should == 2
57
+ describe "returning script links" do
58
+ it "should return some script links from the sample data" do
59
+ links = @content_parser.scripts
60
+ links.should_not be_nil
61
+ links.should_not be_empty
62
+ end
63
+ it "should return the correct links" do
64
+ links = @content_parser.scripts
65
+ links.length.should == 1
66
+ end
54
67
  end
55
- end
56
- describe "returning script links" do
57
- it "should return some script links from the sample data" do
58
- links = @content_parser.scripts
59
- links.should_not be_nil
60
- links.should_not be_empty
68
+ describe "returning style links" do
69
+ it "should return some style links from the sample data" do
70
+ links = @content_parser.styles
71
+ links.should_not be_nil
72
+ links.should_not be_empty
73
+ end
74
+ it "should return the correct links" do
75
+ links = @content_parser.styles
76
+ links.length.should == 3
77
+ end
61
78
  end
62
- it "should return the correct links" do
63
- links = @content_parser.scripts
64
- links.length.should == 1
79
+ describe "returning unknown link type" do
80
+ it "should return an empty array" do
81
+ links = @content_parser.asdfasdfsadf
82
+ links.should_not be_nil
83
+ links.should be_an_instance_of Array
84
+ end
65
85
  end
66
86
  end
67
- describe "returning style links" do
68
- it "should return some style links from the sample data" do
69
- links = @content_parser.styles
70
- links.should_not be_nil
71
- links.should_not be_empty
72
- end
73
- it "should return the correct links" do
74
- links = @content_parser.styles
75
- links.length.should == 3
87
+
88
+ describe "returning all link data" do
89
+ it "should return a hash with all link data" do
90
+ link_data = @content_parser.link_data
91
+ link_data.should_not be_nil
92
+ link_data.should be_an_instance_of Hash
93
+
94
+ link_data.keys.length.should == 5
95
+ link_data[:links].length.should == 4
76
96
  end
77
97
  end
78
- describe "returning unknown link type" do
79
- it "should return an empty array" do
80
- links = @content_parser.asdfasdfsadf
81
- links.should_not be_nil
82
- links.should be_an_instance_of Array
98
+
99
+ describe "ignoring default tags" do
100
+ it "should not return any links" do
101
+ parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
102
+ parser.links.should be_empty
83
103
  end
84
104
  end
85
105
  end
86
106
 
87
- describe "returning all link data" do
88
- it "should return a hash with all link data" do
89
- link_data = @content_parser.link_data
90
- link_data.should_not be_nil
91
- link_data.should be_an_instance_of Hash
92
-
93
- link_data.keys.length.should == 5
94
- link_data[:links].length.should == 4
107
+ describe "simple style based documents" do
108
+ def create_content(url)
109
+ <<-eos
110
+ <html>
111
+ <head>
112
+ <STYLE TYPE="text/css" MEDIA="screen, projection">
113
+ @import url(#{url});
114
+ </STYLE>
115
+ </head>
116
+ </html>
117
+ eos
95
118
  end
96
- end
97
-
98
- describe "ignoring default tags" do
99
- it "should not return any links" do
100
- parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
101
- parser.links.should be_empty
119
+ before :each do
120
+ @base_url = "http://www.baseurl.com/path"
121
+ end
122
+ it "should have the right link for a single quoted style" do
123
+ @content_parser = ContentLinkParser.new("http://sample-links.com/", create_content("'/new'"))
124
+ styles = @content_parser.styles
125
+ styles.length.should==1
126
+ styles[0].should=="/new"
127
+ end
128
+ it "should have the right link for a double quoted style" do
129
+ @content_parser = ContentLinkParser.new("http://sample-links.com/", create_content('"/new"'))
130
+ styles = @content_parser.styles
131
+ styles.length.should==1
132
+ styles[0].should=="/new"
133
+ end
134
+ it "should just leave links with differing quotes alone" do
135
+ @content_parser = ContentLinkParser.new("http://sample-links.com/", create_content('"new\''))
136
+ styles = @content_parser.styles
137
+ styles.length.should==1
138
+ styles[0].should=="\"new'"
102
139
  end
103
140
  end
104
141
  end
@@ -8,6 +8,67 @@ RSpec.configure do |config|
8
8
  #redis_mock.stub(:new).and_return(MockRedis.new)
9
9
 
10
10
  Redis.new.flushdb
11
+
12
+
13
+ @default_headers = {"Cache-Control" => "private, max-age=0",
14
+ "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
15
+ "Expires" => "-1",
16
+ "Content-Type" => "text/html; charset=UTF-8",
17
+ "Content-Encoding" => "",
18
+ "Transfer-Encoding" => "chunked",
19
+ "Server" => "gws",
20
+ "X-XSS-Protection" => "1; mode=block"}
21
+
22
+ @mock_http_client = mock(Net::HTTP)
23
+ @mock_http_request = mock(Net::HTTPRequest)
24
+ @mock_http_redirect_request = mock(Net::HTTPRequest)
25
+ @mock_http_redirect_request2 = mock(Net::HTTPRequest)
26
+
27
+ @mock_http_response = mock(Net::HTTPResponse)
28
+ @mock_http_redirect_response = mock(Net::HTTPRedirection)
29
+ @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
30
+ @mock_http_get = mock(Net::HTTP::Get)
31
+
32
+ Net::HTTP.stub!(:new).and_return(@mock_http_client)
33
+ Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
34
+ Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
35
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
36
+
37
+ @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
38
+ @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
39
+ @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
40
+ @mock_http_client.stub!(:read_timeout=).and_return(nil)
41
+ @mock_http_client.stub!(:open_timeout=).and_return(nil)
42
+ @mock_http_client.stub!(:start).and_return(@mock_http_response)
43
+ @mock_http_client.stub!(:address).and_return("www.baseurl.com")
44
+ @mock_http_client.stub!(:port).and_return("80 ")
45
+
46
+ @mock_http_response.stub!(:code).and_return(200)
47
+ @mock_http_response.stub!(:content_type).and_return("text/html")
48
+ @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
49
+ @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
50
+ @mock_http_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
51
+ @mock_http_response.stub!(:content_length).and_return(1024)
52
+ @mock_http_response.stub!(:body).and_return("asdf")
53
+ @mock_http_response.stub!(:to_hash).and_return(@default_headers)
54
+
55
+ @mock_http_redirect_response.stub!(:code).and_return(301)
56
+ @mock_http_redirect_response.stub!(:content_type).and_return("text/html")
57
+ @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
58
+ @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
59
+ @mock_http_redirect_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
60
+ @mock_http_redirect_response.stub!(:content_length).and_return(2048)
61
+ @mock_http_redirect_response.stub!(:body).and_return("redirected body")
62
+ @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
63
+
64
+ @mock_http_redirect_response2.stub!(:code).and_return(301)
65
+ @mock_http_redirect_response2.stub!(:content_type).and_return("text/html")
66
+ @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
67
+ @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
68
+ @mock_http_redirect_response2.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
69
+ @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
70
+ @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
71
+ @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
11
72
  }
12
73
 
13
74
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.32
4
+ version: 0.0.33
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-19 00:00:00.000000000 Z
12
+ date: 2012-04-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70185084752540 !ruby/object:Gem::Requirement
16
+ requirement: &70153227362400 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70185084752540
24
+ version_requirements: *70153227362400
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70185084752120 !ruby/object:Gem::Requirement
27
+ requirement: &70153227361980 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70185084752120
35
+ version_requirements: *70153227361980
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70185084751700 !ruby/object:Gem::Requirement
38
+ requirement: &70153227361560 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70185084751700
46
+ version_requirements: *70153227361560
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70185084751280 !ruby/object:Gem::Requirement
49
+ requirement: &70153227361140 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70185084751280
57
+ version_requirements: *70153227361140
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70185084750860 !ruby/object:Gem::Requirement
60
+ requirement: &70153227360720 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70185084750860
68
+ version_requirements: *70153227360720
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70185084750440 !ruby/object:Gem::Requirement
71
+ requirement: &70153227360300 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70185084750440
79
+ version_requirements: *70153227360300
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70185084750020 !ruby/object:Gem::Requirement
82
+ requirement: &70153227359880 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70185084750020
90
+ version_requirements: *70153227359880
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70185084749600 !ruby/object:Gem::Requirement
93
+ requirement: &70153227359460 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70185084749600
101
+ version_requirements: *70153227359460
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70185084765540 !ruby/object:Gem::Requirement
104
+ requirement: &70153227359040 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,7 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70185084765540
112
+ version_requirements: *70153227359040
113
113
  description: Web Crawler that uses resque background job engine to allow you to cluster
114
114
  your crawl.
115
115
  email: stewart@rockwellcottage.com