cobweb 1.0.24 → 1.0.25

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
4
- data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
3
+ metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
4
+ data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
5
5
  SHA512:
6
- metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
7
- data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8
6
+ metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
7
+ data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.24
1
+ h1. Cobweb v1.0.25
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -146,7 +146,7 @@ class Cobweb
146
146
  content = {:base_url => url}
147
147
 
148
148
  # check if it has already been cached
149
- if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
149
+ if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
150
150
  if @options[:cache_type] == :crawl_based
151
151
  puts "Cache hit in crawl for #{url}" unless @options[:quiet]
152
152
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
@@ -250,7 +250,10 @@ class Cobweb
250
250
  end
251
251
  end
252
252
  rescue RedirectError => e
253
- raise e if @options[:raise_exceptions]
253
+ if @options[:raise_exceptions]
254
+ puts "Re-Raising error #{e.message} on #{uri.to_s}"
255
+ raise e
256
+ end
254
257
  puts "ERROR RedirectError: #{e.message}"
255
258
 
256
259
  ## generate a blank content
@@ -327,7 +330,7 @@ class Cobweb
327
330
  content = {:base_url => url}
328
331
 
329
332
  # check if it has already been cached
330
- if redis.get("head-#{unique_id}") and @options[:cache]
333
+ if @options[:cache] && redis.get("head-#{unique_id}")
331
334
  puts "Cache hit for #{url}" unless @options[:quiet]
332
335
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
333
336
  else
@@ -453,9 +456,9 @@ class Cobweb
453
456
  pattern = pattern.gsub(".", "\\.")
454
457
  pattern = pattern.gsub("?", "\\?")
455
458
  pattern = pattern.gsub("+", "\\\\+")
456
- pattern = pattern.gsub("*", ".*?")
457
- if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
458
- pattern = pattern.gsub("https", "https?")
459
+ pattern = pattern.gsub("*", ".*?")
460
+ if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
461
+ pattern = pattern.gsub("http:", "https?:")
459
462
  end
460
463
  pattern
461
464
  end
@@ -27,7 +27,8 @@ class CobwebCrawler
27
27
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
28
28
 
29
29
  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
30
-
30
+
31
+ @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
31
32
  @debug = @options[:debug]
32
33
 
33
34
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -100,16 +101,19 @@ class CobwebCrawler
100
101
 
101
102
  document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
102
103
 
104
+
103
105
  # select the link if it's internal (eliminate external before expensive lookups in queued and crawled)
104
106
  cobweb_links = CobwebLinks.new(@options)
105
107
 
106
108
  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
107
109
 
108
110
  # if the site has the same content for http and https then normalize to http
111
+
109
112
  if @options[:treat_https_as_http]
110
113
  internal_links.map!{|link| link.gsub(/^https/, "http")}
111
114
  end
112
115
 
116
+
113
117
  # reject the link if we've crawled it or queued it
114
118
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
115
119
  internal_links.reject!{|link| @redis.sismember("queued", link)}
@@ -12,9 +12,9 @@ class CobwebLinks
12
12
  @options[:external_urls] = [] unless @options.has_key? :external_urls
13
13
  @options[:debug] = false unless @options.has_key? :debug
14
14
 
15
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
16
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
17
-
15
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
16
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
17
+
18
18
  end
19
19
 
20
20
  def allowed?(link)
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.24"
6
+ "1.0.25"
7
7
  end
8
8
 
9
9
  end
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
4
4
  require 'sidekiq'
5
5
  else
6
6
  SIDEKIQ_INSTALLED = false
7
- puts "sidekiq gem not installed, skipping crawl_worker specs"
7
+ puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
8
8
  end
9
9
  if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
10
10
  RESQUE_INSTALLED = true
11
11
  require 'resque'
12
12
  else
13
13
  RESQUE_INSTALLED = false
14
- puts "resque gem not installed, skipping crawl_job specs"
14
+ puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
15
15
  end
16
16
 
17
17
  module Sidekiq
@@ -53,6 +53,39 @@ describe CobwebCrawler do
53
53
 
54
54
  end
55
55
 
56
+ context "internal_links" do
57
+ it "should match internal links without being explicitly set" do
58
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
59
+ crawler.crawl(@base_url)
60
+ queued_links = @redis_mock_object.smembers("queued")
61
+ queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
62
+ queued_links.should include("http://localhost:3532/secure")
63
+ end
64
+ context "with https" do
65
+ it "should match https by default" do
66
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
67
+ crawler.crawl(@base_url)
68
+ queued_links = @redis_mock_object.smembers("queued")
69
+ queued_links.should_not include("https://localhost:3532/secure")
70
+ queued_links.should include("http://localhost:3532/secure")
71
+ end
72
+ it "should not define https as different if treat_https_as_http is true" do
73
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
74
+ crawler.crawl(@base_url)
75
+ queued_links = @redis_mock_object.smembers("queued")
76
+ queued_links.should_not include("https://localhost:3532/secure")
77
+ queued_links.should include("http://localhost:3532/secure")
78
+ end
79
+ it "should define https as different if treat_https_as_http is false" do
80
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
81
+ crawler.crawl(@base_url)
82
+ queued_links = @redis_mock_object.smembers("queued")
83
+ queued_links.should_not include("https://localhost:3532/secure")
84
+ queued_links.should_not include("http://localhost:3532/secure")
85
+ end
86
+ end
87
+ end
88
+
56
89
  context "storing inbound links" do
57
90
 
58
91
  before(:each) do
@@ -61,19 +61,19 @@ describe Cobweb do
61
61
 
62
62
  context "with https ignored" do
63
63
  it "should ignore https" do
64
- result = Cobweb.escape_pattern_for_regex("https://asdf.com")
64
+ result = Cobweb.escape_pattern_for_regex("http://asdf.com")
65
65
  result.should eql "https?://asdf\\.com"
66
66
  end
67
67
  it "should ignore https" do
68
- result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
68
+ result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
69
69
  result.should eql "https?://asdf\\.com"
70
70
  end
71
71
  end
72
72
 
73
73
  context "without https ignored" do
74
74
  it "should ignore https" do
75
- result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
76
- result.should eql "https://asdf\\.com"
75
+ result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
76
+ result.should eql "http://asdf\\.com"
77
77
  end
78
78
  end
79
79
 
@@ -711,6 +711,8 @@
711
711
  <a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
712
712
  <a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
713
713
  <a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
714
+
715
+ <a href="https://localhost:3532/secure">HTTPS Link</a>
714
716
 
715
717
  <a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
716
718
  <a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
@@ -37,10 +37,11 @@ RSpec.configure do |config|
37
37
 
38
38
  config.before(:each) {
39
39
 
40
- #redis_mock = double("redis")
41
- #redis_mock.stub(:new).and_return(@redis_mock_object)
40
+ @redis_mock_object = MockRedis.new
41
+ Redis.stub(:new).and_return(@redis_mock_object)
42
+ Redis::Namespace.stub(:new).and_return(@redis_mock_object)
42
43
 
43
- #redis_mock.flushdb
44
+ @redis_mock_object.flushdb
44
45
 
45
46
  }
46
47
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.24
4
+ version: 1.0.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-22 00:00:00.000000000 Z
11
+ date: 2015-01-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis