cobweb 1.0.24 → 1.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
4
- data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
3
+ metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
4
+ data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
5
5
  SHA512:
6
- metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
7
- data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8
6
+ metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
7
+ data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.24
1
+ h1. Cobweb v1.0.25
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -146,7 +146,7 @@ class Cobweb
146
146
  content = {:base_url => url}
147
147
 
148
148
  # check if it has already been cached
149
- if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
149
+ if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
150
150
  if @options[:cache_type] == :crawl_based
151
151
  puts "Cache hit in crawl for #{url}" unless @options[:quiet]
152
152
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
@@ -250,7 +250,10 @@ class Cobweb
250
250
  end
251
251
  end
252
252
  rescue RedirectError => e
253
- raise e if @options[:raise_exceptions]
253
+ if @options[:raise_exceptions]
254
+ puts "Re-Raising error #{e.message} on #{uri.to_s}"
255
+ raise e
256
+ end
254
257
  puts "ERROR RedirectError: #{e.message}"
255
258
 
256
259
  ## generate a blank content
@@ -327,7 +330,7 @@ class Cobweb
327
330
  content = {:base_url => url}
328
331
 
329
332
  # check if it has already been cached
330
- if redis.get("head-#{unique_id}") and @options[:cache]
333
+ if @options[:cache] && redis.get("head-#{unique_id}")
331
334
  puts "Cache hit for #{url}" unless @options[:quiet]
332
335
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
333
336
  else
@@ -453,9 +456,9 @@ class Cobweb
453
456
  pattern = pattern.gsub(".", "\\.")
454
457
  pattern = pattern.gsub("?", "\\?")
455
458
  pattern = pattern.gsub("+", "\\\\+")
456
- pattern = pattern.gsub("*", ".*?")
457
- if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
458
- pattern = pattern.gsub("https", "https?")
459
+ pattern = pattern.gsub("*", ".*?")
460
+ if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
461
+ pattern = pattern.gsub("http:", "https?:")
459
462
  end
460
463
  pattern
461
464
  end
@@ -27,7 +27,8 @@ class CobwebCrawler
27
27
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
28
28
 
29
29
  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
30
-
30
+
31
+ @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
31
32
  @debug = @options[:debug]
32
33
 
33
34
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -100,16 +101,19 @@ class CobwebCrawler
100
101
 
101
102
  document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
102
103
 
104
+
103
105
  # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
104
106
  cobweb_links = CobwebLinks.new(@options)
105
107
 
106
108
  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
107
109
 
108
110
  # if the site has the same content for http and https then normalize to http
111
+
109
112
  if @options[:treat_https_as_http]
110
113
  internal_links.map!{|link| link.gsub(/^https/, "http")}
111
114
  end
112
115
 
116
+
113
117
  # reject the link if we've crawled it or queued it
114
118
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
115
119
  internal_links.reject!{|link| @redis.sismember("queued", link)}
@@ -12,9 +12,9 @@ class CobwebLinks
12
12
  @options[:external_urls] = [] unless @options.has_key? :external_urls
13
13
  @options[:debug] = false unless @options.has_key? :debug
14
14
 
15
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
16
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
17
-
15
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
16
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
17
+
18
18
  end
19
19
 
20
20
  def allowed?(link)
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.24"
6
+ "1.0.25"
7
7
  end
8
8
 
9
9
  end
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
4
4
  require 'sidekiq'
5
5
  else
6
6
  SIDEKIQ_INSTALLED = false
7
- puts "sidekiq gem not installed, skipping crawl_worker specs"
7
+ puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
8
8
  end
9
9
  if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
10
10
  RESQUE_INSTALLED = true
11
11
  require 'resque'
12
12
  else
13
13
  RESQUE_INSTALLED = false
14
- puts "resque gem not installed, skipping crawl_job specs"
14
+ puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
15
15
  end
16
16
 
17
17
  module Sidekiq
@@ -53,6 +53,39 @@ describe CobwebCrawler do
53
53
 
54
54
  end
55
55
 
56
+ context "internal_links" do
57
+ it "should match internal links without being explicitly set" do
58
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
59
+ crawler.crawl(@base_url)
60
+ queued_links = @redis_mock_object.smembers("queued")
61
+ queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
62
+ queued_links.should include("http://localhost:3532/secure")
63
+ end
64
+ context "with https" do
65
+ it "should match https by default" do
66
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
67
+ crawler.crawl(@base_url)
68
+ queued_links = @redis_mock_object.smembers("queued")
69
+ queued_links.should_not include("https://localhost:3532/secure")
70
+ queued_links.should include("http://localhost:3532/secure")
71
+ end
72
+ it "should not define https as different if treat_https_as_http is true" do
73
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
74
+ crawler.crawl(@base_url)
75
+ queued_links = @redis_mock_object.smembers("queued")
76
+ queued_links.should_not include("https://localhost:3532/secure")
77
+ queued_links.should include("http://localhost:3532/secure")
78
+ end
79
+ it "should define https as different if treat_https_as_http is false" do
80
+ crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
81
+ crawler.crawl(@base_url)
82
+ queued_links = @redis_mock_object.smembers("queued")
83
+ queued_links.should_not include("https://localhost:3532/secure")
84
+ queued_links.should_not include("http://localhost:3532/secure")
85
+ end
86
+ end
87
+ end
88
+
56
89
  context "storing inbound links" do
57
90
 
58
91
  before(:each) do
@@ -61,19 +61,19 @@ describe Cobweb do
61
61
 
62
62
  context "with https ignored" do
63
63
  it "should ignore https" do
64
- result = Cobweb.escape_pattern_for_regex("https://asdf.com")
64
+ result = Cobweb.escape_pattern_for_regex("http://asdf.com")
65
65
  result.should eql "https?://asdf\\.com"
66
66
  end
67
67
  it "should ignore https" do
68
- result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
68
+ result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
69
69
  result.should eql "https?://asdf\\.com"
70
70
  end
71
71
  end
72
72
 
73
73
  context "without https ignored" do
74
74
  it "should ignore https" do
75
- result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
76
- result.should eql "https://asdf\\.com"
75
+ result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
76
+ result.should eql "http://asdf\\.com"
77
77
  end
78
78
  end
79
79
 
@@ -711,6 +711,8 @@
711
711
  <a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
712
712
  <a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
713
713
  <a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
714
+
715
+ <a href="https://localhost:3532/secure">HTTPS Link</a>
714
716
 
715
717
  <a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
716
718
  <a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
@@ -37,10 +37,11 @@ RSpec.configure do |config|
37
37
 
38
38
  config.before(:each) {
39
39
 
40
- #redis_mock = double("redis")
41
- #redis_mock.stub(:new).and_return(@redis_mock_object)
40
+ @redis_mock_object = MockRedis.new
41
+ Redis.stub(:new).and_return(@redis_mock_object)
42
+ Redis::Namespace.stub(:new).and_return(@redis_mock_object)
42
43
 
43
- #redis_mock.flushdb
44
+ @redis_mock_object.flushdb
44
45
 
45
46
  }
46
47
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.24
4
+ version: 1.0.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-22 00:00:00.000000000 Z
11
+ date: 2015-01-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis