cobweb 1.0.24 → 1.0.25
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +1 -1
- data/lib/cobweb.rb +9 -6
- data/lib/cobweb_crawler.rb +5 -1
- data/lib/cobweb_links.rb +3 -3
- data/lib/cobweb_version.rb +1 -1
- data/lib/sidekiq/cobweb_helper.rb +2 -2
- data/spec/cobweb/cobweb_crawler_spec.rb +33 -0
- data/spec/cobweb/cobweb_spec.rb +4 -4
- data/spec/samples/sample_site/index.html +2 -0
- data/spec/spec_helper.rb +4 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
|
4
|
+
data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
|
7
|
+
data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
|
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -146,7 +146,7 @@ class Cobweb
|
|
146
146
|
content = {:base_url => url}
|
147
147
|
|
148
148
|
# check if it has already been cached
|
149
|
-
if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
149
|
+
if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
150
150
|
if @options[:cache_type] == :crawl_based
|
151
151
|
puts "Cache hit in crawl for #{url}" unless @options[:quiet]
|
152
152
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
@@ -250,7 +250,10 @@ class Cobweb
|
|
250
250
|
end
|
251
251
|
end
|
252
252
|
rescue RedirectError => e
|
253
|
-
|
253
|
+
if @options[:raise_exceptions]
|
254
|
+
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
+
raise e
|
256
|
+
end
|
254
257
|
puts "ERROR RedirectError: #{e.message}"
|
255
258
|
|
256
259
|
## generate a blank content
|
@@ -327,7 +330,7 @@ class Cobweb
|
|
327
330
|
content = {:base_url => url}
|
328
331
|
|
329
332
|
# check if it has already been cached
|
330
|
-
if redis.get("head-#{unique_id}")
|
333
|
+
if @options[:cache] && redis.get("head-#{unique_id}")
|
331
334
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
332
335
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
|
333
336
|
else
|
@@ -453,9 +456,9 @@ class Cobweb
|
|
453
456
|
pattern = pattern.gsub(".", "\\.")
|
454
457
|
pattern = pattern.gsub("?", "\\?")
|
455
458
|
pattern = pattern.gsub("+", "\\\\+")
|
456
|
-
pattern = pattern.gsub("*", ".*?")
|
457
|
-
if !options.has_key?(:treat_https_as_http)
|
458
|
-
pattern = pattern.gsub("
|
459
|
+
pattern = pattern.gsub("*", ".*?")
|
460
|
+
if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
|
461
|
+
pattern = pattern.gsub("http:", "https?:")
|
459
462
|
end
|
460
463
|
pattern
|
461
464
|
end
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -27,7 +27,8 @@ class CobwebCrawler
|
|
27
27
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
28
28
|
|
29
29
|
@options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
|
30
|
-
|
30
|
+
|
31
|
+
@options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
|
31
32
|
@debug = @options[:debug]
|
32
33
|
|
33
34
|
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
@@ -100,16 +101,19 @@ class CobwebCrawler
|
|
100
101
|
|
101
102
|
document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
|
102
103
|
|
104
|
+
|
103
105
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
104
106
|
cobweb_links = CobwebLinks.new(@options)
|
105
107
|
|
106
108
|
internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
|
107
109
|
|
108
110
|
# if the site has the same content for http and https then normalize to http
|
111
|
+
|
109
112
|
if @options[:treat_https_as_http]
|
110
113
|
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
111
114
|
end
|
112
115
|
|
116
|
+
|
113
117
|
# reject the link if we've crawled it or queued it
|
114
118
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
115
119
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
data/lib/cobweb_links.rb
CHANGED
@@ -12,9 +12,9 @@ class CobwebLinks
|
|
12
12
|
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
13
13
|
@options[:debug] = false unless @options.has_key? :debug
|
14
14
|
|
15
|
-
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
16
|
-
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
17
|
-
|
15
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
|
16
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
|
17
|
+
|
18
18
|
end
|
19
19
|
|
20
20
|
def allowed?(link)
|
data/lib/cobweb_version.rb
CHANGED
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
|
|
4
4
|
require 'sidekiq'
|
5
5
|
else
|
6
6
|
SIDEKIQ_INSTALLED = false
|
7
|
-
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
7
|
+
puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
|
8
8
|
end
|
9
9
|
if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
|
10
10
|
RESQUE_INSTALLED = true
|
11
11
|
require 'resque'
|
12
12
|
else
|
13
13
|
RESQUE_INSTALLED = false
|
14
|
-
puts "resque gem not installed, skipping crawl_job specs"
|
14
|
+
puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
|
15
15
|
end
|
16
16
|
|
17
17
|
module Sidekiq
|
@@ -53,6 +53,39 @@ describe CobwebCrawler do
|
|
53
53
|
|
54
54
|
end
|
55
55
|
|
56
|
+
context "internal_links" do
|
57
|
+
it "should match internal links without being explicitly set" do
|
58
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
59
|
+
crawler.crawl(@base_url)
|
60
|
+
queued_links = @redis_mock_object.smembers("queued")
|
61
|
+
queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
|
62
|
+
queued_links.should include("http://localhost:3532/secure")
|
63
|
+
end
|
64
|
+
context "with https" do
|
65
|
+
it "should match https by default" do
|
66
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
67
|
+
crawler.crawl(@base_url)
|
68
|
+
queued_links = @redis_mock_object.smembers("queued")
|
69
|
+
queued_links.should_not include("https://localhost:3532/secure")
|
70
|
+
queued_links.should include("http://localhost:3532/secure")
|
71
|
+
end
|
72
|
+
it "should not define https as different if treat_https_as_http is true" do
|
73
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
|
74
|
+
crawler.crawl(@base_url)
|
75
|
+
queued_links = @redis_mock_object.smembers("queued")
|
76
|
+
queued_links.should_not include("https://localhost:3532/secure")
|
77
|
+
queued_links.should include("http://localhost:3532/secure")
|
78
|
+
end
|
79
|
+
it "should define https as different if treat_https_as_http is false" do
|
80
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
|
81
|
+
crawler.crawl(@base_url)
|
82
|
+
queued_links = @redis_mock_object.smembers("queued")
|
83
|
+
queued_links.should_not include("https://localhost:3532/secure")
|
84
|
+
queued_links.should_not include("http://localhost:3532/secure")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
56
89
|
context "storing inbound links" do
|
57
90
|
|
58
91
|
before(:each) do
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -61,19 +61,19 @@ describe Cobweb do
|
|
61
61
|
|
62
62
|
context "with https ignored" do
|
63
63
|
it "should ignore https" do
|
64
|
-
result = Cobweb.escape_pattern_for_regex("
|
64
|
+
result = Cobweb.escape_pattern_for_regex("http://asdf.com")
|
65
65
|
result.should eql "https?://asdf\\.com"
|
66
66
|
end
|
67
67
|
it "should ignore https" do
|
68
|
-
result = Cobweb.escape_pattern_for_regex("
|
68
|
+
result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
|
69
69
|
result.should eql "https?://asdf\\.com"
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
73
|
context "without https ignored" do
|
74
74
|
it "should ignore https" do
|
75
|
-
result = Cobweb.escape_pattern_for_regex("
|
76
|
-
result.should eql "
|
75
|
+
result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
|
76
|
+
result.should eql "http://asdf\\.com"
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
@@ -711,6 +711,8 @@
|
|
711
711
|
<a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
|
712
712
|
<a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
|
713
713
|
<a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
|
714
|
+
|
715
|
+
<a href="https://localhost:3532/secure">HTTPS Link</a>
|
714
716
|
|
715
717
|
<a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
|
716
718
|
<a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
|
data/spec/spec_helper.rb
CHANGED
@@ -37,10 +37,11 @@ RSpec.configure do |config|
|
|
37
37
|
|
38
38
|
config.before(:each) {
|
39
39
|
|
40
|
-
|
41
|
-
|
40
|
+
@redis_mock_object = MockRedis.new
|
41
|
+
Redis.stub(:new).and_return(@redis_mock_object)
|
42
|
+
Redis::Namespace.stub(:new).and_return(@redis_mock_object)
|
42
43
|
|
43
|
-
|
44
|
+
@redis_mock_object.flushdb
|
44
45
|
|
45
46
|
}
|
46
47
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|