cobweb 1.0.24 → 1.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.textile +1 -1
- data/lib/cobweb.rb +9 -6
- data/lib/cobweb_crawler.rb +5 -1
- data/lib/cobweb_links.rb +3 -3
- data/lib/cobweb_version.rb +1 -1
- data/lib/sidekiq/cobweb_helper.rb +2 -2
- data/spec/cobweb/cobweb_crawler_spec.rb +33 -0
- data/spec/cobweb/cobweb_spec.rb +4 -4
- data/spec/samples/sample_site/index.html +2 -0
- data/spec/spec_helper.rb +4 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
|
4
|
+
data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
|
7
|
+
data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
|
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -146,7 +146,7 @@ class Cobweb
|
|
146
146
|
content = {:base_url => url}
|
147
147
|
|
148
148
|
# check if it has already been cached
|
149
|
-
if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
149
|
+
if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
150
150
|
if @options[:cache_type] == :crawl_based
|
151
151
|
puts "Cache hit in crawl for #{url}" unless @options[:quiet]
|
152
152
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
@@ -250,7 +250,10 @@ class Cobweb
|
|
250
250
|
end
|
251
251
|
end
|
252
252
|
rescue RedirectError => e
|
253
|
-
|
253
|
+
if @options[:raise_exceptions]
|
254
|
+
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
+
raise e
|
256
|
+
end
|
254
257
|
puts "ERROR RedirectError: #{e.message}"
|
255
258
|
|
256
259
|
## generate a blank content
|
@@ -327,7 +330,7 @@ class Cobweb
|
|
327
330
|
content = {:base_url => url}
|
328
331
|
|
329
332
|
# check if it has already been cached
|
330
|
-
if redis.get("head-#{unique_id}")
|
333
|
+
if @options[:cache] && redis.get("head-#{unique_id}")
|
331
334
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
332
335
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
|
333
336
|
else
|
@@ -453,9 +456,9 @@ class Cobweb
|
|
453
456
|
pattern = pattern.gsub(".", "\\.")
|
454
457
|
pattern = pattern.gsub("?", "\\?")
|
455
458
|
pattern = pattern.gsub("+", "\\\\+")
|
456
|
-
pattern = pattern.gsub("*", ".*?")
|
457
|
-
if !options.has_key?(:treat_https_as_http)
|
458
|
-
pattern = pattern.gsub("
|
459
|
+
pattern = pattern.gsub("*", ".*?")
|
460
|
+
if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
|
461
|
+
pattern = pattern.gsub("http:", "https?:")
|
459
462
|
end
|
460
463
|
pattern
|
461
464
|
end
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -27,7 +27,8 @@ class CobwebCrawler
|
|
27
27
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
28
28
|
|
29
29
|
@options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
|
30
|
-
|
30
|
+
|
31
|
+
@options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
|
31
32
|
@debug = @options[:debug]
|
32
33
|
|
33
34
|
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
@@ -100,16 +101,19 @@ class CobwebCrawler
|
|
100
101
|
|
101
102
|
document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
|
102
103
|
|
104
|
+
|
103
105
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
104
106
|
cobweb_links = CobwebLinks.new(@options)
|
105
107
|
|
106
108
|
internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
|
107
109
|
|
108
110
|
# if the site has the same content for http and https then normalize to http
|
111
|
+
|
109
112
|
if @options[:treat_https_as_http]
|
110
113
|
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
111
114
|
end
|
112
115
|
|
116
|
+
|
113
117
|
# reject the link if we've crawled it or queued it
|
114
118
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
115
119
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
data/lib/cobweb_links.rb
CHANGED
@@ -12,9 +12,9 @@ class CobwebLinks
|
|
12
12
|
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
13
13
|
@options[:debug] = false unless @options.has_key? :debug
|
14
14
|
|
15
|
-
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
16
|
-
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
17
|
-
|
15
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
|
16
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
|
17
|
+
|
18
18
|
end
|
19
19
|
|
20
20
|
def allowed?(link)
|
data/lib/cobweb_version.rb
CHANGED
data/lib/sidekiq/cobweb_helper.rb
CHANGED
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
|
|
4
4
|
require 'sidekiq'
|
5
5
|
else
|
6
6
|
SIDEKIQ_INSTALLED = false
|
7
|
-
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
7
|
+
puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
|
8
8
|
end
|
9
9
|
if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
|
10
10
|
RESQUE_INSTALLED = true
|
11
11
|
require 'resque'
|
12
12
|
else
|
13
13
|
RESQUE_INSTALLED = false
|
14
|
-
puts "resque gem not installed, skipping crawl_job specs"
|
14
|
+
puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
|
15
15
|
end
|
16
16
|
|
17
17
|
module Sidekiq
|
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -53,6 +53,39 @@ describe CobwebCrawler do
|
|
53
53
|
|
54
54
|
end
|
55
55
|
|
56
|
+
context "internal_links" do
|
57
|
+
it "should match internal links without being explicitly set" do
|
58
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
59
|
+
crawler.crawl(@base_url)
|
60
|
+
queued_links = @redis_mock_object.smembers("queued")
|
61
|
+
queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
|
62
|
+
queued_links.should include("http://localhost:3532/secure")
|
63
|
+
end
|
64
|
+
context "with https" do
|
65
|
+
it "should match https by default" do
|
66
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
67
|
+
crawler.crawl(@base_url)
|
68
|
+
queued_links = @redis_mock_object.smembers("queued")
|
69
|
+
queued_links.should_not include("https://localhost:3532/secure")
|
70
|
+
queued_links.should include("http://localhost:3532/secure")
|
71
|
+
end
|
72
|
+
it "should not define https as different if treat_https_as_http is true" do
|
73
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
|
74
|
+
crawler.crawl(@base_url)
|
75
|
+
queued_links = @redis_mock_object.smembers("queued")
|
76
|
+
queued_links.should_not include("https://localhost:3532/secure")
|
77
|
+
queued_links.should include("http://localhost:3532/secure")
|
78
|
+
end
|
79
|
+
it "should define https as different if treat_https_as_http is false" do
|
80
|
+
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
|
81
|
+
crawler.crawl(@base_url)
|
82
|
+
queued_links = @redis_mock_object.smembers("queued")
|
83
|
+
queued_links.should_not include("https://localhost:3532/secure")
|
84
|
+
queued_links.should_not include("http://localhost:3532/secure")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
56
89
|
context "storing inbound links" do
|
57
90
|
|
58
91
|
before(:each) do
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -61,19 +61,19 @@ describe Cobweb do
|
|
61
61
|
|
62
62
|
context "with https ignored" do
|
63
63
|
it "should ignore https" do
|
64
|
-
result = Cobweb.escape_pattern_for_regex("
|
64
|
+
result = Cobweb.escape_pattern_for_regex("http://asdf.com")
|
65
65
|
result.should eql "https?://asdf\\.com"
|
66
66
|
end
|
67
67
|
it "should ignore https" do
|
68
|
-
result = Cobweb.escape_pattern_for_regex("
|
68
|
+
result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
|
69
69
|
result.should eql "https?://asdf\\.com"
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
73
|
context "without https ignored" do
|
74
74
|
it "should ignore https" do
|
75
|
-
result = Cobweb.escape_pattern_for_regex("
|
76
|
-
result.should eql "
|
75
|
+
result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
|
76
|
+
result.should eql "http://asdf\\.com"
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
data/spec/samples/sample_site/index.html
CHANGED
@@ -711,6 +711,8 @@
|
|
711
711
|
<a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
|
712
712
|
<a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
|
713
713
|
<a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
|
714
|
+
|
715
|
+
<a href="https://localhost:3532/secure">HTTPS Link</a>
|
714
716
|
|
715
717
|
<a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
|
716
718
|
<a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
|
data/spec/spec_helper.rb
CHANGED
@@ -37,10 +37,11 @@ RSpec.configure do |config|
|
|
37
37
|
|
38
38
|
config.before(:each) {
|
39
39
|
|
40
|
-
|
41
|
-
|
40
|
+
@redis_mock_object = MockRedis.new
|
41
|
+
Redis.stub(:new).and_return(@redis_mock_object)
|
42
|
+
Redis::Namespace.stub(:new).and_return(@redis_mock_object)
|
42
43
|
|
43
|
-
|
44
|
+
@redis_mock_object.flushdb
|
44
45
|
|
45
46
|
}
|
46
47
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.24
|
4
|
+
version: 1.0.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|