cobweb 1.0.23 → 1.0.24
- checksums.yaml +4 -4
- data/README.textile +2 -1
- data/lib/cobweb.rb +8 -2
- data/lib/cobweb_crawler.rb +5 -0
- data/lib/cobweb_links.rb +2 -2
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +5 -1
- data/lib/crawl_helper.rb +5 -0
- data/lib/robots.rb +2 -2
- data/spec/cobweb/cobweb_spec.rb +45 -1
- metadata +2 -2
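
To pick up this release, the usual Gemfile pin would be the line below (gem name and version are taken from this page; the pin style is just standard Bundler usage, not something this diff prescribes):

# Gemfile (illustrative)
gem 'cobweb', '1.0.24'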
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
+  data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
+  data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8
data/README.textile
CHANGED
@@ -1,4 +1,4 @@
-h1. Cobweb v1.0.23
+h1. Cobweb v1.0.24
 
 "@cobweb_gem":https://twitter.com/cobweb_gem
 !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -116,6 +116,7 @@ Creates a new crawler object based on a base_url
 ** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
 ** :proxy_addr - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
 ** :proxy_port - port number of the proxy (default: nil)
+** :treat_https_as_http - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
 
 
 bc. crawler = Cobweb.new(:follow_redirects => false)
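
A minimal usage sketch of the new option, assuming it is passed like any other crawler option (the :quiet option appears elsewhere in this diff; nothing here beyond :treat_https_as_http is specific to 1.0.24):

require 'cobweb'

# Default behaviour: https URLs are matched and queued as if they were http.
crawler = Cobweb.new(:quiet => true)

# Opt out when http and https serve different content.
strict_crawler = Cobweb.new(:quiet => true, :treat_https_as_http => false)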
data/lib/cobweb.rb
CHANGED
@@ -61,6 +61,9 @@ class Cobweb
     default_store_inbound_links_to false
     default_proxy_addr_to nil
     default_proxy_port_to nil
+    default_additional_tags_to nil
+    default_treat_https_as_http_to true
+
 
   end
 
@@ -446,11 +449,14 @@ class Cobweb
   end
 
   # escapes characters with meaning in regular expressions and adds wildcard expression
-  def self.escape_pattern_for_regex(pattern)
+  def self.escape_pattern_for_regex(pattern, options={})
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
-    pattern = pattern.gsub("+", "
+    pattern = pattern.gsub("+", "\\\\+")
     pattern = pattern.gsub("*", ".*?")
+    if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
+      pattern = pattern.gsub("https", "https?")
+    end
     pattern
   end
 
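
For a concrete picture of what the reworked helper returns, the calls below mirror the expectations added to the spec later in this diff (inputs and expected strings are the spec's own):

require 'cobweb'

Cobweb.escape_pattern_for_regex("asdf.txt")
# => "asdf\\.txt"
Cobweb.escape_pattern_for_regex("https://asdf.com")
# => "https?://asdf\\.com"   (https collapsed to https? by default)
Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
# => "https://asdf\\.com"    (scheme left as-is)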
data/lib/cobweb_crawler.rb
CHANGED
@@ -104,6 +104,11 @@ class CobwebCrawler
       cobweb_links = CobwebLinks.new(@options)
 
       internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
+
+      # if the site has the same content for http and https then normalize to http
+      if @options[:treat_https_as_http]
+        internal_links.map!{|link| link.gsub(/^https/, "http")}
+      end
 
       # reject the link if we've crawled it or queued it
       internal_links.reject!{|link| @redis.sismember("crawled", link)}
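
The normalisation itself is a plain string substitution applied to each discovered link; a standalone illustration of what the new branch does (the URL is only an example):

# Only the leading scheme is rewritten; path and query are untouched.
"https://example.com/page?x=1".gsub(/^https/, "http")
# => "http://example.com/page?x=1"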
data/lib/cobweb_links.rb
CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
     @options[:external_urls] = [] unless @options.has_key? :external_urls
     @options[:debug] = false unless @options.has_key? :debug
 
-    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
-    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
+    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
+    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
 
   end
 
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -112,8 +112,12 @@ module CobwebModule
       # select the link if its internal
       internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
 
-      #
+      # if the site has the same content for http and https then normalize to http
+      if @options[:treat_https_as_http]
+        internal_links.map!{|link| link.gsub(/^https/, "http")}
+      end
 
+      # reject the link if we've crawled it or queued it
       internal_links.reject! { |link| already_handled?(link)}
 
       lock("internal-links") do
data/lib/crawl_helper.rb
CHANGED
@@ -60,6 +60,11 @@ class CrawlHelper
       # select the link if its internal
      internal_links.select! { |link| @cobweb_links.internal?(link) }
 
+      # if the site has the same content for http and https then normalize to http
+      if @options[:treat_https_as_http]
+        internal_links.map!{|link| link.gsub(/^https/, "http")}
+      end
+
      # reject the link if we've crawled it or queued it
      internal_links.reject! { |link| @redis.sismember("crawled", link) }
      internal_links.reject! { |link| @redis.sismember("queued", link) }
data/lib/robots.rb
CHANGED
@@ -28,10 +28,10 @@ class Robots
   def allowed?(url)
     uri = URI.parse(url)
     @params[:allow].each do |pattern|
-      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
+      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
     end
     @params[:disallow].each do |pattern|
-      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
+      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
     end
     true
   end
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -35,6 +35,51 @@ describe Cobweb do
     options[:proxy_port].should be_nil
 
   end
+
+  describe "link escape pattern" do
+
+    it "should return the same pattern if no triggers present" do
+      result = Cobweb.escape_pattern_for_regex("asdf")
+      result.should eql "asdf"
+    end
+    it "should escape ." do
+      result = Cobweb.escape_pattern_for_regex("asdf.txt")
+      result.should eql "asdf\\.txt"
+    end
+    it "should escape ?" do
+      result = Cobweb.escape_pattern_for_regex("asdf?")
+      result.should eql "asdf\\?"
+    end
+    it "should escape +" do
+      result = Cobweb.escape_pattern_for_regex("asdf + asdf = asdfasdf")
+      result.should eql "asdf \\+ asdf = asdfasdf"
+    end
+    it "should transform * to .*?" do
+      result = Cobweb.escape_pattern_for_regex("asdf*")
+      result.should eql "asdf.*?"
+    end
+
+    context "with https ignored" do
+      it "should ignore https" do
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com")
+        result.should eql "https?://asdf\\.com"
+      end
+      it "should ignore https" do
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
+        result.should eql "https?://asdf\\.com"
+      end
+    end
+
+    context "without https ignored" do
+      it "should ignore https" do
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
+        result.should eql "https://asdf\\.com"
+      end
+    end
+
+
+    context "with"
+  end
 
   describe "get" do
     it "should return a hash with default values" do
@@ -125,7 +170,6 @@ describe Cobweb do
     end
 
   describe "with cache" do
-
     before(:each) do
       @cobweb = Cobweb.new :quiet => true, :cache => 1
       Redis.new.flushdb
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.0.23
+  version: 1.0.24
 platform: ruby
 authors:
 - Stewart McKee
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-
+date: 2015-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis