cobweb 1.0.23 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.textile +2 -1
- data/lib/cobweb.rb +8 -2
- data/lib/cobweb_crawler.rb +5 -0
- data/lib/cobweb_links.rb +2 -2
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +5 -1
- data/lib/crawl_helper.rb +5 -0
- data/lib/robots.rb +2 -2
- data/spec/cobweb/cobweb_spec.rb +45 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
|
4
|
+
data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
|
7
|
+
data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8
|
data/README.textile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
h1. Cobweb v1.0.
|
1
|
+
h1. Cobweb v1.0.24
|
2
2
|
|
3
3
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
4
4
|
!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
|
@@ -116,6 +116,7 @@ Creates a new crawler object based on a base_url
|
|
116
116
|
** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
|
117
117
|
** :proxy_addr - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
|
118
118
|
** :proxy_port - port number of the proxy (default: nil)
|
119
|
+
** :treat_https_as_http - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
|
119
120
|
|
120
121
|
|
121
122
|
bc. crawler = Cobweb.new(:follow_redirects => false)
|
data/lib/cobweb.rb
CHANGED
@@ -61,6 +61,9 @@ class Cobweb
|
|
61
61
|
default_store_inbound_links_to false
|
62
62
|
default_proxy_addr_to nil
|
63
63
|
default_proxy_port_to nil
|
64
|
+
default_additional_tags_to nil
|
65
|
+
default_treat_https_as_http_to true
|
66
|
+
|
64
67
|
|
65
68
|
end
|
66
69
|
|
@@ -446,11 +449,14 @@ class Cobweb
|
|
446
449
|
end
|
447
450
|
|
448
451
|
# escapes characters with meaning in regular expressions and adds wildcard expression
|
449
|
-
def self.escape_pattern_for_regex(pattern)
|
452
|
+
def self.escape_pattern_for_regex(pattern, options={})
|
450
453
|
pattern = pattern.gsub(".", "\\.")
|
451
454
|
pattern = pattern.gsub("?", "\\?")
|
452
|
-
pattern = pattern.gsub("+", "
|
455
|
+
pattern = pattern.gsub("+", "\\\\+")
|
453
456
|
pattern = pattern.gsub("*", ".*?")
|
457
|
+
if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
|
458
|
+
pattern = pattern.gsub("https", "https?")
|
459
|
+
end
|
454
460
|
pattern
|
455
461
|
end
|
456
462
|
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -104,6 +104,11 @@ class CobwebCrawler
|
|
104
104
|
cobweb_links = CobwebLinks.new(@options)
|
105
105
|
|
106
106
|
internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
|
107
|
+
|
108
|
+
# if the site has the same content for http and https then normalize to http
|
109
|
+
if @options[:treat_https_as_http]
|
110
|
+
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
111
|
+
end
|
107
112
|
|
108
113
|
# reject the link if we've crawled it or queued it
|
109
114
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
data/lib/cobweb_links.rb
CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
|
|
12
12
|
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
13
13
|
@options[:debug] = false unless @options.has_key? :debug
|
14
14
|
|
15
|
-
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
|
16
|
-
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
|
15
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
16
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
17
17
|
|
18
18
|
end
|
19
19
|
|
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -112,8 +112,12 @@ module CobwebModule
|
|
112
112
|
# select the link if its internal
|
113
113
|
internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
|
114
114
|
|
115
|
-
#
|
115
|
+
# if the site has the same content for http and https then normalize to http
|
116
|
+
if @options[:treat_https_as_http]
|
117
|
+
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
118
|
+
end
|
116
119
|
|
120
|
+
# reject the link if we've crawled it or queued it
|
117
121
|
internal_links.reject! { |link| already_handled?(link)}
|
118
122
|
|
119
123
|
lock("internal-links") do
|
data/lib/crawl_helper.rb
CHANGED
@@ -60,6 +60,11 @@ class CrawlHelper
|
|
60
60
|
# select the link if its internal
|
61
61
|
internal_links.select! { |link| @cobweb_links.internal?(link) }
|
62
62
|
|
63
|
+
# if the site has the same content for http and https then normalize to http
|
64
|
+
if @options[:treat_https_as_http]
|
65
|
+
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
66
|
+
end
|
67
|
+
|
63
68
|
# reject the link if we've crawled it or queued it
|
64
69
|
internal_links.reject! { |link| @redis.sismember("crawled", link) }
|
65
70
|
internal_links.reject! { |link| @redis.sismember("queued", link) }
|
data/lib/robots.rb
CHANGED
@@ -28,10 +28,10 @@ class Robots
|
|
28
28
|
def allowed?(url)
|
29
29
|
uri = URI.parse(url)
|
30
30
|
@params[:allow].each do |pattern|
|
31
|
-
return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
|
31
|
+
return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
|
32
32
|
end
|
33
33
|
@params[:disallow].each do |pattern|
|
34
|
-
return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
|
34
|
+
return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
|
35
35
|
end
|
36
36
|
true
|
37
37
|
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -35,6 +35,51 @@ describe Cobweb do
|
|
35
35
|
options[:proxy_port].should be_nil
|
36
36
|
|
37
37
|
end
|
38
|
+
|
39
|
+
describe "link escape pattern" do
|
40
|
+
|
41
|
+
it "should return the same pattern if no triggers present" do
|
42
|
+
result = Cobweb.escape_pattern_for_regex("asdf")
|
43
|
+
result.should eql "asdf"
|
44
|
+
end
|
45
|
+
it "should escape ." do
|
46
|
+
result = Cobweb.escape_pattern_for_regex("asdf.txt")
|
47
|
+
result.should eql "asdf\\.txt"
|
48
|
+
end
|
49
|
+
it "should escape ?" do
|
50
|
+
result = Cobweb.escape_pattern_for_regex("asdf?")
|
51
|
+
result.should eql "asdf\\?"
|
52
|
+
end
|
53
|
+
it "should escape +" do
|
54
|
+
result = Cobweb.escape_pattern_for_regex("asdf + asdf = asdfasdf")
|
55
|
+
result.should eql "asdf \\+ asdf = asdfasdf"
|
56
|
+
end
|
57
|
+
it "should transform * to .*?" do
|
58
|
+
result = Cobweb.escape_pattern_for_regex("asdf*")
|
59
|
+
result.should eql "asdf.*?"
|
60
|
+
end
|
61
|
+
|
62
|
+
context "with https ignored" do
|
63
|
+
it "should ignore https" do
|
64
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com")
|
65
|
+
result.should eql "https?://asdf\\.com"
|
66
|
+
end
|
67
|
+
it "should ignore https" do
|
68
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
|
69
|
+
result.should eql "https?://asdf\\.com"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
context "without https ignored" do
|
74
|
+
it "should ignore https" do
|
75
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
|
76
|
+
result.should eql "https://asdf\\.com"
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
context "with"
|
82
|
+
end
|
38
83
|
|
39
84
|
describe "get" do
|
40
85
|
it "should return a hash with default values" do
|
@@ -125,7 +170,6 @@ describe Cobweb do
|
|
125
170
|
end
|
126
171
|
|
127
172
|
describe "with cache" do
|
128
|
-
|
129
173
|
before(:each) do
|
130
174
|
@cobweb = Cobweb.new :quiet => true, :cache => 1
|
131
175
|
Redis.new.flushdb
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.23
|
4
|
+
version: 1.0.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|