cobweb 1.0.23 → 1.0.24

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5a4dac5d8a0745f5ab077f79f14db2f09ebd110c
-  data.tar.gz: c94965108d453320f28ccb1d54eb8e6026bd9f8d
+  metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
+  data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
 SHA512:
-  metadata.gz: 78c1a393efc45984459ee20c2e6f1662501794b0f56c7824d4e6db8eefcfad011576a81949b535d9374ad3e80005f6ed4a13ea0993b82d5d6bf9b9147186b06f
-  data.tar.gz: 268695b9124db64f838105011dfbabc5ba402155edb19a42450071926c7bf88d2f474060bd9d27dba9c947f6ef21e916e78c0bf2a604be02de7bd030c7f41ba8
+  metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
+  data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8
data/README.textile CHANGED
@@ -1,4 +1,4 @@
-h1. Cobweb v1.0.23
+h1. Cobweb v1.0.24
 
 "@cobweb_gem":https://twitter.com/cobweb_gem
 !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -116,6 +116,7 @@ Creates a new crawler object based on a base_url
 ** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
 ** :proxy_addr - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
 ** :proxy_port - port number of the proxy (default: nil)
+** :treat_https_as_http - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
 
 
 bc. crawler = Cobweb.new(:follow_redirects => false)
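
The new README entry is easiest to read next to a concrete call. The sketch below is illustrative only (not part of the diff) and assumes the same options-hash style as the existing bc. example above.

  # Illustrative sketch: :treat_https_as_http defaults to true, so pass false
  # if http:// and https:// URLs should stay distinct while crawling.
  require 'cobweb'

  crawler = Cobweb.new(
    :follow_redirects    => false,
    :treat_https_as_http => false  # keep the two schemes separate
  )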
data/lib/cobweb.rb CHANGED
@@ -61,6 +61,9 @@ class Cobweb
     default_store_inbound_links_to false
     default_proxy_addr_to nil
     default_proxy_port_to nil
+    default_additional_tags_to nil
+    default_treat_https_as_http_to true
+
 
   end
 
@@ -446,11 +449,14 @@ class Cobweb
   end
 
   # escapes characters with meaning in regular expressions and adds wildcard expression
-  def self.escape_pattern_for_regex(pattern)
+  def self.escape_pattern_for_regex(pattern, options={})
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
-    pattern = pattern.gsub("+", "\\+")
+    pattern = pattern.gsub("+", "\\\\+")
     pattern = pattern.gsub("*", ".*?")
+    if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
+      pattern = pattern.gsub("https", "https?")
+    end
     pattern
   end
 
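
The practical effect of the new options argument to escape_pattern_for_regex is spelled out by the specs added later in this diff; a short sketch of those expected values (inspect-style output in the comments):

  # Values mirror the spec expectations added in this release
  Cobweb.escape_pattern_for_regex("https://asdf.com")
  # => "https?://asdf\\.com"   (https relaxed to https? by default)

  Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
  # => "https://asdf\\.com"    (scheme left untouched when the option is false)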
data/lib/cobweb_crawler.rb CHANGED
@@ -104,6 +104,11 @@ class CobwebCrawler
         cobweb_links = CobwebLinks.new(@options)
 
         internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
+
+        # if the site has the same content for http and https then normalize to http
+        if @options[:treat_https_as_http]
+          internal_links.map!{|link| link.gsub(/^https/, "http")}
+        end
 
         # reject the link if we've crawled it or queued it
         internal_links.reject!{|link| @redis.sismember("crawled", link)}
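
The normalization added above is a plain prefix substitution on each discovered link; an illustrative example (URLs made up):

  # Illustrative only: how the gsub in the hunk above rewrites discovered links
  internal_links = ["https://example.com/about", "http://example.com/contact"]
  internal_links.map! { |link| link.gsub(/^https/, "http") }
  # => ["http://example.com/about", "http://example.com/contact"]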
data/lib/cobweb_links.rb CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
12
12
  @options[:external_urls] = [] unless @options.has_key? :external_urls
13
13
  @options[:debug] = false unless @options.has_key? :debug
14
14
 
15
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
16
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
15
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
16
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
17
17
 
18
18
  end
19
19
 
data/lib/cobweb_version.rb CHANGED
@@ -3,7 +3,7 @@ class CobwebVersion
 
   # Returns a string of the current version
   def self.version
-    "1.0.23"
+    "1.0.24"
   end
 
 end
data/lib/crawl.rb CHANGED
@@ -112,8 +112,12 @@ module CobwebModule
112
112
  # select the link if its internal
113
113
  internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
114
114
 
115
- # reject the link if we've crawled it or queued it
115
+ # if the site has the same content for http and https then normalize to http
116
+ if @options[:treat_https_as_http]
117
+ internal_links.map!{|link| link.gsub(/^https/, "http")}
118
+ end
116
119
 
120
+ # reject the link if we've crawled it or queued it
117
121
  internal_links.reject! { |link| already_handled?(link)}
118
122
 
119
123
  lock("internal-links") do
data/lib/crawl_helper.rb CHANGED
@@ -60,6 +60,11 @@ class CrawlHelper
60
60
  # select the link if its internal
61
61
  internal_links.select! { |link| @cobweb_links.internal?(link) }
62
62
 
63
+ # if the site has the same content for http and https then normalize to http
64
+ if @options[:treat_https_as_http]
65
+ internal_links.map!{|link| link.gsub(/^https/, "http")}
66
+ end
67
+
63
68
  # reject the link if we've crawled it or queued it
64
69
  internal_links.reject! { |link| @redis.sismember("crawled", link) }
65
70
  internal_links.reject! { |link| @redis.sismember("queued", link) }
data/lib/robots.rb CHANGED
@@ -28,10 +28,10 @@ class Robots
   def allowed?(url)
     uri = URI.parse(url)
     @params[:allow].each do |pattern|
-      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
+      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
     end
     @params[:disallow].each do |pattern|
-      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
+      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
     end
     true
   end
@@ -35,6 +35,51 @@ describe Cobweb
       options[:proxy_port].should be_nil
 
     end
+
+    describe "link escape pattern" do
+
+      it "should return the same pattern if no triggers present" do
+        result = Cobweb.escape_pattern_for_regex("asdf")
+        result.should eql "asdf"
+      end
+      it "should escape ." do
+        result = Cobweb.escape_pattern_for_regex("asdf.txt")
+        result.should eql "asdf\\.txt"
+      end
+      it "should escape ?" do
+        result = Cobweb.escape_pattern_for_regex("asdf?")
+        result.should eql "asdf\\?"
+      end
+      it "should escape +" do
+        result = Cobweb.escape_pattern_for_regex("asdf + asdf = asdfasdf")
+        result.should eql "asdf \\+ asdf = asdfasdf"
+      end
+      it "should transform * to .*?" do
+        result = Cobweb.escape_pattern_for_regex("asdf*")
+        result.should eql "asdf.*?"
+      end
+
+      context "with https ignored" do
+        it "should ignore https" do
+          result = Cobweb.escape_pattern_for_regex("https://asdf.com")
+          result.should eql "https?://asdf\\.com"
+        end
+        it "should ignore https" do
+          result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
+          result.should eql "https?://asdf\\.com"
+        end
+      end
+
+      context "without https ignored" do
+        it "should ignore https" do
+          result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
+          result.should eql "https://asdf\\.com"
+        end
+      end
+
+
+      context "with"
+    end
 
     describe "get" do
       it "should return a hash with default values" do
@@ -125,7 +170,6 @@ describe Cobweb
   end
 
   describe "with cache" do
-
     before(:each) do
       @cobweb = Cobweb.new :quiet => true, :cache => 1
       Redis.new.flushdb
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.0.23
+  version: 1.0.24
 platform: ruby
 authors:
 - Stewart McKee
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-20 00:00:00.000000000 Z
+date: 2015-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis