cobweb 1.0.23 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5a4dac5d8a0745f5ab077f79f14db2f09ebd110c
4
- data.tar.gz: c94965108d453320f28ccb1d54eb8e6026bd9f8d
3
+ metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
4
+ data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
5
5
  SHA512:
6
- metadata.gz: 78c1a393efc45984459ee20c2e6f1662501794b0f56c7824d4e6db8eefcfad011576a81949b535d9374ad3e80005f6ed4a13ea0993b82d5d6bf9b9147186b06f
7
- data.tar.gz: 268695b9124db64f838105011dfbabc5ba402155edb19a42450071926c7bf88d2f474060bd9d27dba9c947f6ef21e916e78c0bf2a604be02de7bd030c7f41ba8
6
+ metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
7
+ data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8
data/README.textile CHANGED
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.23
1
+ h1. Cobweb v1.0.24
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -116,6 +116,7 @@ Creates a new crawler object based on a base_url
116
116
  ** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
117
117
  ** :proxy_addr - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
118
118
  ** :proxy_port - port number of the proxy (default: nil)
119
+ ** :treat_https_as_http - when true, the https and http versions of a url are treated as the same url, and internal https links are normalized to http before crawling (default: true)
119
120
 
120
121
 
121
122
  bc. crawler = Cobweb.new(:follow_redirects => false)
data/lib/cobweb.rb CHANGED
@@ -61,6 +61,9 @@ class Cobweb
61
61
  default_store_inbound_links_to false
62
62
  default_proxy_addr_to nil
63
63
  default_proxy_port_to nil
64
+ default_additional_tags_to nil
65
+ default_treat_https_as_http_to true
66
+
64
67
 
65
68
  end
66
69
 
@@ -446,11 +449,14 @@ class Cobweb
446
449
  end
447
450
 
448
451
  # escapes the regex metacharacters . ? and +, turns * into a non-greedy wildcard, and (unless :treat_https_as_http is false) lets "https" also match "http"
449
- def self.escape_pattern_for_regex(pattern)
452
+ def self.escape_pattern_for_regex(pattern, options={})
450
453
  pattern = pattern.gsub(".", "\\.")
451
454
  pattern = pattern.gsub("?", "\\?")
452
- pattern = pattern.gsub("+", "\\+")
455
+ pattern = pattern.gsub("+", "\\\\+")
453
456
  pattern = pattern.gsub("*", ".*?")
457
+ if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
458
+ pattern = pattern.gsub("https", "https?")
459
+ end
454
460
  pattern
455
461
  end
456
462
 
@@ -104,6 +104,11 @@ class CobwebCrawler
104
104
  cobweb_links = CobwebLinks.new(@options)
105
105
 
106
106
  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
107
+
108
+ # if the site has the same content for http and https then normalize to http
109
+ if @options[:treat_https_as_http]
110
+ internal_links.map!{|link| link.gsub(/^https/, "http")}
111
+ end
107
112
 
108
113
  # reject the link if we've crawled it or queued it
109
114
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
data/lib/cobweb_links.rb CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
12
12
  @options[:external_urls] = [] unless @options.has_key? :external_urls
13
13
  @options[:debug] = false unless @options.has_key? :debug
14
14
 
15
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
16
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
15
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
16
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
17
17
 
18
18
  end
19
19
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.23"
6
+ "1.0.24"
7
7
  end
8
8
 
9
9
  end
data/lib/crawl.rb CHANGED
@@ -112,8 +112,12 @@ module CobwebModule
112
112
  # select the link if its internal
113
113
  internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
114
114
 
115
- # reject the link if we've crawled it or queued it
115
+ # if the site has the same content for http and https then normalize to http
116
+ if @options[:treat_https_as_http]
117
+ internal_links.map!{|link| link.gsub(/^https/, "http")}
118
+ end
116
119
 
120
+ # reject the link if we've crawled it or queued it
117
121
  internal_links.reject! { |link| already_handled?(link)}
118
122
 
119
123
  lock("internal-links") do
data/lib/crawl_helper.rb CHANGED
@@ -60,6 +60,11 @@ class CrawlHelper
60
60
  # select the link if its internal
61
61
  internal_links.select! { |link| @cobweb_links.internal?(link) }
62
62
 
63
+ # if the site has the same content for http and https then normalize to http
64
+ if @options[:treat_https_as_http]
65
+ internal_links.map!{|link| link.gsub(/^https/, "http")}
66
+ end
67
+
63
68
  # reject the link if we've crawled it or queued it
64
69
  internal_links.reject! { |link| @redis.sismember("crawled", link) }
65
70
  internal_links.reject! { |link| @redis.sismember("queued", link) }
data/lib/robots.rb CHANGED
@@ -28,10 +28,10 @@ class Robots
28
28
  def allowed?(url)
29
29
  uri = URI.parse(url)
30
30
  @params[:allow].each do |pattern|
31
- return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
31
+ return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
32
32
  end
33
33
  @params[:disallow].each do |pattern|
34
- return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
34
+ return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
35
35
  end
36
36
  true
37
37
  end
@@ -35,6 +35,51 @@ describe Cobweb do
35
35
  options[:proxy_port].should be_nil
36
36
 
37
37
  end
38
+
39
+ describe "link escape pattern" do
40
+
41
+ it "should return the same pattern if no triggers present" do
42
+ result = Cobweb.escape_pattern_for_regex("asdf")
43
+ result.should eql "asdf"
44
+ end
45
+ it "should escape ." do
46
+ result = Cobweb.escape_pattern_for_regex("asdf.txt")
47
+ result.should eql "asdf\\.txt"
48
+ end
49
+ it "should escape ?" do
50
+ result = Cobweb.escape_pattern_for_regex("asdf?")
51
+ result.should eql "asdf\\?"
52
+ end
53
+ it "should escape +" do
54
+ result = Cobweb.escape_pattern_for_regex("asdf + asdf = asdfasdf")
55
+ result.should eql "asdf \\+ asdf = asdfasdf"
56
+ end
57
+ it "should transform * to .*?" do
58
+ result = Cobweb.escape_pattern_for_regex("asdf*")
59
+ result.should eql "asdf.*?"
60
+ end
61
+
62
+ context "with https ignored" do
63
+ it "should ignore https" do
64
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com")
65
+ result.should eql "https?://asdf\\.com"
66
+ end
67
+ it "should ignore https" do
68
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
69
+ result.should eql "https?://asdf\\.com"
70
+ end
71
+ end
72
+
73
+ context "without https ignored" do
74
+ it "should ignore https" do
75
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
76
+ result.should eql "https://asdf\\.com"
77
+ end
78
+ end
79
+
80
+
81
+ context "with"
82
+ end
38
83
 
39
84
  describe "get" do
40
85
  it "should return a hash with default values" do
@@ -125,7 +170,6 @@ describe Cobweb do
125
170
  end
126
171
 
127
172
  describe "with cache" do
128
-
129
173
  before(:each) do
130
174
  @cobweb = Cobweb.new :quiet => true, :cache => 1
131
175
  Redis.new.flushdb
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.23
4
+ version: 1.0.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-20 00:00:00.000000000 Z
11
+ date: 2015-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis