RubyGems - cobweb - Versions diffs - 1.0.23 → 1.0.24 - Mend

cobweb 1.0.23 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5a4dac5d8a0745f5ab077f79f14db2f09ebd110c
-  data.tar.gz: c94965108d453320f28ccb1d54eb8e6026bd9f8d
+  metadata.gz: b19a42aa5998a48743870fb7c0ed4e02ab0109f3
+  data.tar.gz: ad64a73409057595e0b60ab9cdf0d1b8e70339e6
 SHA512:
-  metadata.gz: 78c1a393efc45984459ee20c2e6f1662501794b0f56c7824d4e6db8eefcfad011576a81949b535d9374ad3e80005f6ed4a13ea0993b82d5d6bf9b9147186b06f
-  data.tar.gz: 268695b9124db64f838105011dfbabc5ba402155edb19a42450071926c7bf88d2f474060bd9d27dba9c947f6ef21e916e78c0bf2a604be02de7bd030c7f41ba8
+  metadata.gz: c91107bc0bb4cf6257b4707cbaaee723ad898750150404863715edc0ce9619bf7c0156ffd292758d5dccadf641a0b8d24347569d443d11c8c42d5c9ff446ffa9
+  data.tar.gz: 61fa0d909f9c2763d04fcd189e0b47ed55fe7290726ff7d1b72c7c902e409612e7d9e64a800b41f046c1682c1f4ced18442ab98d2752ba0057fd24988a738ff8

data/README.textile CHANGED Viewed

@@ -1,4 +1,4 @@
-h1. Cobweb v1.0.23
+h1. Cobweb v1.0.24
 "@cobweb_gem":https://twitter.com/cobweb_gem
 !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -116,6 +116,7 @@ Creates a new crawler object based on a base_url
     ** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
     ** :proxy_addr                    - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
     ** :proxy_port                    - port number of the proxy (default: nil)
+    ** :treat_https_as_http           - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
 bc. crawler = Cobweb.new(:follow_redirects => false)

data/lib/cobweb.rb CHANGED Viewed

@@ -61,6 +61,9 @@ class Cobweb
     default_store_inbound_links_to            false
     default_proxy_addr_to                     nil
     default_proxy_port_to                     nil
+    default_additional_tags_to                nil
+    default_treat_https_as_http_to            true
   end
@@ -446,11 +449,14 @@ class Cobweb
   end
   # escapes characters with meaning in regular expressions and adds wildcard expression
-  def self.escape_pattern_for_regex(pattern)
+  def self.escape_pattern_for_regex(pattern, options={})
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
-    pattern = pattern.gsub("+", "\\+")
+    pattern = pattern.gsub("+", "\\\\+")
     pattern = pattern.gsub("*", ".*?")
+    if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
+      pattern = pattern.gsub("https", "https?")
+    end
     pattern
   end

data/lib/cobweb_crawler.rb CHANGED Viewed

@@ -104,6 +104,11 @@ class CobwebCrawler
             cobweb_links = CobwebLinks.new(@options)
             internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
+            # if the site has the same content for http and https then normalize to http
+            if @options[:treat_https_as_http]
+              internal_links.map!{|link| link.gsub(/^https/, "http")}
+            end
             # reject the link if we've crawled it or queued it
             internal_links.reject!{|link| @redis.sismember("crawled", link)}

data/lib/cobweb_links.rb CHANGED Viewed

@@ -12,8 +12,8 @@ class CobwebLinks
     @options[:external_urls] = [] unless @options.has_key? :external_urls
     @options[:debug] = false unless @options.has_key? :debug
-    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
-    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
+    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
+    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
   end

data/lib/cobweb_version.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class CobwebVersion
   # Returns a string of the current version
   def self.version
-    "1.0.23"
+    "1.0.24"
   end
 end

data/lib/crawl.rb CHANGED Viewed

@@ -112,8 +112,12 @@ module CobwebModule
         # select the link if its internal
         internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
-        # reject the link if we've crawled it or queued it
+        # if the site has the same content for http and https then normalize to http
+        if @options[:treat_https_as_http]
+          internal_links.map!{|link| link.gsub(/^https/, "http")}
+        end
+        # reject the link if we've crawled it or queued it
         internal_links.reject! { |link| already_handled?(link)}
         lock("internal-links") do

data/lib/crawl_helper.rb CHANGED Viewed

@@ -60,6 +60,11 @@ class CrawlHelper
               # select the link if its internal
               internal_links.select! { |link| @cobweb_links.internal?(link) }
+              # if the site has the same content for http and https then normalize to http
+              if @options[:treat_https_as_http]
+                internal_links.map!{|link| link.gsub(/^https/, "http")}
+              end
               # reject the link if we've crawled it or queued it
               internal_links.reject! { |link| @redis.sismember("crawled", link) }
               internal_links.reject! { |link| @redis.sismember("queued", link) }

data/lib/robots.rb CHANGED Viewed

@@ -28,10 +28,10 @@ class Robots
   def allowed?(url)
     uri = URI.parse(url)
     @params[:allow].each do |pattern|
-      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
+      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
     end
     @params[:disallow].each do |pattern|
-      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
+      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
     end
     true
   end

data/spec/cobweb/cobweb_spec.rb CHANGED Viewed

@@ -35,6 +35,51 @@ describe Cobweb do
     options[:proxy_port].should be_nil
   end
+  describe "link escape pattern" do
+    it "should return the same pattern if no triggers present" do
+      result = Cobweb.escape_pattern_for_regex("asdf")
+      result.should eql "asdf"
+    end
+    it "should escape ." do
+      result = Cobweb.escape_pattern_for_regex("asdf.txt")
+      result.should eql "asdf\\.txt"
+    end
+    it "should escape ?" do
+      result = Cobweb.escape_pattern_for_regex("asdf?")
+      result.should eql "asdf\\?"
+    end
+    it "should escape +" do
+      result = Cobweb.escape_pattern_for_regex("asdf + asdf = asdfasdf")
+      result.should eql "asdf \\+ asdf = asdfasdf"
+    end
+    it "should transform * to .*?" do
+      result = Cobweb.escape_pattern_for_regex("asdf*")
+      result.should eql "asdf.*?"
+    end
+    context "with https ignored" do
+      it "should ignore https" do
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com")
+        result.should eql "https?://asdf\\.com"
+      end
+      it "should ignore https" do
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
+        result.should eql "https?://asdf\\.com"
+      end
+    end
+    context "without https ignored" do
+      it "should ignore https" do
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
+        result.should eql "https://asdf\\.com"
+      end
+    end
+    context "with"
+  end
   describe "get" do
     it "should return a hash with default values" do
@@ -125,7 +170,6 @@ describe Cobweb do
     end
     describe "with cache" do
       before(:each) do
         @cobweb = Cobweb.new :quiet => true, :cache => 1
         Redis.new.flushdb

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.0.23
+  version: 1.0.24
 platform: ruby
 authors:
 - Stewart McKee
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-20 00:00:00.000000000 Z
+date: 2015-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis