RubyGems - cobweb - Versions diffs - 1.0.25 → 1.0.26 - Mend

cobweb 1.0.25 → 1.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/README.textile +1 -1
data/bin/cobweb +11 -9
data/lib/cobweb.rb +4 -7
data/lib/cobweb_crawler.rb +1 -5
data/lib/cobweb_links.rb +3 -3
data/lib/cobweb_version.rb +1 -1
data/lib/export_command.rb +11 -3
data/lib/report_command.rb +10 -1
data/lib/sidekiq/cobweb_helper.rb +2 -2
data/spec/cobweb/cobweb_crawler_spec.rb +0 -33
data/spec/cobweb/cobweb_spec.rb +4 -4
data/spec/samples/sample_site/index.html +0 -2
data/spec/spec_helper.rb +3 -4
metadata +30 -44

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
-  data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
+  metadata.gz: b39481d4cdb68d7f602e63a3919e51c126648a2d
+  data.tar.gz: bc29d59ae32beadf047c6e6c73503770d4ca842e
 SHA512:
-  metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
-  data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
+  metadata.gz: fe824099c8329662036c3df2ec67688ba46f8a3337d63b4942cf93e5dd6de452f3215ddc5a80968dad552e29ff4ba445456112c39cec7e0c1c3166d4aadb91eb
+  data.tar.gz: 43fbe93aa454e77cfdb8f95d4c580dc7a8f6c526025839db75f71ccce02c4f38a2a03e91a2c8417c39311367435c2b6a03168eb48b28eb893014b8e67663ef01

data/README.textile CHANGED Viewed

@@ -1,4 +1,4 @@
-h1. Cobweb v1.0.25
+h1. Cobweb v1.0.26
 "@cobweb_gem":https://twitter.com/cobweb_gem
 !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb

data/bin/cobweb CHANGED Viewed

@@ -17,12 +17,13 @@ opts = Slop.parse(:help => true) do
     on 'output=', 'Path to output data to'
     on 'script=', "Script to generate report"
     on 'url=', 'URL to start crawl from'
-    on 'internal_urls=', 'Url patterns to include', :as => Array
-    on 'external_urls=', 'Url patterns to exclude', :as => Array
-    on 'seed_urls=', "Seed urls", :as => Array
-    on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
+    on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
+    on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
+    on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
+    on 'seed_url_file=', "File with URL per line to add to seed list"
+    on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
     on 'thread_count=', "Set the number of threads used", :as => Integer
     on 'timeout=', "Sets the timeout for http requests", :as => Integer
     on 'v', 'verbose', 'Display crawl information'
@@ -38,10 +39,11 @@ opts = Slop.parse(:help => true) do
     banner 'Usage: cobweb export [options]'
     on 'url=', 'URL to start crawl from'
-    on 'internal_urls=', 'Url patterns to include', :as => Array
-    on 'external_urls=', 'Url patterns to exclude', :as => Array
-    on 'seed_urls=', "Seed urls", :as => Array
-    on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
+    on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
+    on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
+    on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
+    on 'seed_url_file=', "File with URL per line to add to seed list"
+    on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
     on 'thread_count=', "Set the number of threads used", :as => Integer
     on 'timeout=', "Sets the timeout for http requests", :as => Integer
     on 'v', 'verbose', 'Display crawl information'

data/lib/cobweb.rb CHANGED Viewed

@@ -250,10 +250,7 @@ class Cobweb
           end
         end
       rescue RedirectError => e
-        if @options[:raise_exceptions]
-          puts "Re-Raising error #{e.message} on #{uri.to_s}"
-          raise e
-        end
+        raise e if @options[:raise_exceptions]
         puts "ERROR RedirectError: #{e.message}"
         ## generate a blank content
@@ -456,9 +453,9 @@ class Cobweb
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
     pattern = pattern.gsub("+", "\\\\+")
-    pattern = pattern.gsub("*", ".*?")
-    if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
-      pattern = pattern.gsub("http:", "https?:")
+    pattern = pattern.gsub("*", ".*?")
+    if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
+      pattern = pattern.gsub("https", "https?")
     end
     pattern
   end

data/lib/cobweb_crawler.rb CHANGED Viewed

@@ -27,8 +27,7 @@ class CobwebCrawler
     @options[:seed_urls].map{|link| @redis.sadd "queued", link }
     @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
-    @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
     @debug = @options[:debug]
     @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -101,19 +100,16 @@ class CobwebCrawler
             document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
             # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
             cobweb_links = CobwebLinks.new(@options)
             internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
             # if the site has the same content for http and https then normalize to http
             if @options[:treat_https_as_http]
               internal_links.map!{|link| link.gsub(/^https/, "http")}
             end
             # reject the link if we've crawled it or queued it
             internal_links.reject!{|link| @redis.sismember("crawled", link)}
             internal_links.reject!{|link| @redis.sismember("queued", link)}

data/lib/cobweb_links.rb CHANGED Viewed

@@ -12,9 +12,9 @@ class CobwebLinks
     @options[:external_urls] = [] unless @options.has_key? :external_urls
     @options[:debug] = false unless @options.has_key? :debug
-    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
-    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
+    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
+    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
   end
   def allowed?(link)

data/lib/cobweb_version.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class CobwebVersion
   # Returns a string of the current version
   def self.version
-    "1.0.25"
+    "1.0.26"
   end
 end

data/lib/export_command.rb CHANGED Viewed

@@ -13,8 +13,16 @@ class ExportCommand
       :raise_exceptions => true,
       :root_path => default_root_path
     }.merge(opts)
+    if options.has_key?(:seed_url_file)
+      filename = options.delete(:seed_url_file)
+      options[:seed_urls] = []
+      File.open(filename, "r") do |f|
+        f.each_line do |line|
+          options[:seed_urls] << line
+        end
+      end
+    end
     statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
       begin
@@ -28,7 +36,7 @@ class ExportCommand
         uri.path.split("/")[0..-2].each do |dir|
           path+="/" unless path.ends_with?("/")
-          path+=dir
+          path+=dir
           if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
             FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
             Dir.mkdir(options[:root_path] + path)
@@ -48,7 +56,7 @@ class ExportCommand
           doc = Nokogiri::HTML.parse(page[:body])
           if doc.search("title").first
-            title = doc.search("title").first.content.gsub(" - ", " ")
+            title = doc.search("title").first.content.gsub(" - ", " ")
           else
             title = uri.path.split("/")[-1]
           end

data/lib/report_command.rb CHANGED Viewed

@@ -5,6 +5,16 @@ class ReportCommand
       options = opts.to_hash.delete_if { |k, v| v.nil?}
       options[:quiet] = !opts[:verbose]
+      if options.has_key?(:seed_url_file)
+        filename = options.delete(:seed_url_file)
+        options[:seed_urls] = []
+        File.open(filename, "r") do |f|
+          f.each_line do |line|
+            options[:seed_urls] << line
+          end
+        end
+      end
       @crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))
       columns = nil
@@ -23,7 +33,6 @@ class ReportCommand
           page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
           page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq
           if !columns
             columns = page.keys.reject{|k| k==:body || k==:links}
             csv << columns.map{|k| k.to_s}

data/lib/sidekiq/cobweb_helper.rb CHANGED Viewed

@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
   require 'sidekiq'
 else
   SIDEKIQ_INSTALLED = false
-  puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
+  puts "sidekiq gem not installed, skipping crawl_worker specs"
 end
 if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
   RESQUE_INSTALLED = true
   require 'resque'
 else
   RESQUE_INSTALLED = false
-  puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
+  puts "resque gem not installed, skipping crawl_job specs"
 end
 module Sidekiq

data/spec/cobweb/cobweb_crawler_spec.rb CHANGED Viewed

@@ -53,39 +53,6 @@ describe CobwebCrawler do
     end
-    context "internal_links" do
-      it "should match internal links without being explicitly set" do
-        crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
-        crawler.crawl(@base_url)
-        queued_links = @redis_mock_object.smembers("queued")
-        queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
-        queued_links.should include("http://localhost:3532/secure")
-      end
-      context "with https" do
-        it "should match https by default" do
-          crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
-          crawler.crawl(@base_url)
-          queued_links = @redis_mock_object.smembers("queued")
-          queued_links.should_not include("https://localhost:3532/secure")
-          queued_links.should include("http://localhost:3532/secure")
-        end
-        it "should not define https as different if treat_https_as_http is true" do
-          crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
-          crawler.crawl(@base_url)
-          queued_links = @redis_mock_object.smembers("queued")
-          queued_links.should_not include("https://localhost:3532/secure")
-          queued_links.should include("http://localhost:3532/secure")
-        end
-        it "should define https as different if treat_https_as_http is false" do
-          crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
-          crawler.crawl(@base_url)
-          queued_links = @redis_mock_object.smembers("queued")
-          queued_links.should_not include("https://localhost:3532/secure")
-          queued_links.should_not include("http://localhost:3532/secure")
-        end
-      end
-    end
     context "storing inbound links" do
       before(:each) do

data/spec/cobweb/cobweb_spec.rb CHANGED Viewed

@@ -61,19 +61,19 @@ describe Cobweb do
     context "with https ignored" do
       it "should ignore https" do
-        result = Cobweb.escape_pattern_for_regex("http://asdf.com")
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com")
         result.should eql "https?://asdf\\.com"
       end
       it "should ignore https" do
-        result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
         result.should eql "https?://asdf\\.com"
       end
     end
     context "without https ignored" do
       it "should ignore https" do
-        result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
-        result.should eql "http://asdf\\.com"
+        result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
+        result.should eql "https://asdf\\.com"
       end
     end

data/spec/samples/sample_site/index.html CHANGED Viewed

@@ -711,8 +711,6 @@
 					<a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
 					<a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
 					<a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
-					<a href="https://localhost:3532/secure">HTTPS Link</a>
 					<a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
 					<a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>

data/spec/spec_helper.rb CHANGED Viewed

@@ -37,11 +37,10 @@ RSpec.configure do |config|
   config.before(:each) {
-    @redis_mock_object = MockRedis.new
-    Redis.stub(:new).and_return(@redis_mock_object)
-    Redis::Namespace.stub(:new).and_return(@redis_mock_object)
+    #redis_mock = double("redis")
+    #redis_mock.stub(:new).and_return(@redis_mock_object)
-    @redis_mock_object.flushdb
+    #redis_mock.flushdb
   }

metadata CHANGED Viewed

@@ -1,127 +1,113 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.0.25
+  version: 1.0.26
 platform: ruby
 authors:
 - Stewart McKee
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-24 00:00:00.000000000 Z
+date: 2015-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '3.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.6'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.6'
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
-  name: awesome_print
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.3'
 - !ruby/object:Gem::Dependency
   name: sinatra
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.4'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.4'
 - !ruby/object:Gem::Dependency
   name: haml
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '4.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '4.0'
 - !ruby/object:Gem::Dependency
   name: redis-namespace
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.3'
 - !ruby/object:Gem::Dependency
   name: json
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.8'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '1.8'
 - !ruby/object:Gem::Dependency
   name: slop
   requirement: !ruby/object:Gem::Requirement