krawler 1.0.8 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/krawl +5 -0
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +15 -4
- metadata +4 -5
data/bin/krawl
CHANGED
@@ -13,6 +13,10 @@ optparse = OptionParser.new do |opts|
     options[:e] = e
   end
 
+  opts.on('-i', '--include regex', 'Include matching paths regardless of sub path restriction') do |i|
+    options[:i] = i
+  end
+
   opts.on('-s', '--sub-restrict', 'Restrict to sub paths of base url', 'Default: false') do |s|
     options[:s] = true
   end
@@ -55,6 +59,7 @@ end
 
 Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
   :exclude => options[:e],
+  :include => options[:i],
   :restrict => options[:s],
   :threads => options[:c],
   :randomize => options[:r],
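The new -i/--include option feeds an :include pattern into Krawler::Base alongside the existing -s sub-path restriction. A hypothetical invocation, assuming a /docs sub-path and a '^/blog' include pattern chosen purely for illustration:

    krawl http://localhost:3000/docs/ -s -i '^/blog'

Here -s restricts the crawl to paths under /docs, while the -i regex lets matching /blog paths through regardless of that restriction.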
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,6 +3,7 @@ require 'mechanize'
 require 'timeout'
 require 'uri'
 require 'thread'
+
 module Krawler
 
   class Base
@@ -16,12 +17,13 @@ module Krawler
       @bad_links = []
       @suspect_links = []
       @exclude = options[:exclude]
+      @include = options[:include]
       @restrict = options[:restrict]
       @randomize = options[:randomize]
       @threads = options[:threads] || 1
       @username = options[:username]
      @password = options[:password]
-      @login_url
+      @login_url = options[:login_url]
       @mutex = Mutex.new
       @agent = Mechanize.new
       @agent.user_agent = 'Krawler'
@@ -149,10 +151,19 @@ module Krawler
 
       if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
 
-        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link)
+        next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've alread crawled
         next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
-
-
+
+        if @restrict # don't crawl outside of our restricted base path
+          if @include && new_url.path =~ /#{@include}/ # unless we match our inclusion
+            # ignore
+          else
+            if new_url.path !~ /^#{Regexp.escape(@base_path)}/
+              next
+            end
+          end
+        end
+
         @links_to_crawl << new_link
       end
     end
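The effect of the new branch is that :include carves out an exception to :restrict: a link whose path falls outside the restricted base path is still queued when it matches the include regex. A minimal standalone sketch of that decision (not the gem's own code), using hypothetical values for the base path, include pattern, and candidate link:

require 'uri'

# Hypothetical values, purely for illustration.
base_path = '/docs'       # path the crawl is restricted to (:restrict)
include_r = '^/blog'      # :include regex passed via -i
restrict  = true

new_url = URI.parse('http://localhost:3000/blog/post-1')

skip = if restrict
         if include_r && new_url.path =~ /#{include_r}/
           false  # matches the inclusion, so it is queued despite being outside /docs
         else
           new_url.path !~ /^#{Regexp.escape(base_path)}/  # outside the base path -> skip
         end
       else
         false
       end

puts skip ? 'skipped' : 'queued'   # => queued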
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: krawler
 version: !ruby/object:Gem::Version
-  version: 1.0.8
+  version: 1.0.9
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-11-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
-  requirement: &
+  requirement: &70292261892960 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,7 +21,7 @@ dependencies:
         version: 2.5.1
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70292261892960
 description: Simple little website crawler.
 email:
 - mike@urlgonomics.com
@@ -65,4 +65,3 @@ signing_key:
 specification_version: 3
 summary: ''
 test_files: []
-has_rdoc: