krawler 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. data/bin/krawl +6 -1
  2. data/lib/krawler/version.rb +1 -1
  3. data/lib/krawler.rb +23 -10
  4. metadata +4 -4
data/bin/krawl CHANGED
@@ -12,6 +12,10 @@ optparse = OptionParser.new do |opts|
12
12
  opts.on("-ex [regex]", "Exclude matching paths") do |ex|
13
13
  options[:ex] = ex
14
14
  end
15
+
16
+ opts.on("-r", "Restrict to sub paths") do |r|
17
+ options[:r] = true
18
+ end
15
19
  end
16
20
  optparse.parse!
17
21
 
@@ -21,5 +25,6 @@ if ARGV.empty? || !(ARGV[0] =~ /^http/)
21
25
  end
22
26
 
23
27
  Krawler::Base.new(ARGV[0] || 'http://localhost:3000/', {
24
- :exclude => options[:ex]
28
+ :exclude => options[:ex],
29
+ :restrict => options[:r]
25
30
  }).base
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/krawler.rb CHANGED
@@ -1,24 +1,29 @@
1
1
  require 'krawler/version'
2
2
  require 'mechanize'
3
3
  require 'timeout'
4
+ require 'uri'
5
+ require 'pry'
4
6
 
5
7
  module Krawler
6
8
 
7
9
  class Base
8
10
 
9
11
  def initialize(url, options)
10
- @base = url
11
- @agent = Mechanize.new
12
- @links_to_crawl = [@base]
12
+ url = URI(url)
13
+ @host = "#{url.scheme}://#{url.host}"
14
+ @base_path = url.path
15
+ @agent = Mechanize.new
16
+ @links_to_crawl = [url]
13
17
  @crawled_links = []
14
18
  @bad_links = []
15
19
  @suspect_links = []
16
20
  @exclude = options[:exclude]
21
+ @restrict = options[:restrict]
17
22
  end
18
23
 
19
24
  def base
20
- puts "Crawling #{@base}"
21
-
25
+ puts "Crawling..."
26
+
22
27
  while !@links_to_crawl.empty? do
23
28
  crawl_page(@links_to_crawl.pop)
24
29
  end
@@ -51,11 +56,19 @@ module Krawler
51
56
 
52
57
  return if !page.respond_to?(:links)
53
58
  page.links.each do |new_link|
54
- new_link = new_link.href
55
- if (new_link =~ /^#{Regexp.escape(@base)}/) || (new_link =~ /^\//)
56
- next if @crawled_links.include?(new_link)
57
- next if @exclude && new_link =~ /#{@exclude}/
58
-
59
+ begin
60
+ new_url = URI(new_link.href)
61
+ new_link = new_url.to_s
62
+ rescue ArgumentError # junk link
63
+ next
64
+ end
65
+
66
+ if (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
67
+
68
+ next if @crawled_links.include?(new_link) # don't crawl what we've alread crawled
69
+ next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded matched paths
70
+ next if @restrict && (new_url.path !~ /^#{Regexp.escape(@base_path)}/) # don't crawl outside of our restricted base path
71
+
59
72
  @links_to_crawl << new_link
60
73
  end
61
74
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-14 00:00:00.000000000 Z
12
+ date: 2012-05-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70120618617640 !ruby/object:Gem::Requirement
16
+ requirement: &70155565557800 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70120618617640
24
+ version_requirements: *70155565557800
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com