RubyGems - shelob - Versions diffs - 0.1.0.beta3 → 0.1.0.beta4 - Mend

shelob 0.1.0.beta3 → 0.1.0.beta4

Files changed (8) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
-  data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
+  metadata.gz: 797b409821f95b66b1a3c7a7852a4f6e1cc16159
+  data.tar.gz: 42946abb1bec2cb2545598b41e6dd97e0a27eb2f
 SHA512:
-  metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
-  data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa
+  metadata.gz: 9cea61a95b7dcebdd8b49552260864e7e01b1ceda91cf537663787cbcec8357022a5c8c0a4c793f423c6fa2b6bc82c6f606e66b58806171e0254abde7bc9ff16
+  data.tar.gz: ef49faa8de4267ba382274e2a4f912a41a57967565d6dce769ec878e1ed070e15fd15c05e001adbb6b1daa8df0a39048f4a3b8ae161fc588056c527a99ccb460

data/bin/shelob CHANGED Viewed

@@ -2,9 +2,10 @@
 require 'optparse'
 require 'shelob'
+require 'shelob/version'
 def main args, options
-  puts Shelob::Spider.new(args[0], verbose: options[:verbose]).check
+  puts Shelob::Spider.new(args[0], options).check
   0
 end
@@ -20,11 +21,19 @@ optparse = OptionParser.new do |opts|
   opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do
     options[:verbose] = 2
   end
+  opts.on('-s', '--seed SEED_URL', "Initial seed url if different from root url") do |seed|
+    options[:seed] = seed
+  end
   opts.on_tail('-h', '--help', 'Show this message') do
     puts opts
     exit
   end
+  opts.on_tail('--version', 'Show version') do
+    puts Shelob::VERSION
+  end
 end.parse!
 if ARGV.empty?

data/lib/extractor.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Shelob
     def extract
       content = Nokogiri::HTML(@fetched.body)
       raw = content.css('a').map { |anchor| anchor['href'] }
-      raw.map do |link|
+      raw.reject(&:nil?).map do |link|
         if link.start_with? '/'
           u = URI(@fetched.url)
           "#{u.scheme}://#{u.host}#{link}"

data/lib/shelob/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Shelob
-  VERSION = "0.1.0.beta3"
+  VERSION = "0.1.0.beta4"
 end

data/lib/shelob.rb CHANGED Viewed

@@ -12,12 +12,17 @@ module Shelob
     # underneath
     attr_accessor :hostname
+    # The current queue of urls to check
+    attr_accessor :queue
     # Create a new spider with the given hostname and
     # options
     #
     # Valid options:
-    # * Verbose: 0 for no output, 1 for progress output, 2
-    # for verbose output
+    # * verbose: 0 for no output, 1 for progress output, 2
+    #   for verbose output
+    # * seed: url to start spidering from, when it differs
+    #   from the root url (hostname); defaults to hostname
     def initialize hostname, options = {}
       # Data
       @hostname = hostname
@@ -27,7 +32,51 @@ module Shelob
       @chatty = options[:verbose] == 2 ? true : false
       # Internal
-      @queue = [ hostname ]
+      if options[:seed].nil?
+        @queue = [ hostname ]
+      else
+        @queue = [ options[:seed] ]
+      end
+    end
+    # Entry point to the main spider process. This is the
+    # main API point, and will return once the site has
+    # been completely spidered.
+    #
+    # Returns a list of all failed urls, and their
+    # particular error code (404, 500, etc.)
+    def check
+      # set up variables
+      @urls ||= Set.new
+      @failures ||= []
+      # kick the spider off
+      run_spider
+      @failures
+    end
+    # Returns a count of the remaining urls to parse - this
+    # number is only a view of the current state, as more
+    # urls are constantly being added as other urls
+    # resolve.
+    #
+    # This would only be useful to call from another thread
+    # at this time, as check is a blocking call
+    def remaining
+      return @queue.count
+    end
+    # Return the total number of urls that were fetched in
+    # the spidering process.
+    def requests
+      return @urls.count
+    end
+    # Return the set of all urls that were fetched in the
+    # process of spidering the site.
+    def fetched
+      return @urls
     end
     # Notify that a url is about to be processed. Currently
@@ -112,45 +161,5 @@ module Shelob
         post_process_notify url
       end
     end
-    # Entry point to the main spider process. This is the
-    # main API point, and will return once the site has
-    # been completely spidered.
-    #
-    # Returns a list of all failed urls, and their
-    # particular error code (404, 500, etc.)
-    def check
-      # set up variables
-      @urls ||= Set.new
-      @failures ||= []
-      # kick the spider off
-      run_spider
-      @failures
-    end
-    # Returns a count of the remaining urls to parse - this
-    # number is only a view of the current state, as more
-    # urls are constantly being added as other urls
-    # resolve.
-    #
-    # This would only be useful to call from another thread
-    # at this time, as check is a blocking call
-    def remaining
-      return @queue.count
-    end
-    # Return the total number of urls that were fetched in
-    # the spidering process.
-    def requests
-      return @urls.count
-    end
-    # Return an array of all urls that were fetched in the
-    # process of spidering the site.
-    def fetched
-      return @urls
-    end
   end
 end

data/test/test_extractor.rb CHANGED Viewed

@@ -15,8 +15,10 @@ describe Shelob::Extractor, "Link extracting module" do
     before do
       @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
       @result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
+      @result3 = LinkResult.new("http://google.com/another", 200, '<html><head><title>hi</title></head><body><a>about</a><a href="http://boop.com">boop</a></body></html>')
       @le = Shelob::Extractor.new(@result)
       @le2 = Shelob::Extractor.new(@result2)
+      @le3 = Shelob::Extractor.new(@result3)
     end
     it "should return a list of the links in the page" do
@@ -31,6 +33,13 @@ describe Shelob::Extractor, "Link extracting module" do
       extracts.must_equal ["http://google.com/about"]
     end
+    it "should gracefully handle empty links" do
+      # we shouldn't get an exception here
+      extracts = @le3.extract
+      extracts.must_be_kind_of Array
+      extracts.must_equal ["http://boop.com"]
+    end
   end # describe
 end # describe

data/test/test_shelob.rb CHANGED Viewed

@@ -30,6 +30,12 @@ describe Shelob::Spider, "Link checking spider" do
       spider.wont_be_nil
       spider.hostname.must_equal "http://bmnick.com"
     end
+    it "should be able to take a seperate seed url" do
+      spider = Shelob::Spider.new("http://bmnick.com", seed: "http://bmnick.com/resume")
+      spider.wont_be_nil
+      spider.hostname.must_equal "http://bmnick.com"
+      spider.queue.must_include "http://bmnick.com/resume"
+    end
   end
   describe "when checking links" do
     before do

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: shelob
 version: !ruby/object:Gem::Version
-  version: 0.1.0.beta3
+  version: 0.1.0.beta4
 platform: ruby
 authors:
 - Benjamin Nicholas
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-12-30 00:00:00.000000000 Z
+date: 2013-12-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler