RubyGems - shelob - Versions diffs - 0.1.0.beta3 → 0.1.0.beta4 - Mend

shelob 0.1.0.beta3 → 0.1.0.beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
-  data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
+  metadata.gz: 797b409821f95b66b1a3c7a7852a4f6e1cc16159
+  data.tar.gz: 42946abb1bec2cb2545598b41e6dd97e0a27eb2f
 SHA512:
-  metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
-  data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa
+  metadata.gz: 9cea61a95b7dcebdd8b49552260864e7e01b1ceda91cf537663787cbcec8357022a5c8c0a4c793f423c6fa2b6bc82c6f606e66b58806171e0254abde7bc9ff16
+  data.tar.gz: ef49faa8de4267ba382274e2a4f912a41a57967565d6dce769ec878e1ed070e15fd15c05e001adbb6b1daa8df0a39048f4a3b8ae161fc588056c527a99ccb460

data/bin/shelob CHANGED

@@ -2,9 +2,10 @@
 require 'optparse'
 require 'shelob'
+require 'shelob/version'
 def main args, options
-  puts Shelob::Spider.new(args[0], verbose: options[:verbose]).check
+  puts Shelob::Spider.new(args[0], options).check
   0
 end
@@ -20,11 +21,19 @@ optparse = OptionParser.new do |opts|
   opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do
     options[:verbose] = 2
   end
+  opts.on('-s', '--seed SEED_URL', "Initial seed url if different from root url") do |seed|
+    options[:seed] = seed
+  end
   opts.on_tail('-h', '--help', 'Show this message') do
     puts opts
     exit
   end
+  opts.on_tail('--version', 'Show version') do
+    puts Shelob::VERSION
+  end
 end.parse!
 if ARGV.empty?

data/lib/extractor.rb CHANGED

@@ -10,7 +10,7 @@ module Shelob
     def extract
       content = Nokogiri::HTML(@fetched.body)
       raw = content.css('a').map { |anchor| anchor['href'] }
-      raw.map do |link|
+      raw.reject(&:nil?).map do |link|
         if link.start_with? '/'
           u = URI(@fetched.url)
           "#{u.scheme}://#{u.host}#{link}"

data/lib/shelob/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Shelob
-  VERSION = "0.1.0.beta3"
+  VERSION = "0.1.0.beta4"
 end

data/lib/shelob.rb CHANGED

@@ -12,12 +12,17 @@ module Shelob
     # underneath
     attr_accessor :hostname
+    # The current queue of urls to check
+    attr_accessor :queue
     # Create a new spider with the given hostname and
     # options
     #
     # Valid options:
-    # * Verbose: 0 for no output, 1 for progress output, 2
-    # for verbose output
+    # * verbose: 0 for no output, 1 for progress output, 2
+    #   for verbose output
+    # * seed: Provide an initial seed value, other than the
+    #   root url you're providing
     def initialize hostname, options = {}
       # Data
       @hostname = hostname
@@ -27,7 +32,51 @@ module Shelob
       @chatty = options[:verbose] == 2 ? true : false
       # Internal
-      @queue = [ hostname ]
+      if options[:seed].nil?
+        @queue = [ hostname ]
+      else
+        @queue = [ options[:seed] ]
+      end
+    end
+    # Entry point to the main spider process. This is the
+    # main API point, and will return once the site has
+    # been completely spidered.
+    #
+    # Returns a list of all failed urls, and their
+    # particular error code (404, 500, etc.)
+    def check
+      # set up variables
+      @urls ||= Set.new
+      @failures ||= []
+      # kick the spider off
+      run_spider
+      @failures
+    end
+    # Returns a count of the remaining urls to parse - this
+    # number is only a view of the current state, as more
+    # urls are constantly being added as other urls
+    # resolve.
+    #
+    # This would only be useful to call from another thread
+    # at this time, as check is a blocking call
+    def remaining
+      return @queue.count
+    end
+    # Return the total number of urls that were fetched in
+    # the spidering process.
+    def requests
+      return @urls.count
+    end
+    # Return an array of all urls that were fetched in the
+    # process of spidering the site.
+    def fetched
+      return @urls
     end
     # Notify that a url is about to be processed. Currently
@@ -112,45 +161,5 @@ module Shelob
         post_process_notify url
       end
     end
-    # Entry point to the main spider process. This is the
-    # main API point, and will return once the site has
-    # been completely spidered.
-    #
-    # Returns a list of all failed urls, and their
-    # particular error code (404, 500, etc.)
-    def check
-      # set up variables
-      @urls ||= Set.new
-      @failures ||= []
-      # kick the spider off
-      run_spider
-      @failures
-    end
-    # Returns a count of the remaining urls to parse - this
-    # number is only a view of the current state, as more
-    # urls are constantly being added as other urls
-    # resolve.
-    #
-    # This would only be useful to call from another thread
-    # at this time, as check is a blocking call
-    def remaining
-      return @queue.count
-    end
-    # Return the total number of urls that were fetched in
-    # the spidering process.
-    def requests
-      return @urls.count
-    end
-    # Return an array of all urls that were fetched in the
-    # process of spidering the site.
-    def fetched
-      return @urls
-    end
   end
 end

data/test/test_extractor.rb CHANGED

@@ -15,8 +15,10 @@ describe Shelob::Extractor, "Link extracting module" do
     before do
       @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
       @result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
+      @result3 = LinkResult.new("http://google.com/another", 200, '<html><head><title>hi</title></head><body><a>about</a><a href="http://boop.com">boop</a></body></html>')
       @le = Shelob::Extractor.new(@result)
       @le2 = Shelob::Extractor.new(@result2)
+      @le3 = Shelob::Extractor.new(@result3)
     end
     it "should return a list of the links in the page" do
@@ -31,6 +33,13 @@ describe Shelob::Extractor, "Link extracting module" do
       extracts.must_equal ["http://google.com/about"]
     end
+    it "should gracefully handle empty links" do
+      # we shouldn't get an exception here
+      extracts = @le3.extract
+      extracts.must_be_kind_of Array
+      extracts.must_equal ["http://boop.com"]
+    end
   end # describe
 end # describe

data/test/test_shelob.rb CHANGED

@@ -30,6 +30,12 @@ describe Shelob::Spider, "Link checking spider" do
       spider.wont_be_nil
       spider.hostname.must_equal "http://bmnick.com"
     end
+    it "should be able to take a seperate seed url" do
+      spider = Shelob::Spider.new("http://bmnick.com", seed: "http://bmnick.com/resume")
+      spider.wont_be_nil
+      spider.hostname.must_equal "http://bmnick.com"
+      spider.queue.must_include "http://bmnick.com/resume"
+    end
   end
   describe "when checking links" do
     before do

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: shelob
 version: !ruby/object:Gem::Version
-  version: 0.1.0.beta3
+  version: 0.1.0.beta4
 platform: ruby
 authors:
 - Benjamin Nicholas
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-12-30 00:00:00.000000000 Z
+date: 2013-12-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler