shelob 0.1.0.beta3 → 0.1.0.beta4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
4
- data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
3
+ metadata.gz: 797b409821f95b66b1a3c7a7852a4f6e1cc16159
4
+ data.tar.gz: 42946abb1bec2cb2545598b41e6dd97e0a27eb2f
5
5
  SHA512:
6
- metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
7
- data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa
6
+ metadata.gz: 9cea61a95b7dcebdd8b49552260864e7e01b1ceda91cf537663787cbcec8357022a5c8c0a4c793f423c6fa2b6bc82c6f606e66b58806171e0254abde7bc9ff16
7
+ data.tar.gz: ef49faa8de4267ba382274e2a4f912a41a57967565d6dce769ec878e1ed070e15fd15c05e001adbb6b1daa8df0a39048f4a3b8ae161fc588056c527a99ccb460
data/bin/shelob CHANGED
@@ -2,9 +2,10 @@
2
2
 
3
3
  require 'optparse'
4
4
  require 'shelob'
5
+ require 'shelob/version'
5
6
 
6
7
  def main args, options
7
- puts Shelob::Spider.new(args[0], verbose: options[:verbose]).check
8
+ puts Shelob::Spider.new(args[0], options).check
8
9
 
9
10
  0
10
11
  end
@@ -20,11 +21,19 @@ optparse = OptionParser.new do |opts|
20
21
  opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do
21
22
  options[:verbose] = 2
22
23
  end
24
+
25
+ opts.on('-s', '--seed SEED_URL', "Initial seed url if different from root url") do |seed|
26
+ options[:seed] = seed
27
+ end
23
28
 
24
29
  opts.on_tail('-h', '--help', 'Show this message') do
25
30
  puts opts
26
31
  exit
27
32
  end
33
+
34
+ opts.on_tail('--version', 'Show version') do
35
+ puts Shelob::VERSION
36
+ end
28
37
  end.parse!
29
38
 
30
39
  if ARGV.empty?
data/lib/extractor.rb CHANGED
@@ -10,7 +10,7 @@ module Shelob
10
10
  def extract
11
11
  content = Nokogiri::HTML(@fetched.body)
12
12
  raw = content.css('a').map { |anchor| anchor['href'] }
13
- raw.map do |link|
13
+ raw.reject(&:nil?).map do |link|
14
14
  if link.start_with? '/'
15
15
  u = URI(@fetched.url)
16
16
  "#{u.scheme}://#{u.host}#{link}"
data/lib/shelob/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Shelob
2
- VERSION = "0.1.0.beta3"
2
+ VERSION = "0.1.0.beta4"
3
3
  end
data/lib/shelob.rb CHANGED
@@ -12,12 +12,17 @@ module Shelob
12
12
  # underneath
13
13
  attr_accessor :hostname
14
14
 
15
+ # The current queue of urls to check
16
+ attr_accessor :queue
17
+
15
18
  # Create a new spider with the given hostname and
16
19
  # options
17
20
  #
18
21
  # Valid options:
19
- # * Verbose: 0 for no output, 1 for progress output, 2
20
- # for verbose output
22
+ # * verbose: 0 for no output, 1 for progress output, 2
23
+ # for verbose output
24
+ # * seed: Provide an initial seed value, other than the
25
+ # root url you're providing
21
26
  def initialize hostname, options = {}
22
27
  # Data
23
28
  @hostname = hostname
@@ -27,7 +32,51 @@ module Shelob
27
32
  @chatty = options[:verbose] == 2 ? true : false
28
33
 
29
34
  # Internal
30
- @queue = [ hostname ]
35
+ if options[:seed].nil?
36
+ @queue = [ hostname ]
37
+ else
38
+ @queue = [ options[:seed] ]
39
+ end
40
+ end
41
+
42
+ # Entry point to the main spider process. This is the
43
+ # main API point, and will return once the site has
44
+ # been completely spidered.
45
+ #
46
+ # Returns a list of all failed urls, and their
47
+ # particular error code (404, 500, etc.)
48
+ def check
49
+ # set up variables
50
+ @urls ||= Set.new
51
+ @failures ||= []
52
+
53
+ # kick the spider off
54
+ run_spider
55
+
56
+ @failures
57
+ end
58
+
59
+ # Returns a count of the remaining urls to parse - this
60
+ # number is only a view of the current state, as more
61
+ # urls are constantly being added as other urls
62
+ # resolve.
63
+ #
64
+ # This would only be useful to call from another thread
65
+ # at this time, as check is a blocking call
66
+ def remaining
67
+ return @queue.count
68
+ end
69
+
70
+ # Return the total number of urls that were fetched in
71
+ # the spidering process.
72
+ def requests
73
+ return @urls.count
74
+ end
75
+
76
+ # Return an array of all urls that were fetched in the
77
+ # process of spidering the site.
78
+ def fetched
79
+ return @urls
31
80
  end
32
81
 
33
82
  # Notify that a url is about to be processed. Currently
@@ -112,45 +161,5 @@ module Shelob
112
161
  post_process_notify url
113
162
  end
114
163
  end
115
-
116
- # Entry point to the main spider process. This is the
117
- # main API point, and will return once the site has
118
- # been completely spidered.
119
- #
120
- # Returns a list of all failed urls, and their
121
- # particular error code (404, 500, etc.)
122
- def check
123
- # set up variables
124
- @urls ||= Set.new
125
- @failures ||= []
126
-
127
- # kick the spider off
128
- run_spider
129
-
130
- @failures
131
- end
132
-
133
- # Returns a count of the remaining urls to parse - this
134
- # number is only a view of the current state, as more
135
- # urls are constantly being added as other urls
136
- # resolve.
137
- #
138
- # This would only be useful to call from another thread
139
- # at this time, as check is a blocking call
140
- def remaining
141
- return @queue.count
142
- end
143
-
144
- # Return the total number of urls that were fetched in
145
- # the spidering process.
146
- def requests
147
- return @urls.count
148
- end
149
-
150
- # Return an array of all urls that were fetched in the
151
- # process of spidering the site.
152
- def fetched
153
- return @urls
154
- end
155
164
  end
156
165
  end
@@ -15,8 +15,10 @@ describe Shelob::Extractor, "Link extracting module" do
15
15
  before do
16
16
  @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
17
17
  @result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
18
+ @result3 = LinkResult.new("http://google.com/another", 200, '<html><head><title>hi</title></head><body><a>about</a><a href="http://boop.com">boop</a></body></html>')
18
19
  @le = Shelob::Extractor.new(@result)
19
20
  @le2 = Shelob::Extractor.new(@result2)
21
+ @le3 = Shelob::Extractor.new(@result3)
20
22
  end
21
23
 
22
24
  it "should return a list of the links in the page" do
@@ -31,6 +33,13 @@ describe Shelob::Extractor, "Link extracting module" do
31
33
  extracts.must_equal ["http://google.com/about"]
32
34
  end
33
35
 
36
+ it "should gracefully handle empty links" do
37
+ # we shouldn't get an exception here
38
+ extracts = @le3.extract
39
+ extracts.must_be_kind_of Array
40
+ extracts.must_equal ["http://boop.com"]
41
+ end
42
+
34
43
  end # describe
35
44
 
36
45
  end # describe
data/test/test_shelob.rb CHANGED
@@ -30,6 +30,12 @@ describe Shelob::Spider, "Link checking spider" do
30
30
  spider.wont_be_nil
31
31
  spider.hostname.must_equal "http://bmnick.com"
32
32
  end
33
+ it "should be able to take a seperate seed url" do
34
+ spider = Shelob::Spider.new("http://bmnick.com", seed: "http://bmnick.com/resume")
35
+ spider.wont_be_nil
36
+ spider.hostname.must_equal "http://bmnick.com"
37
+ spider.queue.must_include "http://bmnick.com/resume"
38
+ end
33
39
  end
34
40
  describe "when checking links" do
35
41
  before do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: shelob
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.beta3
4
+ version: 0.1.0.beta4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Nicholas
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-30 00:00:00.000000000 Z
11
+ date: 2013-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler