shelob 0.1.0.beta3 → 0.1.0.beta4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
4
- data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
3
+ metadata.gz: 797b409821f95b66b1a3c7a7852a4f6e1cc16159
4
+ data.tar.gz: 42946abb1bec2cb2545598b41e6dd97e0a27eb2f
5
5
  SHA512:
6
- metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
7
- data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa
6
+ metadata.gz: 9cea61a95b7dcebdd8b49552260864e7e01b1ceda91cf537663787cbcec8357022a5c8c0a4c793f423c6fa2b6bc82c6f606e66b58806171e0254abde7bc9ff16
7
+ data.tar.gz: ef49faa8de4267ba382274e2a4f912a41a57967565d6dce769ec878e1ed070e15fd15c05e001adbb6b1daa8df0a39048f4a3b8ae161fc588056c527a99ccb460
data/bin/shelob CHANGED
@@ -2,9 +2,10 @@
2
2
 
3
3
  require 'optparse'
4
4
  require 'shelob'
5
+ require 'shelob/version'
5
6
 
6
7
  def main args, options
7
- puts Shelob::Spider.new(args[0], verbose: options[:verbose]).check
8
+ puts Shelob::Spider.new(args[0], options).check
8
9
 
9
10
  0
10
11
  end
@@ -20,11 +21,19 @@ optparse = OptionParser.new do |opts|
20
21
  opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do
21
22
  options[:verbose] = 2
22
23
  end
24
+
25
+ opts.on('-s', '--seed SEED_URL', "Initial seed url if different from root url") do |seed|
26
+ options[:seed] = seed
27
+ end
23
28
 
24
29
  opts.on_tail('-h', '--help', 'Show this message') do
25
30
  puts opts
26
31
  exit
27
32
  end
33
+
34
+ opts.on_tail('--version', 'Show version') do
35
+ puts Shelob::VERSION
36
+ end
28
37
  end.parse!
29
38
 
30
39
  if ARGV.empty?
data/lib/extractor.rb CHANGED
@@ -10,7 +10,7 @@ module Shelob
10
10
  def extract
11
11
  content = Nokogiri::HTML(@fetched.body)
12
12
  raw = content.css('a').map { |anchor| anchor['href'] }
13
- raw.map do |link|
13
+ raw.reject(&:nil?).map do |link|
14
14
  if link.start_with? '/'
15
15
  u = URI(@fetched.url)
16
16
  "#{u.scheme}://#{u.host}#{link}"
@@ -1,3 +1,3 @@
1
1
  module Shelob
2
- VERSION = "0.1.0.beta3"
2
+ VERSION = "0.1.0.beta4"
3
3
  end
data/lib/shelob.rb CHANGED
@@ -12,12 +12,17 @@ module Shelob
12
12
  # underneath
13
13
  attr_accessor :hostname
14
14
 
15
+ # The current queue of urls to check
16
+ attr_accessor :queue
17
+
15
18
  # Create a new spider with the given hostname and
16
19
  # options
17
20
  #
18
21
  # Valid options:
19
- # * Verbose: 0 for no output, 1 for progress output, 2
20
- # for verbose output
22
+ # * verbose: 0 for no output, 1 for progress output, 2
23
+ # for verbose output
24
+ # * seed: Provide an initial seed value, other than the
25
+ # root url you're providing
21
26
  def initialize hostname, options = {}
22
27
  # Data
23
28
  @hostname = hostname
@@ -27,7 +32,51 @@ module Shelob
27
32
  @chatty = options[:verbose] == 2 ? true : false
28
33
 
29
34
  # Internal
30
- @queue = [ hostname ]
35
+ if options[:seed].nil?
36
+ @queue = [ hostname ]
37
+ else
38
+ @queue = [ options[:seed] ]
39
+ end
40
+ end
41
+
42
+ # Entry point to the main spider process. This is the
43
+ # main API point, and will return once the site has
44
+ # been completely spidered.
45
+ #
46
+ # Returns a list of all failed urls, and their
47
+ # particular error code (404, 500, etc.)
48
+ def check
49
+ # set up variables
50
+ @urls ||= Set.new
51
+ @failures ||= []
52
+
53
+ # kick the spider off
54
+ run_spider
55
+
56
+ @failures
57
+ end
58
+
59
+ # Returns a count of the remaining urls to parse - this
60
+ # number is only a view of the current state, as more
61
+ # urls are constantly being added as other urls
62
+ # resolve.
63
+ #
64
+ # This would only be useful to call from another thread
65
+ # at this time, as check is a blocking call
66
+ def remaining
67
+ return @queue.count
68
+ end
69
+
70
+ # Return the total number of urls that were fetched in
71
+ # the spidering process.
72
+ def requests
73
+ return @urls.count
74
+ end
75
+
76
+ # Return an array of all urls that were fetched in the
77
+ # process of spidering the site.
78
+ def fetched
79
+ return @urls
31
80
  end
32
81
 
33
82
  # Notify that a url is about to be processed. Currently
@@ -112,45 +161,5 @@ module Shelob
112
161
  post_process_notify url
113
162
  end
114
163
  end
115
-
116
- # Entry point to the main spider process. This is the
117
- # main API point, and will return once the site has
118
- # been completely spidered.
119
- #
120
- # Returns a list of all failed urls, and their
121
- # particular error code (404, 500, etc.)
122
- def check
123
- # set up variables
124
- @urls ||= Set.new
125
- @failures ||= []
126
-
127
- # kick the spider off
128
- run_spider
129
-
130
- @failures
131
- end
132
-
133
- # Returns a count of the remaining urls to parse - this
134
- # number is only a view of the current state, as more
135
- # urls are constantly being added as other urls
136
- # resolve.
137
- #
138
- # This would only be useful to call from another thread
139
- # at this time, as check is a blocking call
140
- def remaining
141
- return @queue.count
142
- end
143
-
144
- # Return the total number of urls that were fetched in
145
- # the spidering process.
146
- def requests
147
- return @urls.count
148
- end
149
-
150
- # Return an array of all urls that were fetched in the
151
- # process of spidering the site.
152
- def fetched
153
- return @urls
154
- end
155
164
  end
156
165
  end
@@ -15,8 +15,10 @@ describe Shelob::Extractor, "Link extracting module" do
15
15
  before do
16
16
  @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
17
17
  @result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
18
+ @result3 = LinkResult.new("http://google.com/another", 200, '<html><head><title>hi</title></head><body><a>about</a><a href="http://boop.com">boop</a></body></html>')
18
19
  @le = Shelob::Extractor.new(@result)
19
20
  @le2 = Shelob::Extractor.new(@result2)
21
+ @le3 = Shelob::Extractor.new(@result3)
20
22
  end
21
23
 
22
24
  it "should return a list of the links in the page" do
@@ -31,6 +33,13 @@ describe Shelob::Extractor, "Link extracting module" do
31
33
  extracts.must_equal ["http://google.com/about"]
32
34
  end
33
35
 
36
+ it "should gracefully handle empty links" do
37
+ # we shouldn't get an exception here
38
+ extracts = @le3.extract
39
+ extracts.must_be_kind_of Array
40
+ extracts.must_equal ["http://boop.com"]
41
+ end
42
+
34
43
  end # describe
35
44
 
36
45
  end # describe
data/test/test_shelob.rb CHANGED
@@ -30,6 +30,12 @@ describe Shelob::Spider, "Link checking spider" do
30
30
  spider.wont_be_nil
31
31
  spider.hostname.must_equal "http://bmnick.com"
32
32
  end
33
+ it "should be able to take a seperate seed url" do
34
+ spider = Shelob::Spider.new("http://bmnick.com", seed: "http://bmnick.com/resume")
35
+ spider.wont_be_nil
36
+ spider.hostname.must_equal "http://bmnick.com"
37
+ spider.queue.must_include "http://bmnick.com/resume"
38
+ end
33
39
  end
34
40
  describe "when checking links" do
35
41
  before do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: shelob
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.beta3
4
+ version: 0.1.0.beta4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Nicholas
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-30 00:00:00.000000000 Z
11
+ date: 2013-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler