spidr 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/spidr/rules.rb CHANGED
@@ -7,25 +7,43 @@ module Spidr
7
7
  # Reject rules
8
8
  attr_reader :reject
9
9
 
10
+ #
11
+ # Creates a new Rules object.
12
+ #
13
+ # @param [Hash] options
14
+ # Additional options.
15
+ #
16
+ # @option options [Array<String, Regexp, Proc>] :accept
17
+ # The patterns to accept data with.
18
+ #
19
+ # @option options [Array<String, Regexp, Proc>] :reject
20
+ # The patterns to reject data with.
21
+ #
10
22
  def initialize(options={})
11
- @accept = (options[:accept] || [])
12
- @reject = (options[:reject] || [])
23
+ @accept = []
24
+ @reject = []
25
+
26
+ @accept += options[:accept] if options[:accept]
27
+ @reject += options[:reject] if options[:reject]
13
28
  end
14
29
 
15
30
  #
16
- # Returns +true+ if the _field_ is accepted by the rules,
17
- # returns +false+ otherwise.
31
+ # Determines whether the data should be accepted or rejected.
32
+ #
33
+ # @return [Boolean]
34
+ # Specifies whether the given data was accepted, using the rules
35
+ # acceptance patterns.
18
36
  #
19
- def accept?(field)
37
+ def accept?(data)
20
38
  unless @accept.empty?
21
39
  @accept.each do |rule|
22
- return true if test_field(field,rule)
40
+ return true if test_data(data,rule)
23
41
  end
24
42
 
25
43
  return false
26
44
  else
27
45
  @reject.each do |rule|
28
- return false if test_field(field,rule)
46
+ return false if test_data(data,rule)
29
47
  end
30
48
 
31
49
  return true
@@ -33,27 +51,31 @@ module Spidr
33
51
  end
34
52
 
35
53
  #
36
- # Returns +true+ if the _field_ is rejected by the rules,
37
- # returns +false+ otherwise.
54
+ # Determines whether the data should be rejected or accepted.
38
55
  #
39
- def reject?(field)
40
- !(accept?(field))
56
+ # @return [Boolean]
57
+ # Specifies whether the given data was rejected, using the rules
58
+ # rejection patterns.
59
+ #
60
+ def reject?(data)
61
+ !(accept?(data))
41
62
  end
42
63
 
43
64
  protected
44
65
 
45
66
  #
46
- # Tests the specified _field_ against the specified _rule_. Returns
47
- # +true+ when the _rule_ matches the specified _field_, returns
48
- # +false+ otherwise.
67
+ # Tests the given data against a given pattern.
68
+ #
69
+ # @return [Boolean]
70
+ # Specifies whether the given data matched the pattern.
49
71
  #
50
- def test_field(field,rule)
72
+ def test_data(data,rule)
51
73
  if rule.kind_of?(Proc)
52
- return (rule.call(field) == true)
74
+ return (rule.call(data) == true)
53
75
  elsif rule.kind_of?(Regexp)
54
- return !((field.to_s =~ rule).nil?)
76
+ return !((data.to_s =~ rule).nil?)
55
77
  else
56
- return field == rule
78
+ return data == rule
57
79
  end
58
80
  end
59
81
 
data/lib/spidr/spidr.rb CHANGED
@@ -4,43 +4,93 @@ module Spidr
4
4
  # Common proxy port.
5
5
  COMMON_PROXY_PORT = 8080
6
6
 
7
+ # Default proxy information.
8
+ DEFAULT_PROXY = {
9
+ :host => nil,
10
+ :port => COMMON_PROXY_PORT,
11
+ :user => nil,
12
+ :password => nil
13
+ }
14
+
15
+ #
16
+ # Proxy information used by all newly created Agent objects by default.
7
17
  #
8
- # Returns the +Hash+ of the Spidr proxy information.
18
+ # @return [Hash]
19
+ # The Spidr proxy information.
9
20
  #
10
21
  def Spidr.proxy
11
- @@spidr_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
22
+ @@spidr_proxy ||= DEFAULT_PROXY
23
+ end
24
+
25
+ #
26
+ # Sets the proxy information used by Agent objects.
27
+ #
28
+ # @param [Hash] new_proxy
29
+ # The new proxy information.
30
+ #
31
+ # @option new_proxy [String] :host
32
+ # The host-name of the proxy.
33
+ #
34
+ # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
35
+ # The port of the proxy.
36
+ #
37
+ # @option new_proxy [String] :user
38
+ # The user to authenticate with the proxy as.
39
+ #
40
+ # @option new_proxy [String] :password
41
+ # The password to authenticate with the proxy.
42
+ #
43
+ # @return [Hash]
44
+ # The new proxy information.
45
+ #
46
+ def Spidr.proxy=(new_proxy)
47
+ @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
12
48
  end
13
49
 
14
50
  #
15
- # Returns the Spidr User-Agent
51
+ # Disables the proxy settings used by all newly created Agent objects.
52
+ #
53
+ def Spidr.disable_proxy!
54
+ @@spidr_proxy = DEFAULT_PROXY
55
+ return true
56
+ end
57
+
58
+ #
59
+ # The User-Agent string used by all Agent objects by default.
60
+ #
61
+ # @return [String]
62
+ # The Spidr User-Agent string.
16
63
  #
17
64
  def Spidr.user_agent
18
65
  @@spidr_user_agent ||= nil
19
66
  end
20
67
 
21
68
  #
22
- # Sets the Spidr Web User-Agent to the specified _new_agent_.
69
+ # Sets the Spidr User-Agent string.
70
+ #
71
+ # @param [String] new_agent
72
+ # The new User-Agent string.
23
73
  #
24
74
  def Spidr.user_agent=(new_agent)
25
75
  @@spidr_user_agent = new_agent
26
76
  end
27
77
 
28
78
  #
29
- # See Agent.start_at.
79
+ # @see Agent.start_at
30
80
  #
31
81
  def Spidr.start_at(url,options={},&block)
32
82
  Agent.start_at(url,options,&block)
33
83
  end
34
84
 
35
85
  #
36
- # See Agent.host.
86
+ # @see Agent.host
37
87
  #
38
88
  def Spidr.host(name,options={},&block)
39
89
  Agent.host(name,options,&block)
40
90
  end
41
91
 
42
92
  #
43
- # See Agent.site.
93
+ # @see Agent.site
44
94
  #
45
95
  def Spidr.site(url,options={},&block)
46
96
  Agent.site(url,options,&block)
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  module Spidr
2
- VERSION = '0.1.9'
2
+ # Spidr version
3
+ VERSION = '0.2.0'
3
4
  end
@@ -0,0 +1,61 @@
1
+ require 'spidr/actions'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
6
+ describe Actions do
7
+ before(:all) do
8
+ @url = URI('http://spidr.rubyforge.org/')
9
+ end
10
+
11
+ it "should be able to pause spidering" do
12
+ count = 0
13
+ agent = Agent.host('spidr.rubyforge.org') do |spider|
14
+ spider.every_page do |page|
15
+ count += 1
16
+ spider.pause! if count >= 2
17
+ end
18
+ end
19
+
20
+ agent.should be_paused
21
+ agent.history.length.should == 2
22
+ end
23
+
24
+ it "should be able to continue spidering after being paused" do
25
+ agent = Agent.new do |spider|
26
+ spider.every_page do |page|
27
+ spider.pause!
28
+ end
29
+ end
30
+
31
+ agent.enqueue(@url)
32
+ agent.continue!
33
+
34
+ agent.visited?(@url).should == true
35
+ end
36
+
37
+ it "should allow skipping of enqueued links" do
38
+ agent = Agent.new do |spider|
39
+ spider.every_url do |url|
40
+ spider.skip_link!
41
+ end
42
+ end
43
+
44
+ agent.enqueue(@url)
45
+
46
+ agent.queue.should be_empty
47
+ end
48
+
49
+ it "should allow skipping of visited pages" do
50
+ agent = Agent.new do |spider|
51
+ spider.every_page do |url|
52
+ spider.skip_page!
53
+ end
54
+ end
55
+
56
+ agent.visit_page(@url)
57
+
58
+ agent.history.should == Set[@url]
59
+ agent.queue.should be_empty
60
+ end
61
+ end
data/spec/agent_spec.rb CHANGED
@@ -20,19 +20,38 @@ describe Agent do
20
20
 
21
21
  it "should be able to restore the history" do
22
22
  agent = Agent.new
23
- previous_history = [URI('http://www.example.com')]
23
+ previous_history = Set[URI('http://www.example.com')]
24
24
 
25
25
  agent.history = previous_history
26
26
  agent.history.should == previous_history
27
27
  end
28
28
 
29
- it "should convert new histories to an Array of URIs" do
29
+ it "should convert new histories to a Set of URIs" do
30
30
  agent = Agent.new
31
31
  previous_history = ['http://www.example.com']
32
+ expected_history = Set[URI('http://www.example.com')]
32
33
 
33
34
  agent.history = previous_history
34
35
  agent.history.should_not == previous_history
35
- agent.history.should == previous_history.map { |url| URI(url) }
36
+ agent.history.should == expected_history
37
+ end
38
+
39
+ it "should be able to restore the failures" do
40
+ agent = Agent.new
41
+ previous_failures = Set[URI('http://localhost/')]
42
+
43
+ agent.failures = previous_failures
44
+ agent.failures.should == previous_failures
45
+ end
46
+
47
+ it "should convert new failures to a Set of URIs" do
48
+ agent = Agent.new
49
+ previous_failures = ['http://localhost/']
50
+ expected_failures = Set[URI('http://localhost/')]
51
+
52
+ agent.failures = previous_failures
53
+ agent.failures.should_not == previous_failures
54
+ agent.failures.should == expected_failures
36
55
  end
37
56
 
38
57
  it "should be able to restore the queue" do
@@ -46,37 +65,11 @@ describe Agent do
46
65
  it "should convert new queues to an Array of URIs" do
47
66
  agent = Agent.new
48
67
  previous_queue = ['http://www.example.com']
68
+ expected_queue = [URI('http://www.example.com')]
49
69
 
50
70
  agent.queue = previous_queue
51
71
  agent.queue.should_not == previous_queue
52
- agent.queue.should == previous_queue.map { |url| URI(url) }
53
- end
54
-
55
- it "should be able to pause spidering" do
56
- count = 0
57
- agent = Agent.host('spidr.rubyforge.org') do |spider|
58
- spider.every_page do |page|
59
- count += 1
60
- spider.pause! if count >= 2
61
- end
62
- end
63
-
64
- agent.should be_paused
65
- agent.history.length.should == 2
66
- end
67
-
68
- it "should be able to continue spidering after being paused" do
69
- agent = Agent.new do |spider|
70
- spider.enqueue('http://spidr.rubyforge.org/')
71
- spider.every_page do |page|
72
- spider.pause!
73
- end
74
- end
75
-
76
- agent.pause!
77
- agent.continue!
78
-
79
- agent.visited?('http://spidr.rubyforge.org/').should == true
72
+ agent.queue.should == expected_queue
80
73
  end
81
74
 
82
75
  it "should provide a to_hash method that returns the queue and history" do
@@ -0,0 +1,39 @@
1
+ require 'spidr/extensions/uri'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe URI do
6
+ describe "expand_path" do
7
+ it "should preserve single directory paths" do
8
+ URI.expand_path('path').should == 'path'
9
+ end
10
+
11
+ it "should preserve trailing '/'" do
12
+ URI.expand_path('test/path/').should == 'test/path/'
13
+ end
14
+
15
+ it "should remove multiple '/' characters" do
16
+ URI.expand_path('///test///path///').should == '/test/path/'
17
+ end
18
+
19
+ it "should remove '.' directories from the path" do
20
+ URI.expand_path('test/./path').should == 'test/path'
21
+ end
22
+
23
+ it "should handle '..' directories properly" do
24
+ URI.expand_path('test/../path').should == 'path'
25
+ end
26
+
27
+ it "should limit the number of '..' directories resolved" do
28
+ URI.expand_path('/test/../../../..').should == '/'
29
+ end
30
+
31
+ it "should preserve absolute paths" do
32
+ URI.expand_path('/test/path').should == '/test/path'
33
+ end
34
+
35
+ it "should preserve the root path" do
36
+ URI.expand_path('/').should == '/'
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,53 @@
1
+ require 'spidr/filters'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
6
+ describe Filters do
7
+ it "should allow setting the acceptable schemes" do
8
+ agent = Agent.new
9
+
10
+ agent.schemes = [:http]
11
+ agent.schemes.should == ['http']
12
+ end
13
+
14
+ it "should provide the hosts that will be visited" do
15
+ agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
16
+ agent.visit_hosts.should == ['spidr.rubyforge.org']
17
+ end
18
+
19
+ it "should provide the hosts that will not be visited" do
20
+ agent = Agent.new(:ignore_hosts => ['example.com'])
21
+ agent.ignore_hosts.should == ['example.com']
22
+ end
23
+
24
+ it "should provide the ports that will be visited" do
25
+ agent = Agent.new(:ports => [80, 443, 8000])
26
+ agent.visit_ports.should == [80, 443, 8000]
27
+ end
28
+
29
+ it "should provide the ports that will not be visited" do
30
+ agent = Agent.new(:ignore_ports => [8000, 8080])
31
+ agent.ignore_ports.should == [8000, 8080]
32
+ end
33
+
34
+ it "should provide the links that will be visited" do
35
+ agent = Agent.new(:links => ['index.php'])
36
+ agent.visit_links.should == ['index.php']
37
+ end
38
+
39
+ it "should provide the links that will not be visited" do
40
+ agent = Agent.new(:ignore_links => [/login/])
41
+ agent.ignore_links.should == [/login/]
42
+ end
43
+
44
+ it "should provide the exts that will be visited" do
45
+ agent = Agent.new(:exts => ['htm'])
46
+ agent.visit_exts.should == ['htm']
47
+ end
48
+
49
+ it "should provide the exts that will not be visited" do
50
+ agent = Agent.new(:ignore_exts => ['cfm'])
51
+ agent.ignore_exts.should == ['cfm']
52
+ end
53
+ end
@@ -0,0 +1,8 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
+ def get_page(url)
5
+ url = URI(url.to_s)
6
+
7
+ return Spidr::Page.new(url,Net::HTTP.get_response(url))
8
+ end