spidr 0.1.9 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/spidr/rules.rb CHANGED
@@ -7,25 +7,43 @@ module Spidr
7
7
  # Reject rules
8
8
  attr_reader :reject
9
9
 
10
+ #
11
+ # Creates a new Rules object.
12
+ #
13
+ # @param [Hash] options
14
+ # Additional options.
15
+ #
16
+ # @option options [Array<String, Regexp, Proc>] :accept
17
+ # The patterns to accept data with.
18
+ #
19
+ # @option options [Array<String, Regexp, Proc>] :reject
20
+ # The patterns to reject data with.
21
+ #
10
22
  def initialize(options={})
11
- @accept = (options[:accept] || [])
12
- @reject = (options[:reject] || [])
23
+ @accept = []
24
+ @reject = []
25
+
26
+ @accept += options[:accept] if options[:accept]
27
+ @reject += options[:reject] if options[:reject]
13
28
  end
14
29
 
15
30
  #
16
- # Returns +true+ if the _field_ is accepted by the rules,
17
- # returns +false+ otherwise.
31
+ # Determines whether the data should be accepted or rejected.
32
+ #
33
+ # @return [Boolean]
34
+ # Specifies whether the given data was accepted, using the rules'
35
+ # acceptance patterns.
18
36
  #
19
- def accept?(field)
37
+ def accept?(data)
20
38
  unless @accept.empty?
21
39
  @accept.each do |rule|
22
- return true if test_field(field,rule)
40
+ return true if test_data(data,rule)
23
41
  end
24
42
 
25
43
  return false
26
44
  else
27
45
  @reject.each do |rule|
28
- return false if test_field(field,rule)
46
+ return false if test_data(data,rule)
29
47
  end
30
48
 
31
49
  return true
@@ -33,27 +51,31 @@ module Spidr
33
51
  end
34
52
 
35
53
  #
36
- # Returns +true+ if the _field_ is rejected by the rules,
37
- # returns +false+ otherwise.
54
+ # Determines whether the data should be rejected or accepted.
38
55
  #
39
- def reject?(field)
40
- !(accept?(field))
56
+ # @return [Boolean]
57
+ # Specifies whether the given data was rejected, using the rules'
58
+ # rejection patterns.
59
+ #
60
+ def reject?(data)
61
+ !(accept?(data))
41
62
  end
42
63
 
43
64
  protected
44
65
 
45
66
  #
46
- # Tests the specified _field_ against the specified _rule_. Returns
47
- # +true+ when the _rule_ matches the specified _field_, returns
48
- # +false+ otherwise.
67
+ # Tests the given data against a given pattern.
68
+ #
69
+ # @return [Boolean]
70
+ # Specifies whether the given data matched the pattern.
49
71
  #
50
- def test_field(field,rule)
72
+ def test_data(data,rule)
51
73
  if rule.kind_of?(Proc)
52
- return (rule.call(field) == true)
74
+ return (rule.call(data) == true)
53
75
  elsif rule.kind_of?(Regexp)
54
- return !((field.to_s =~ rule).nil?)
76
+ return !((data.to_s =~ rule).nil?)
55
77
  else
56
- return field == rule
78
+ return data == rule
57
79
  end
58
80
  end
59
81
 
data/lib/spidr/spidr.rb CHANGED
@@ -4,43 +4,93 @@ module Spidr
4
4
  # Common proxy port.
5
5
  COMMON_PROXY_PORT = 8080
6
6
 
7
+ # Default proxy information.
8
+ DEFAULT_PROXY = {
9
+ :host => nil,
10
+ :port => COMMON_PROXY_PORT,
11
+ :user => nil,
12
+ :password => nil
13
+ }
14
+
15
+ #
16
+ # Proxy information used by all newly created Agent objects by default.
7
17
  #
8
- # Returns the +Hash+ of the Spidr proxy information.
18
+ # @return [Hash]
19
+ # The Spidr proxy information.
9
20
  #
10
21
  def Spidr.proxy
11
- @@spidr_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
22
+ @@spidr_proxy ||= DEFAULT_PROXY
23
+ end
24
+
25
+ #
26
+ # Sets the proxy information used by Agent objects.
27
+ #
28
+ # @param [Hash] new_proxy
29
+ # The new proxy information.
30
+ #
31
+ # @option new_proxy [String] :host
32
+ # The host-name of the proxy.
33
+ #
34
+ # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
35
+ # The port of the proxy.
36
+ #
37
+ # @option new_proxy [String] :user
38
+ # The user to authenticate with the proxy as.
39
+ #
40
+ # @option new_proxy [String] :password
41
+ # The password to authenticate with the proxy.
42
+ #
43
+ # @return [Hash]
44
+ # The new proxy information.
45
+ #
46
+ def Spidr.proxy=(new_proxy)
47
+ @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
12
48
  end
13
49
 
14
50
  #
15
- # Returns the Spidr User-Agent
51
+ # Disables the proxy settings used by all newly created Agent objects.
52
+ #
53
+ def Spidr.disable_proxy!
54
+ @@spidr_proxy = DEFAULT_PROXY
55
+ return true
56
+ end
57
+
58
+ #
59
+ # The User-Agent string used by all Agent objects by default.
60
+ #
61
+ # @return [String]
62
+ # The Spidr User-Agent string.
16
63
  #
17
64
  def Spidr.user_agent
18
65
  @@spidr_user_agent ||= nil
19
66
  end
20
67
 
21
68
  #
22
- # Sets the Spidr Web User-Agent to the specified _new_agent_.
69
+ # Sets the Spidr User-Agent string.
70
+ #
71
+ # @param [String] new_agent
72
+ # The new User-Agent string.
23
73
  #
24
74
  def Spidr.user_agent=(new_agent)
25
75
  @@spidr_user_agent = new_agent
26
76
  end
27
77
 
28
78
  #
29
- # See Agent.start_at.
79
+ # @see Agent.start_at
30
80
  #
31
81
  def Spidr.start_at(url,options={},&block)
32
82
  Agent.start_at(url,options,&block)
33
83
  end
34
84
 
35
85
  #
36
- # See Agent.host.
86
+ # @see Agent.host
37
87
  #
38
88
  def Spidr.host(name,options={},&block)
39
89
  Agent.host(name,options,&block)
40
90
  end
41
91
 
42
92
  #
43
- # See Agent.site.
93
+ # @see Agent.site
44
94
  #
45
95
  def Spidr.site(url,options={},&block)
46
96
  Agent.site(url,options,&block)
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  module Spidr
2
- VERSION = '0.1.9'
2
+ # Spidr version
3
+ VERSION = '0.2.0'
3
4
  end
@@ -0,0 +1,61 @@
1
+ require 'spidr/actions'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
6
+ describe Actions do
7
+ before(:all) do
8
+ @url = URI('http://spidr.rubyforge.org/')
9
+ end
10
+
11
+ it "should be able to pause spidering" do
12
+ count = 0
13
+ agent = Agent.host('spidr.rubyforge.org') do |spider|
14
+ spider.every_page do |page|
15
+ count += 1
16
+ spider.pause! if count >= 2
17
+ end
18
+ end
19
+
20
+ agent.should be_paused
21
+ agent.history.length.should == 2
22
+ end
23
+
24
+ it "should be able to continue spidering after being paused" do
25
+ agent = Agent.new do |spider|
26
+ spider.every_page do |page|
27
+ spider.pause!
28
+ end
29
+ end
30
+
31
+ agent.enqueue(@url)
32
+ agent.continue!
33
+
34
+ agent.visited?(@url).should == true
35
+ end
36
+
37
+ it "should allow skipping of enqueued links" do
38
+ agent = Agent.new do |spider|
39
+ spider.every_url do |url|
40
+ spider.skip_link!
41
+ end
42
+ end
43
+
44
+ agent.enqueue(@url)
45
+
46
+ agent.queue.should be_empty
47
+ end
48
+
49
+ it "should allow skipping of visited pages" do
50
+ agent = Agent.new do |spider|
51
+ spider.every_page do |url|
52
+ spider.skip_page!
53
+ end
54
+ end
55
+
56
+ agent.visit_page(@url)
57
+
58
+ agent.history.should == Set[@url]
59
+ agent.queue.should be_empty
60
+ end
61
+ end
data/spec/agent_spec.rb CHANGED
@@ -20,19 +20,38 @@ describe Agent do
20
20
 
21
21
  it "should be able to restore the history" do
22
22
  agent = Agent.new
23
- previous_history = [URI('http://www.example.com')]
23
+ previous_history = Set[URI('http://www.example.com')]
24
24
 
25
25
  agent.history = previous_history
26
26
  agent.history.should == previous_history
27
27
  end
28
28
 
29
- it "should convert new histories to an Array of URIs" do
29
+ it "should convert new histories to a Set of URIs" do
30
30
  agent = Agent.new
31
31
  previous_history = ['http://www.example.com']
32
+ expected_history = Set[URI('http://www.example.com')]
32
33
 
33
34
  agent.history = previous_history
34
35
  agent.history.should_not == previous_history
35
- agent.history.should == previous_history.map { |url| URI(url) }
36
+ agent.history.should == expected_history
37
+ end
38
+
39
+ it "should be able to restore the failures" do
40
+ agent = Agent.new
41
+ previous_failures = Set[URI('http://localhost/')]
42
+
43
+ agent.failures = previous_failures
44
+ agent.failures.should == previous_failures
45
+ end
46
+
47
+ it "should convert new failures to a Set of URIs" do
48
+ agent = Agent.new
49
+ previous_failures = ['http://localhost/']
50
+ expected_failures = Set[URI('http://localhost/')]
51
+
52
+ agent.failures = previous_failures
53
+ agent.failures.should_not == previous_failures
54
+ agent.failures.should == expected_failures
36
55
  end
37
56
 
38
57
  it "should be able to restore the queue" do
@@ -46,37 +65,11 @@ describe Agent do
46
65
  it "should convert new queues to an Array of URIs" do
47
66
  agent = Agent.new
48
67
  previous_queue = ['http://www.example.com']
68
+ expected_queue = [URI('http://www.example.com')]
49
69
 
50
70
  agent.queue = previous_queue
51
71
  agent.queue.should_not == previous_queue
52
- agent.queue.should == previous_queue.map { |url| URI(url) }
53
- end
54
-
55
- it "should be able to pause spidering" do
56
- count = 0
57
- agent = Agent.host('spidr.rubyforge.org') do |spider|
58
- spider.every_page do |page|
59
- count += 1
60
- spider.pause! if count >= 2
61
- end
62
- end
63
-
64
- agent.should be_paused
65
- agent.history.length.should == 2
66
- end
67
-
68
- it "should be able to continue spidering after being paused" do
69
- agent = Agent.new do |spider|
70
- spider.enqueue('http://spidr.rubyforge.org/')
71
- spider.every_page do |page|
72
- spider.pause!
73
- end
74
- end
75
-
76
- agent.pause!
77
- agent.continue!
78
-
79
- agent.visited?('http://spidr.rubyforge.org/').should == true
72
+ agent.queue.should == expected_queue
80
73
  end
81
74
 
82
75
  it "should provide a to_hash method that returns the queue and history" do
@@ -0,0 +1,39 @@
1
+ require 'spidr/extensions/uri'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe URI do
6
+ describe "expand_path" do
7
+ it "should preserve single directory paths" do
8
+ URI.expand_path('path').should == 'path'
9
+ end
10
+
11
+ it "should preserve trailing '/'" do
12
+ URI.expand_path('test/path/').should == 'test/path/'
13
+ end
14
+
15
+ it "should remove multiple '/' characters" do
16
+ URI.expand_path('///test///path///').should == '/test/path/'
17
+ end
18
+
19
+ it "should remove '.' directories from the path" do
20
+ URI.expand_path('test/./path').should == 'test/path'
21
+ end
22
+
23
+ it "should handle '..' directories properly" do
24
+ URI.expand_path('test/../path').should == 'path'
25
+ end
26
+
27
+ it "should limit the number of '..' directories resolved" do
28
+ URI.expand_path('/test/../../../..').should == '/'
29
+ end
30
+
31
+ it "should preserve absolute paths" do
32
+ URI.expand_path('/test/path').should == '/test/path'
33
+ end
34
+
35
+ it "should preserve the root path" do
36
+ URI.expand_path('/').should == '/'
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,53 @@
1
+ require 'spidr/filters'
2
+ require 'spidr/agent'
3
+
4
+ require 'spec_helper'
5
+
6
+ describe Filters do
7
+ it "should allow setting the acceptable schemes" do
8
+ agent = Agent.new
9
+
10
+ agent.schemes = [:http]
11
+ agent.schemes.should == ['http']
12
+ end
13
+
14
+ it "should provide the hosts that will be visited" do
15
+ agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
16
+ agent.visit_hosts.should == ['spidr.rubyforge.org']
17
+ end
18
+
19
+ it "should provide the hosts that will not be visited" do
20
+ agent = Agent.new(:ignore_hosts => ['example.com'])
21
+ agent.ignore_hosts.should == ['example.com']
22
+ end
23
+
24
+ it "should provide the ports that will be visited" do
25
+ agent = Agent.new(:ports => [80, 443, 8000])
26
+ agent.visit_ports.should == [80, 443, 8000]
27
+ end
28
+
29
+ it "should provide the ports that will not be visited" do
30
+ agent = Agent.new(:ignore_ports => [8000, 8080])
31
+ agent.ignore_ports.should == [8000, 8080]
32
+ end
33
+
34
+ it "should provide the links that will be visited" do
35
+ agent = Agent.new(:links => ['index.php'])
36
+ agent.visit_links.should == ['index.php']
37
+ end
38
+
39
+ it "should provide the links that will not be visited" do
40
+ agent = Agent.new(:ignore_links => [/login/])
41
+ agent.ignore_links.should == [/login/]
42
+ end
43
+
44
+ it "should provide the exts that will be visited" do
45
+ agent = Agent.new(:exts => ['htm'])
46
+ agent.visit_exts.should == ['htm']
47
+ end
48
+
49
+ it "should provide the exts that will not be visited" do
50
+ agent = Agent.new(:ignore_exts => ['cfm'])
51
+ agent.ignore_exts.should == ['cfm']
52
+ end
53
+ end
@@ -0,0 +1,8 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
+ def get_page(url)
5
+ url = URI(url.to_s)
6
+
7
+ return Spidr::Page.new(url,Net::HTTP.get_response(url))
8
+ end