spidr 0.4.1 → 0.5.0
- checksums.yaml +7 -0
- data/ChangeLog.md +69 -54
- data/Gemfile +9 -5
- data/LICENSE.txt +1 -1
- data/README.md +34 -26
- data/Rakefile +4 -15
- data/gemspec.yml +3 -2
- data/lib/spidr/agent.rb +101 -44
- data/lib/spidr/{actions → agent}/actions.rb +32 -12
- data/lib/spidr/{events.rb → agent/events.rb} +4 -8
- data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
- data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
- data/lib/spidr/auth_store.rb +2 -2
- data/lib/spidr/cookie_jar.rb +2 -2
- data/lib/spidr/extensions/uri.rb +28 -16
- data/lib/spidr/page.rb +7 -11
- data/lib/spidr/{body.rb → page/body.rb} +1 -1
- data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
- data/lib/spidr/{links.rb → page/links.rb} +43 -7
- data/lib/spidr/session_cache.rb +2 -2
- data/lib/spidr/spidr.rb +32 -5
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +60 -0
- data/spec/agent/filters_spec.rb +62 -0
- data/spec/agent/sanitizers_spec.rb +62 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/auth_store_spec.rb +17 -17
- data/spec/cookie_jar_spec.rb +26 -26
- data/spec/extensions/uri_spec.rb +19 -9
- data/spec/helpers/history.rb +5 -5
- data/spec/helpers/wsoc.rb +2 -2
- data/spec/page_examples.rb +4 -4
- data/spec/page_spec.rb +28 -25
- data/spec/rules_spec.rb +14 -14
- data/spec/session_cache.rb +7 -7
- data/spec/spidr_spec.rb +10 -10
- metadata +37 -51
- data/lib/spidr/actions.rb +0 -2
- data/lib/spidr/actions/exceptions.rb +0 -4
- data/lib/spidr/actions/exceptions/action.rb +0 -9
- data/lib/spidr/actions/exceptions/paused.rb +0 -11
- data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
- data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
- data/spec/actions_spec.rb +0 -59
- data/spec/filters_spec.rb +0 -61
- data/spec/sanitizers_spec.rb +0 -61
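The renames above fold the old Actions, Events, Filters, and Sanitizers modules into the Agent namespace (and Body, Headers, and Links into Page), and the three module-level spec files deleted below are replaced by new specs under spec/agent/. A minimal sketch of what the move means for require paths, assuming the public Agent API is otherwise unchanged:

```ruby
# In 0.4.1 the deleted specs below had to require the standalone modules:
#   require 'spidr/actions'
#   require 'spidr/filters'
#
# In 0.5.0 those files live under lib/spidr/agent/, so requiring the
# agent alone is enough.
require 'spidr/agent'

agent = Spidr::Agent.new
```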
data/lib/spidr/actions.rb
DELETED
data/spec/actions_spec.rb
DELETED
@@ -1,59 +0,0 @@
-require 'spidr/actions'
-require 'spidr/agent'
-
-require 'spec_helper'
-
-describe Actions do
-  let(:url) { URI('http://spidr.rubyforge.org/') }
-
-  it "should be able to pause spidering" do
-    count = 0
-    agent = Agent.host('spidr.rubyforge.org') do |spider|
-      spider.every_page do |page|
-        count += 1
-        spider.pause! if count >= 2
-      end
-    end
-
-    agent.should be_paused
-    agent.history.length.should == 2
-  end
-
-  it "should be able to continue spidering after being paused" do
-    agent = Agent.new do |spider|
-      spider.every_page do |page|
-        spider.pause!
-      end
-    end
-
-    agent.enqueue(url)
-    agent.continue!
-
-    agent.visited?(url).should == true
-  end
-
-  it "should allow skipping of enqueued links" do
-    agent = Agent.new do |spider|
-      spider.every_url do |url|
-        spider.skip_link!
-      end
-    end
-
-    agent.enqueue(url)
-
-    agent.queue.should be_empty
-  end
-
-  it "should allow skipping of visited pages" do
-    agent = Agent.new do |spider|
-      spider.every_page do |url|
-        spider.skip_page!
-      end
-    end
-
-    agent.visit_page(url)
-
-    agent.history.should == Set[url]
-    agent.queue.should be_empty
-  end
-end
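The pause and skip behaviors exercised by the deleted spec above are now covered by the new spec/agent/actions_spec.rb listed in the summary. A short usage sketch of the same actions against 0.5.0, assuming pause!, continue!, skip_link!, and skip_page! behave as the old spec shows:

```ruby
require 'spidr/agent'

# Pause the crawl from a callback, then resume it later.
agent = Spidr::Agent.new do |spider|
  spider.every_page do |page|
    spider.pause!        # stops crawling; the agent reports paused?
  end
end

agent.enqueue(URI('http://spidr.rubyforge.org/'))
agent.continue!          # resumes and visits the queued URL

# Inside callbacks, skip_link! drops the current URL before it is
# enqueued, and skip_page! stops processing the current page.
```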
data/spec/filters_spec.rb
DELETED
@@ -1,61 +0,0 @@
-require 'spidr/filters'
-require 'spidr/agent'
-
-require 'spec_helper'
-
-describe Filters do
-  it "should allow setting the acceptable schemes" do
-    agent = Agent.new
-
-    agent.schemes = [:http]
-    agent.schemes.should == ['http']
-  end
-
-  it "should provide the hosts that will be visited" do
-    agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
-
-    agent.visit_hosts.should == ['spidr.rubyforge.org']
-  end
-
-  it "should provide the hosts that will not be visited" do
-    agent = Agent.new(:ignore_hosts => ['example.com'])
-
-    agent.ignore_hosts.should == ['example.com']
-  end
-
-  it "should provide the ports that will be visited" do
-    agent = Agent.new(:ports => [80, 443, 8000])
-
-    agent.visit_ports.should == [80, 443, 8000]
-  end
-
-  it "should provide the ports that will not be visited" do
-    agent = Agent.new(:ignore_ports => [8000, 8080])
-
-    agent.ignore_ports.should == [8000, 8080]
-  end
-
-  it "should provide the links that will be visited" do
-    agent = Agent.new(:links => ['index.php'])
-
-    agent.visit_links.should == ['index.php']
-  end
-
-  it "should provide the links that will not be visited" do
-    agent = Agent.new(:ignore_links => [/login/])
-
-    agent.ignore_links.should == [/login/]
-  end
-
-  it "should provide the exts that will be visited" do
-    agent = Agent.new(:exts => ['htm'])
-
-    agent.visit_exts.should == ['htm']
-  end
-
-  it "should provide the exts that will not be visited" do
-    agent = Agent.new(:ignore_exts => ['cfm'])
-
-    agent.ignore_exts.should == ['cfm']
-  end
-end
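These filter options are re-specced in spec/agent/filters_spec.rb. A hedged sketch of the same constructor options in 0.5.0, assuming the option names shown in the old spec (:hosts, :ignore_hosts, :ports, :ignore_ports, :links, :ignore_links, :exts, :ignore_exts) are unchanged:

```ruby
require 'spidr/agent'

# Restrict the crawl by host, port, link pattern, and file extension.
agent = Spidr::Agent.new(
  :hosts        => ['spidr.rubyforge.org'],  # only these hosts
  :ignore_hosts => ['example.com'],          # never these hosts
  :ports        => [80, 443, 8000],          # acceptable ports
  :ignore_links => [/login/],                # skip URLs matching these
  :exts         => ['htm'],                  # only these extensions
  :ignore_exts  => ['cfm']                   # skip these extensions
)

agent.schemes = [:http]   # acceptable schemes, normalized to strings
```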
data/spec/sanitizers_spec.rb
DELETED
@@ -1,61 +0,0 @@
-require 'spidr/sanitizers'
-require 'spidr/agent'
-
-require 'spec_helper'
-
-describe Sanitizers do
-  describe "sanitize_url" do
-    let(:url) { 'http://host.com' }
-    before(:all) { @agent = Agent.new }
-
-    it "should sanitize URLs" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(URI(url))
-
-      clean_url.host.should == 'host.com'
-    end
-
-    it "should sanitize URLs given as Strings" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(url)
-
-      clean_url.host.should == 'host.com'
-    end
-  end
-
-  describe "strip_fragments" do
-    let(:url) { URI("http://host.com/page#lol") }
-
-    it "should strip fragment components by default" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(url)
-
-      clean_url.fragment.should be_nil
-    end
-
-    it "should allow perserving fragment components" do
-      agent = Agent.new(:strip_fragments => false)
-      clean_url = agent.sanitize_url(url)
-
-      clean_url.fragment.should == 'lol'
-    end
-  end
-
-  describe "strip_query" do
-    let(:url) { URI("http://host.com/page?x=1") }
-
-    it "should not strip query components by default" do
-      agent = Agent.new
-      clean_url = agent.sanitize_url(url)
-
-      clean_url.query.should == 'x=1'
-    end
-
-    it "should allow stripping of query components" do
-      agent = Agent.new(:strip_query => true)
-      clean_url = agent.sanitize_url(url)
-
-      clean_url.query.should be_nil
-    end
-  end
-end
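Likewise, the URL sanitization behavior above moves to spec/agent/sanitizers_spec.rb. A small sketch of sanitize_url under 0.5.0, assuming the :strip_fragments and :strip_query options keep the defaults the old spec describes:

```ruby
require 'spidr/agent'

url = URI('http://host.com/page?x=1#lol')

# Defaults: fragments are stripped, query strings are kept.
agent = Spidr::Agent.new
agent.sanitize_url(url).fragment   #=> nil
agent.sanitize_url(url).query      #=> "x=1"

# Both behaviors are toggled via constructor options.
agent = Spidr::Agent.new(:strip_fragments => false, :strip_query => true)
agent.sanitize_url(url).fragment   #=> "lol"
agent.sanitize_url(url).query      #=> nil
```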