spidr 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ChangeLog.md +69 -54
- data/Gemfile +9 -5
- data/LICENSE.txt +1 -1
- data/README.md +34 -26
- data/Rakefile +4 -15
- data/gemspec.yml +3 -2
- data/lib/spidr/agent.rb +101 -44
- data/lib/spidr/{actions → agent}/actions.rb +32 -12
- data/lib/spidr/{events.rb → agent/events.rb} +4 -8
- data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
- data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
- data/lib/spidr/auth_store.rb +2 -2
- data/lib/spidr/cookie_jar.rb +2 -2
- data/lib/spidr/extensions/uri.rb +28 -16
- data/lib/spidr/page.rb +7 -11
- data/lib/spidr/{body.rb → page/body.rb} +1 -1
- data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
- data/lib/spidr/{links.rb → page/links.rb} +43 -7
- data/lib/spidr/session_cache.rb +2 -2
- data/lib/spidr/spidr.rb +32 -5
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +60 -0
- data/spec/agent/filters_spec.rb +62 -0
- data/spec/agent/sanitizers_spec.rb +62 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/auth_store_spec.rb +17 -17
- data/spec/cookie_jar_spec.rb +26 -26
- data/spec/extensions/uri_spec.rb +19 -9
- data/spec/helpers/history.rb +5 -5
- data/spec/helpers/wsoc.rb +2 -2
- data/spec/page_examples.rb +4 -4
- data/spec/page_spec.rb +28 -25
- data/spec/rules_spec.rb +14 -14
- data/spec/session_cache.rb +7 -7
- data/spec/spidr_spec.rb +10 -10
- metadata +37 -51
- data/lib/spidr/actions.rb +0 -2
- data/lib/spidr/actions/exceptions.rb +0 -4
- data/lib/spidr/actions/exceptions/action.rb +0 -9
- data/lib/spidr/actions/exceptions/paused.rb +0 -11
- data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
- data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
- data/spec/actions_spec.rb +0 -59
- data/spec/filters_spec.rb +0 -61
- data/spec/sanitizers_spec.rb +0 -61
data/lib/spidr/actions.rb
DELETED
data/spec/actions_spec.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
require 'spidr/actions'
|
2
|
-
require 'spidr/agent'
|
3
|
-
|
4
|
-
require 'spec_helper'
|
5
|
-
|
6
|
-
describe Actions do
|
7
|
-
let(:url) { URI('http://spidr.rubyforge.org/') }
|
8
|
-
|
9
|
-
it "should be able to pause spidering" do
|
10
|
-
count = 0
|
11
|
-
agent = Agent.host('spidr.rubyforge.org') do |spider|
|
12
|
-
spider.every_page do |page|
|
13
|
-
count += 1
|
14
|
-
spider.pause! if count >= 2
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
agent.should be_paused
|
19
|
-
agent.history.length.should == 2
|
20
|
-
end
|
21
|
-
|
22
|
-
it "should be able to continue spidering after being paused" do
|
23
|
-
agent = Agent.new do |spider|
|
24
|
-
spider.every_page do |page|
|
25
|
-
spider.pause!
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
agent.enqueue(url)
|
30
|
-
agent.continue!
|
31
|
-
|
32
|
-
agent.visited?(url).should == true
|
33
|
-
end
|
34
|
-
|
35
|
-
it "should allow skipping of enqueued links" do
|
36
|
-
agent = Agent.new do |spider|
|
37
|
-
spider.every_url do |url|
|
38
|
-
spider.skip_link!
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
agent.enqueue(url)
|
43
|
-
|
44
|
-
agent.queue.should be_empty
|
45
|
-
end
|
46
|
-
|
47
|
-
it "should allow skipping of visited pages" do
|
48
|
-
agent = Agent.new do |spider|
|
49
|
-
spider.every_page do |url|
|
50
|
-
spider.skip_page!
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
agent.visit_page(url)
|
55
|
-
|
56
|
-
agent.history.should == Set[url]
|
57
|
-
agent.queue.should be_empty
|
58
|
-
end
|
59
|
-
end
|
data/spec/filters_spec.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
require 'spidr/filters'
|
2
|
-
require 'spidr/agent'
|
3
|
-
|
4
|
-
require 'spec_helper'
|
5
|
-
|
6
|
-
describe Filters do
|
7
|
-
it "should allow setting the acceptable schemes" do
|
8
|
-
agent = Agent.new
|
9
|
-
|
10
|
-
agent.schemes = [:http]
|
11
|
-
agent.schemes.should == ['http']
|
12
|
-
end
|
13
|
-
|
14
|
-
it "should provide the hosts that will be visited" do
|
15
|
-
agent = Agent.new(:hosts => ['spidr.rubyforge.org'])
|
16
|
-
|
17
|
-
agent.visit_hosts.should == ['spidr.rubyforge.org']
|
18
|
-
end
|
19
|
-
|
20
|
-
it "should provide the hosts that will not be visited" do
|
21
|
-
agent = Agent.new(:ignore_hosts => ['example.com'])
|
22
|
-
|
23
|
-
agent.ignore_hosts.should == ['example.com']
|
24
|
-
end
|
25
|
-
|
26
|
-
it "should provide the ports that will be visited" do
|
27
|
-
agent = Agent.new(:ports => [80, 443, 8000])
|
28
|
-
|
29
|
-
agent.visit_ports.should == [80, 443, 8000]
|
30
|
-
end
|
31
|
-
|
32
|
-
it "should provide the ports that will not be visited" do
|
33
|
-
agent = Agent.new(:ignore_ports => [8000, 8080])
|
34
|
-
|
35
|
-
agent.ignore_ports.should == [8000, 8080]
|
36
|
-
end
|
37
|
-
|
38
|
-
it "should provide the links that will be visited" do
|
39
|
-
agent = Agent.new(:links => ['index.php'])
|
40
|
-
|
41
|
-
agent.visit_links.should == ['index.php']
|
42
|
-
end
|
43
|
-
|
44
|
-
it "should provide the links that will not be visited" do
|
45
|
-
agent = Agent.new(:ignore_links => [/login/])
|
46
|
-
|
47
|
-
agent.ignore_links.should == [/login/]
|
48
|
-
end
|
49
|
-
|
50
|
-
it "should provide the exts that will be visited" do
|
51
|
-
agent = Agent.new(:exts => ['htm'])
|
52
|
-
|
53
|
-
agent.visit_exts.should == ['htm']
|
54
|
-
end
|
55
|
-
|
56
|
-
it "should provide the exts that will not be visited" do
|
57
|
-
agent = Agent.new(:ignore_exts => ['cfm'])
|
58
|
-
|
59
|
-
agent.ignore_exts.should == ['cfm']
|
60
|
-
end
|
61
|
-
end
|
data/spec/sanitizers_spec.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
require 'spidr/sanitizers'
|
2
|
-
require 'spidr/agent'
|
3
|
-
|
4
|
-
require 'spec_helper'
|
5
|
-
|
6
|
-
describe Sanitizers do
|
7
|
-
describe "sanitize_url" do
|
8
|
-
let(:url) { 'http://host.com' }
|
9
|
-
before(:all) { @agent = Agent.new }
|
10
|
-
|
11
|
-
it "should sanitize URLs" do
|
12
|
-
agent = Agent.new
|
13
|
-
clean_url = agent.sanitize_url(URI(url))
|
14
|
-
|
15
|
-
clean_url.host.should == 'host.com'
|
16
|
-
end
|
17
|
-
|
18
|
-
it "should sanitize URLs given as Strings" do
|
19
|
-
agent = Agent.new
|
20
|
-
clean_url = agent.sanitize_url(url)
|
21
|
-
|
22
|
-
clean_url.host.should == 'host.com'
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
describe "strip_fragments" do
|
27
|
-
let(:url) { URI("http://host.com/page#lol") }
|
28
|
-
|
29
|
-
it "should strip fragment components by default" do
|
30
|
-
agent = Agent.new
|
31
|
-
clean_url = agent.sanitize_url(url)
|
32
|
-
|
33
|
-
clean_url.fragment.should be_nil
|
34
|
-
end
|
35
|
-
|
36
|
-
it "should allow perserving fragment components" do
|
37
|
-
agent = Agent.new(:strip_fragments => false)
|
38
|
-
clean_url = agent.sanitize_url(url)
|
39
|
-
|
40
|
-
clean_url.fragment.should == 'lol'
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
describe "strip_query" do
|
45
|
-
let(:url) { URI("http://host.com/page?x=1") }
|
46
|
-
|
47
|
-
it "should not strip query components by default" do
|
48
|
-
agent = Agent.new
|
49
|
-
clean_url = agent.sanitize_url(url)
|
50
|
-
|
51
|
-
clean_url.query.should == 'x=1'
|
52
|
-
end
|
53
|
-
|
54
|
-
it "should allow stripping of query components" do
|
55
|
-
agent = Agent.new(:strip_query => true)
|
56
|
-
clean_url = agent.sanitize_url(url)
|
57
|
-
|
58
|
-
clean_url.query.should be_nil
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|