spidr 0.2.7 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/ChangeLog.md +56 -31
- data/Gemfile +7 -21
- data/LICENSE.txt +1 -2
- data/README.md +7 -6
- data/Rakefile +13 -23
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +1 -1
- data/lib/spidr/agent.rb +21 -6
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/body.rb +99 -0
- data/lib/spidr/extensions/uri.rb +14 -7
- data/lib/spidr/headers.rb +323 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +32 -536
- data/lib/spidr/sanitizers.rb +3 -3
- data/lib/spidr/session_cache.rb +1 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/actions_spec.rb +6 -8
- data/spec/auth_store_spec.rb +28 -28
- data/spec/cookie_jar_spec.rb +49 -60
- data/spec/extensions/uri_spec.rb +4 -0
- data/spec/filters_spec.rb +8 -0
- data/spec/page_spec.rb +0 -7
- data/spec/rules_spec.rb +8 -6
- data/spec/sanitizers_spec.rb +10 -16
- data/spec/spec_helper.rb +1 -12
- data/spec/spidr_spec.rb +11 -11
- data/spidr.gemspec +11 -110
- metadata +24 -52
- data/.gitignore +0 -9
- data/.specopts +0 -1
- data/Gemfile.lock +0 -39
data/spec/page_spec.rb
CHANGED
@@ -100,13 +100,6 @@ describe Page do
|
|
100
100
|
end
|
101
101
|
|
102
102
|
it "should provide access to the raw Cookie" do
|
103
|
-
cookie = @page.raw_cookie
|
104
|
-
|
105
|
-
cookie.should_not be_nil
|
106
|
-
cookie.should_not be_empty
|
107
|
-
end
|
108
|
-
|
109
|
-
it "should still support the deprecated #cookie method" do
|
110
103
|
cookie = @page.cookie
|
111
104
|
|
112
105
|
cookie.should_not be_nil
|
data/spec/rules_spec.rb
CHANGED
@@ -3,39 +3,41 @@ require 'spidr/rules'
|
|
3
3
|
require 'spec_helper'
|
4
4
|
|
5
5
|
describe Rules do
|
6
|
+
subject { Rules }
|
7
|
+
|
6
8
|
it "should accept data based on acceptance data" do
|
7
|
-
rules =
|
9
|
+
rules = subject.new(:accept => [1])
|
8
10
|
|
9
11
|
rules.accept?(1).should == true
|
10
12
|
end
|
11
13
|
|
12
14
|
it "should accept data based on acceptance regexps" do
|
13
|
-
rules =
|
15
|
+
rules = subject.new(:accept => [/1/])
|
14
16
|
|
15
17
|
rules.accept?('1').should == true
|
16
18
|
end
|
17
19
|
|
18
20
|
it "should match non-Strings using acceptance regexps" do
|
19
|
-
rules =
|
21
|
+
rules = subject.new(:accept => [/1/])
|
20
22
|
|
21
23
|
rules.accept?(1).should == true
|
22
24
|
end
|
23
25
|
|
24
26
|
it "should accept data using acceptance lambdas" do
|
25
|
-
rules =
|
27
|
+
rules = subject.new(:accept => [lambda { |data| data > 2 }])
|
26
28
|
|
27
29
|
rules.accept?(3).should == true
|
28
30
|
end
|
29
31
|
|
30
32
|
it "should reject data that does not match any acceptance patterns" do
|
31
|
-
rules =
|
33
|
+
rules = subject.new(:accept => [1, 2, 3])
|
32
34
|
|
33
35
|
rules.accept?(2).should == true
|
34
36
|
rules.accept?(4).should == false
|
35
37
|
end
|
36
38
|
|
37
39
|
it "should accept data that does not match any rejection patterns" do
|
38
|
-
rules =
|
40
|
+
rules = subject.new(:reject => [1, 2, 3])
|
39
41
|
|
40
42
|
rules.accept?(2).should == false
|
41
43
|
rules.accept?(4).should == true
|
data/spec/sanitizers_spec.rb
CHANGED
@@ -5,61 +5,55 @@ require 'spec_helper'
|
|
5
5
|
|
6
6
|
describe Sanitizers do
|
7
7
|
describe "sanitize_url" do
|
8
|
-
|
9
|
-
|
10
|
-
@url = 'http://host.com'
|
11
|
-
end
|
8
|
+
let(:url) { 'http://host.com' }
|
9
|
+
before(:all) { @agent = Agent.new }
|
12
10
|
|
13
11
|
it "should sanitize URLs" do
|
14
12
|
agent = Agent.new
|
15
|
-
clean_url = agent.sanitize_url(URI(
|
13
|
+
clean_url = agent.sanitize_url(URI(url))
|
16
14
|
|
17
15
|
clean_url.host.should == 'host.com'
|
18
16
|
end
|
19
17
|
|
20
18
|
it "should sanitize URLs given as Strings" do
|
21
19
|
agent = Agent.new
|
22
|
-
clean_url = agent.sanitize_url(
|
20
|
+
clean_url = agent.sanitize_url(url)
|
23
21
|
|
24
22
|
clean_url.host.should == 'host.com'
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
28
26
|
describe "strip_fragments" do
|
29
|
-
|
30
|
-
@url = URI("http://host.com/page#lol")
|
31
|
-
end
|
27
|
+
let(:url) { URI("http://host.com/page#lol") }
|
32
28
|
|
33
29
|
it "should strip fragment components by default" do
|
34
30
|
agent = Agent.new
|
35
|
-
clean_url = agent.sanitize_url(
|
31
|
+
clean_url = agent.sanitize_url(url)
|
36
32
|
|
37
33
|
clean_url.fragment.should be_nil
|
38
34
|
end
|
39
35
|
|
40
36
|
it "should allow perserving fragment components" do
|
41
37
|
agent = Agent.new(:strip_fragments => false)
|
42
|
-
clean_url = agent.sanitize_url(
|
38
|
+
clean_url = agent.sanitize_url(url)
|
43
39
|
|
44
40
|
clean_url.fragment.should == 'lol'
|
45
41
|
end
|
46
42
|
end
|
47
43
|
|
48
44
|
describe "strip_query" do
|
49
|
-
|
50
|
-
@url = URI("http://host.com/page?x=1")
|
51
|
-
end
|
45
|
+
let(:url) { URI("http://host.com/page?x=1") }
|
52
46
|
|
53
47
|
it "should not strip query components by default" do
|
54
48
|
agent = Agent.new
|
55
|
-
clean_url = agent.sanitize_url(
|
49
|
+
clean_url = agent.sanitize_url(url)
|
56
50
|
|
57
51
|
clean_url.query.should == 'x=1'
|
58
52
|
end
|
59
53
|
|
60
54
|
it "should allow stripping of query components" do
|
61
55
|
agent = Agent.new(:strip_query => true)
|
62
|
-
clean_url = agent.sanitize_url(
|
56
|
+
clean_url = agent.sanitize_url(url)
|
63
57
|
|
64
58
|
clean_url.query.should be_nil
|
65
59
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,15 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'bundler'
|
3
|
-
|
4
|
-
begin
|
5
|
-
Bundler.setup(:runtime, :test)
|
6
|
-
rescue Bundler::BundlerError => e
|
7
|
-
STDERR.puts e.message
|
8
|
-
STDERR.puts "Run `bundle install` to install missing gems"
|
9
|
-
exit e.status_code
|
10
|
-
end
|
11
|
-
|
12
|
-
require 'spec'
|
1
|
+
require 'rspec'
|
13
2
|
require 'spidr/version'
|
14
3
|
|
15
4
|
include Spidr
|
data/spec/spidr_spec.rb
CHANGED
@@ -4,36 +4,36 @@ require 'spec_helper'
|
|
4
4
|
|
5
5
|
describe Spidr do
|
6
6
|
it "should have a VERSION constant" do
|
7
|
-
|
7
|
+
subject.const_defined?('VERSION').should == true
|
8
8
|
end
|
9
9
|
|
10
10
|
describe "proxy" do
|
11
11
|
after(:all) do
|
12
|
-
|
12
|
+
subject.disable_proxy!
|
13
13
|
end
|
14
14
|
|
15
15
|
it "should not have proxy settings by default" do
|
16
|
-
|
16
|
+
subject.proxy[:host].should be_nil
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should allow setting new proxy settings" do
|
20
|
-
|
20
|
+
subject.proxy = {:host => 'example.com', :port => 8010}
|
21
21
|
|
22
|
-
|
23
|
-
|
22
|
+
subject.proxy[:host].should == 'example.com'
|
23
|
+
subject.proxy[:port].should == 8010
|
24
24
|
end
|
25
25
|
|
26
26
|
it "should default the :port option of new proxy settings" do
|
27
|
-
|
27
|
+
subject.proxy = {:host => 'example.com'}
|
28
28
|
|
29
|
-
|
30
|
-
|
29
|
+
subject.proxy[:host].should == 'example.com'
|
30
|
+
subject.proxy[:port].should == Spidr::COMMON_PROXY_PORT
|
31
31
|
end
|
32
32
|
|
33
33
|
it "should allow disabling the proxy" do
|
34
|
-
|
34
|
+
subject.disable_proxy!
|
35
35
|
|
36
|
-
|
36
|
+
subject.proxy[:host].should be_nil
|
37
37
|
end
|
38
38
|
end
|
39
39
|
end
|
data/spidr.gemspec
CHANGED
@@ -1,114 +1,15 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
1
|
# -*- encoding: utf-8 -*-
|
5
2
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
"
|
17
|
-
"LICENSE.txt",
|
18
|
-
"README.md"
|
19
|
-
]
|
20
|
-
s.files = [
|
21
|
-
".gitignore",
|
22
|
-
".specopts",
|
23
|
-
".yardopts",
|
24
|
-
"ChangeLog.md",
|
25
|
-
"Gemfile",
|
26
|
-
"Gemfile.lock",
|
27
|
-
"LICENSE.txt",
|
28
|
-
"README.md",
|
29
|
-
"Rakefile",
|
30
|
-
"lib/spidr.rb",
|
31
|
-
"lib/spidr/actions.rb",
|
32
|
-
"lib/spidr/actions/actions.rb",
|
33
|
-
"lib/spidr/actions/exceptions.rb",
|
34
|
-
"lib/spidr/actions/exceptions/action.rb",
|
35
|
-
"lib/spidr/actions/exceptions/paused.rb",
|
36
|
-
"lib/spidr/actions/exceptions/skip_link.rb",
|
37
|
-
"lib/spidr/actions/exceptions/skip_page.rb",
|
38
|
-
"lib/spidr/agent.rb",
|
39
|
-
"lib/spidr/auth_credential.rb",
|
40
|
-
"lib/spidr/auth_store.rb",
|
41
|
-
"lib/spidr/cookie_jar.rb",
|
42
|
-
"lib/spidr/events.rb",
|
43
|
-
"lib/spidr/extensions.rb",
|
44
|
-
"lib/spidr/extensions/uri.rb",
|
45
|
-
"lib/spidr/filters.rb",
|
46
|
-
"lib/spidr/page.rb",
|
47
|
-
"lib/spidr/rules.rb",
|
48
|
-
"lib/spidr/sanitizers.rb",
|
49
|
-
"lib/spidr/session_cache.rb",
|
50
|
-
"lib/spidr/spidr.rb",
|
51
|
-
"lib/spidr/version.rb",
|
52
|
-
"spec/actions_spec.rb",
|
53
|
-
"spec/agent_spec.rb",
|
54
|
-
"spec/auth_store_spec.rb",
|
55
|
-
"spec/cookie_jar_spec.rb",
|
56
|
-
"spec/extensions/uri_spec.rb",
|
57
|
-
"spec/filters_spec.rb",
|
58
|
-
"spec/helpers/history.rb",
|
59
|
-
"spec/helpers/page.rb",
|
60
|
-
"spec/helpers/wsoc.rb",
|
61
|
-
"spec/page_examples.rb",
|
62
|
-
"spec/page_spec.rb",
|
63
|
-
"spec/rules_spec.rb",
|
64
|
-
"spec/sanitizers_spec.rb",
|
65
|
-
"spec/session_cache.rb",
|
66
|
-
"spec/spec_helper.rb",
|
67
|
-
"spec/spidr_spec.rb",
|
68
|
-
"spidr.gemspec"
|
69
|
-
]
|
70
|
-
s.has_rdoc = %q{yard}
|
71
|
-
s.homepage = %q{http://github.com/postmodern/spidr}
|
72
|
-
s.licenses = ["MIT"]
|
73
|
-
s.require_paths = ["lib"]
|
74
|
-
s.rubygems_version = %q{1.3.7}
|
75
|
-
s.summary = %q{A versatile Ruby web spidering library}
|
76
|
-
s.test_files = [
|
77
|
-
"spec/actions_spec.rb",
|
78
|
-
"spec/agent_spec.rb",
|
79
|
-
"spec/auth_store_spec.rb",
|
80
|
-
"spec/cookie_jar_spec.rb",
|
81
|
-
"spec/extensions/uri_spec.rb",
|
82
|
-
"spec/filters_spec.rb",
|
83
|
-
"spec/helpers/history.rb",
|
84
|
-
"spec/helpers/page.rb",
|
85
|
-
"spec/helpers/wsoc.rb",
|
86
|
-
"spec/page_examples.rb",
|
87
|
-
"spec/page_spec.rb",
|
88
|
-
"spec/rules_spec.rb",
|
89
|
-
"spec/sanitizers_spec.rb",
|
90
|
-
"spec/session_cache.rb",
|
91
|
-
"spec/spec_helper.rb",
|
92
|
-
"spec/spidr_spec.rb"
|
93
|
-
]
|
94
|
-
|
95
|
-
if s.respond_to? :specification_version then
|
96
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
97
|
-
s.specification_version = 3
|
98
|
-
|
99
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
100
|
-
s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
|
101
|
-
s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
|
102
|
-
s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
|
103
|
-
else
|
104
|
-
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
105
|
-
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
106
|
-
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
107
|
-
end
|
108
|
-
else
|
109
|
-
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
110
|
-
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
111
|
-
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
3
|
+
begin
|
4
|
+
Ore::Specification.new do |gemspec|
|
5
|
+
# custom logic here
|
6
|
+
end
|
7
|
+
rescue NameError
|
8
|
+
begin
|
9
|
+
require 'ore/specification'
|
10
|
+
retry
|
11
|
+
rescue LoadError
|
12
|
+
STDERR.puts "The '#{__FILE__}' file requires Ore."
|
13
|
+
STDERR.puts "Run `gem install ore-core` to install Ore."
|
112
14
|
end
|
113
15
|
end
|
114
|
-
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 2
|
8
|
-
- 7
|
9
|
-
version: 0.2.7
|
4
|
+
prerelease:
|
5
|
+
version: 0.3.0
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Postmodern
|
@@ -14,74 +10,59 @@ autorequire:
|
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
12
|
|
17
|
-
date:
|
18
|
-
default_executable:
|
13
|
+
date: 2011-04-14 00:00:00 Z
|
19
14
|
dependencies:
|
20
15
|
- !ruby/object:Gem::Dependency
|
21
|
-
name:
|
16
|
+
name: nokogiri
|
22
17
|
requirement: &id001 !ruby/object:Gem::Requirement
|
23
18
|
none: false
|
24
19
|
requirements:
|
25
20
|
- - ~>
|
26
21
|
- !ruby/object:Gem::Version
|
27
|
-
|
28
|
-
|
29
|
-
- 8
|
30
|
-
- 7
|
31
|
-
version: 0.8.7
|
32
|
-
type: :development
|
22
|
+
version: "1.3"
|
23
|
+
type: :runtime
|
33
24
|
prerelease: false
|
34
25
|
version_requirements: *id001
|
35
26
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
27
|
+
name: bundler
|
37
28
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
29
|
none: false
|
39
30
|
requirements:
|
40
31
|
- - ~>
|
41
32
|
- !ruby/object:Gem::Version
|
42
|
-
|
43
|
-
- 1
|
44
|
-
- 4
|
45
|
-
- 0
|
46
|
-
version: 1.4.0
|
33
|
+
version: 1.0.0
|
47
34
|
type: :development
|
48
35
|
prerelease: false
|
49
36
|
version_requirements: *id002
|
50
37
|
- !ruby/object:Gem::Dependency
|
51
|
-
name:
|
38
|
+
name: yard
|
52
39
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
40
|
none: false
|
54
41
|
requirements:
|
55
42
|
- - ~>
|
56
43
|
- !ruby/object:Gem::Version
|
57
|
-
|
58
|
-
- 1
|
59
|
-
- 3
|
60
|
-
- 0
|
61
|
-
version: 1.3.0
|
44
|
+
version: 0.6.0
|
62
45
|
type: :development
|
63
46
|
prerelease: false
|
64
47
|
version_requirements: *id003
|
65
48
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
66
|
-
email:
|
49
|
+
email:
|
50
|
+
- postmodern.mod3@gmail.com
|
67
51
|
executables: []
|
68
52
|
|
69
53
|
extensions: []
|
70
54
|
|
71
55
|
extra_rdoc_files:
|
72
|
-
- ChangeLog.md
|
73
|
-
- LICENSE.txt
|
74
56
|
- README.md
|
75
57
|
files:
|
76
|
-
- .
|
77
|
-
- .specopts
|
58
|
+
- .rspec
|
78
59
|
- .yardopts
|
79
60
|
- ChangeLog.md
|
80
61
|
- Gemfile
|
81
|
-
- Gemfile.lock
|
82
62
|
- LICENSE.txt
|
83
63
|
- README.md
|
84
64
|
- Rakefile
|
65
|
+
- gemspec.yml
|
85
66
|
- lib/spidr.rb
|
86
67
|
- lib/spidr/actions.rb
|
87
68
|
- lib/spidr/actions/actions.rb
|
@@ -93,11 +74,14 @@ files:
|
|
93
74
|
- lib/spidr/agent.rb
|
94
75
|
- lib/spidr/auth_credential.rb
|
95
76
|
- lib/spidr/auth_store.rb
|
77
|
+
- lib/spidr/body.rb
|
96
78
|
- lib/spidr/cookie_jar.rb
|
97
79
|
- lib/spidr/events.rb
|
98
80
|
- lib/spidr/extensions.rb
|
99
81
|
- lib/spidr/extensions/uri.rb
|
100
82
|
- lib/spidr/filters.rb
|
83
|
+
- lib/spidr/headers.rb
|
84
|
+
- lib/spidr/links.rb
|
101
85
|
- lib/spidr/page.rb
|
102
86
|
- lib/spidr/rules.rb
|
103
87
|
- lib/spidr/sanitizers.rb
|
@@ -121,7 +105,6 @@ files:
|
|
121
105
|
- spec/spec_helper.rb
|
122
106
|
- spec/spidr_spec.rb
|
123
107
|
- spidr.gemspec
|
124
|
-
has_rdoc: yard
|
125
108
|
homepage: http://github.com/postmodern/spidr
|
126
109
|
licenses:
|
127
110
|
- MIT
|
@@ -135,39 +118,28 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
135
118
|
requirements:
|
136
119
|
- - ">="
|
137
120
|
- !ruby/object:Gem::Version
|
138
|
-
hash: 4533863298463290280
|
139
|
-
segments:
|
140
|
-
- 0
|
141
121
|
version: "0"
|
142
122
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
123
|
none: false
|
144
124
|
requirements:
|
145
125
|
- - ">="
|
146
126
|
- !ruby/object:Gem::Version
|
147
|
-
|
148
|
-
- 0
|
149
|
-
version: "0"
|
127
|
+
version: 1.3.6
|
150
128
|
requirements: []
|
151
129
|
|
152
|
-
rubyforge_project:
|
153
|
-
rubygems_version: 1.
|
130
|
+
rubyforge_project: spidr
|
131
|
+
rubygems_version: 1.7.2
|
154
132
|
signing_key:
|
155
133
|
specification_version: 3
|
156
134
|
summary: A versatile Ruby web spidering library
|
157
135
|
test_files:
|
158
|
-
- spec/actions_spec.rb
|
159
136
|
- spec/agent_spec.rb
|
137
|
+
- spec/actions_spec.rb
|
138
|
+
- spec/rules_spec.rb
|
139
|
+
- spec/extensions/uri_spec.rb
|
160
140
|
- spec/auth_store_spec.rb
|
161
141
|
- spec/cookie_jar_spec.rb
|
162
|
-
- spec/extensions/uri_spec.rb
|
163
142
|
- spec/filters_spec.rb
|
164
|
-
- spec/helpers/history.rb
|
165
|
-
- spec/helpers/page.rb
|
166
|
-
- spec/helpers/wsoc.rb
|
167
|
-
- spec/page_examples.rb
|
168
|
-
- spec/page_spec.rb
|
169
|
-
- spec/rules_spec.rb
|
170
143
|
- spec/sanitizers_spec.rb
|
171
|
-
- spec/session_cache.rb
|
172
|
-
- spec/spec_helper.rb
|
173
144
|
- spec/spidr_spec.rb
|
145
|
+
- spec/page_spec.rb
|