spidr 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +56 -31
- data/Gemfile +7 -21
- data/LICENSE.txt +1 -2
- data/README.md +7 -6
- data/Rakefile +13 -23
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +1 -1
- data/lib/spidr/agent.rb +21 -6
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/body.rb +99 -0
- data/lib/spidr/extensions/uri.rb +14 -7
- data/lib/spidr/headers.rb +323 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +32 -536
- data/lib/spidr/sanitizers.rb +3 -3
- data/lib/spidr/session_cache.rb +1 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/actions_spec.rb +6 -8
- data/spec/auth_store_spec.rb +28 -28
- data/spec/cookie_jar_spec.rb +49 -60
- data/spec/extensions/uri_spec.rb +4 -0
- data/spec/filters_spec.rb +8 -0
- data/spec/page_spec.rb +0 -7
- data/spec/rules_spec.rb +8 -6
- data/spec/sanitizers_spec.rb +10 -16
- data/spec/spec_helper.rb +1 -12
- data/spec/spidr_spec.rb +11 -11
- data/spidr.gemspec +11 -110
- metadata +24 -52
- data/.gitignore +0 -9
- data/.specopts +0 -1
- data/Gemfile.lock +0 -39
data/spec/page_spec.rb
CHANGED
@@ -100,13 +100,6 @@ describe Page do
|
|
100
100
|
end
|
101
101
|
|
102
102
|
it "should provide access to the raw Cookie" do
|
103
|
-
cookie = @page.raw_cookie
|
104
|
-
|
105
|
-
cookie.should_not be_nil
|
106
|
-
cookie.should_not be_empty
|
107
|
-
end
|
108
|
-
|
109
|
-
it "should still support the deprecated #cookie method" do
|
110
103
|
cookie = @page.cookie
|
111
104
|
|
112
105
|
cookie.should_not be_nil
|
data/spec/rules_spec.rb
CHANGED
@@ -3,39 +3,41 @@ require 'spidr/rules'
|
|
3
3
|
require 'spec_helper'
|
4
4
|
|
5
5
|
describe Rules do
|
6
|
+
subject { Rules }
|
7
|
+
|
6
8
|
it "should accept data based on acceptance data" do
|
7
|
-
rules =
|
9
|
+
rules = subject.new(:accept => [1])
|
8
10
|
|
9
11
|
rules.accept?(1).should == true
|
10
12
|
end
|
11
13
|
|
12
14
|
it "should accept data based on acceptance regexps" do
|
13
|
-
rules =
|
15
|
+
rules = subject.new(:accept => [/1/])
|
14
16
|
|
15
17
|
rules.accept?('1').should == true
|
16
18
|
end
|
17
19
|
|
18
20
|
it "should match non-Strings using acceptance regexps" do
|
19
|
-
rules =
|
21
|
+
rules = subject.new(:accept => [/1/])
|
20
22
|
|
21
23
|
rules.accept?(1).should == true
|
22
24
|
end
|
23
25
|
|
24
26
|
it "should accept data using acceptance lambdas" do
|
25
|
-
rules =
|
27
|
+
rules = subject.new(:accept => [lambda { |data| data > 2 }])
|
26
28
|
|
27
29
|
rules.accept?(3).should == true
|
28
30
|
end
|
29
31
|
|
30
32
|
it "should reject data that does not match any acceptance patterns" do
|
31
|
-
rules =
|
33
|
+
rules = subject.new(:accept => [1, 2, 3])
|
32
34
|
|
33
35
|
rules.accept?(2).should == true
|
34
36
|
rules.accept?(4).should == false
|
35
37
|
end
|
36
38
|
|
37
39
|
it "should accept data that does not match any rejection patterns" do
|
38
|
-
rules =
|
40
|
+
rules = subject.new(:reject => [1, 2, 3])
|
39
41
|
|
40
42
|
rules.accept?(2).should == false
|
41
43
|
rules.accept?(4).should == true
|
data/spec/sanitizers_spec.rb
CHANGED
@@ -5,61 +5,55 @@ require 'spec_helper'
|
|
5
5
|
|
6
6
|
describe Sanitizers do
|
7
7
|
describe "sanitize_url" do
|
8
|
-
|
9
|
-
|
10
|
-
@url = 'http://host.com'
|
11
|
-
end
|
8
|
+
let(:url) { 'http://host.com' }
|
9
|
+
before(:all) { @agent = Agent.new }
|
12
10
|
|
13
11
|
it "should sanitize URLs" do
|
14
12
|
agent = Agent.new
|
15
|
-
clean_url = agent.sanitize_url(URI(
|
13
|
+
clean_url = agent.sanitize_url(URI(url))
|
16
14
|
|
17
15
|
clean_url.host.should == 'host.com'
|
18
16
|
end
|
19
17
|
|
20
18
|
it "should sanitize URLs given as Strings" do
|
21
19
|
agent = Agent.new
|
22
|
-
clean_url = agent.sanitize_url(
|
20
|
+
clean_url = agent.sanitize_url(url)
|
23
21
|
|
24
22
|
clean_url.host.should == 'host.com'
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
28
26
|
describe "strip_fragments" do
|
29
|
-
|
30
|
-
@url = URI("http://host.com/page#lol")
|
31
|
-
end
|
27
|
+
let(:url) { URI("http://host.com/page#lol") }
|
32
28
|
|
33
29
|
it "should strip fragment components by default" do
|
34
30
|
agent = Agent.new
|
35
|
-
clean_url = agent.sanitize_url(
|
31
|
+
clean_url = agent.sanitize_url(url)
|
36
32
|
|
37
33
|
clean_url.fragment.should be_nil
|
38
34
|
end
|
39
35
|
|
40
36
|
it "should allow perserving fragment components" do
|
41
37
|
agent = Agent.new(:strip_fragments => false)
|
42
|
-
clean_url = agent.sanitize_url(
|
38
|
+
clean_url = agent.sanitize_url(url)
|
43
39
|
|
44
40
|
clean_url.fragment.should == 'lol'
|
45
41
|
end
|
46
42
|
end
|
47
43
|
|
48
44
|
describe "strip_query" do
|
49
|
-
|
50
|
-
@url = URI("http://host.com/page?x=1")
|
51
|
-
end
|
45
|
+
let(:url) { URI("http://host.com/page?x=1") }
|
52
46
|
|
53
47
|
it "should not strip query components by default" do
|
54
48
|
agent = Agent.new
|
55
|
-
clean_url = agent.sanitize_url(
|
49
|
+
clean_url = agent.sanitize_url(url)
|
56
50
|
|
57
51
|
clean_url.query.should == 'x=1'
|
58
52
|
end
|
59
53
|
|
60
54
|
it "should allow stripping of query components" do
|
61
55
|
agent = Agent.new(:strip_query => true)
|
62
|
-
clean_url = agent.sanitize_url(
|
56
|
+
clean_url = agent.sanitize_url(url)
|
63
57
|
|
64
58
|
clean_url.query.should be_nil
|
65
59
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,15 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'bundler'
|
3
|
-
|
4
|
-
begin
|
5
|
-
Bundler.setup(:runtime, :test)
|
6
|
-
rescue Bundler::BundlerError => e
|
7
|
-
STDERR.puts e.message
|
8
|
-
STDERR.puts "Run `bundle install` to install missing gems"
|
9
|
-
exit e.status_code
|
10
|
-
end
|
11
|
-
|
12
|
-
require 'spec'
|
1
|
+
require 'rspec'
|
13
2
|
require 'spidr/version'
|
14
3
|
|
15
4
|
include Spidr
|
data/spec/spidr_spec.rb
CHANGED
@@ -4,36 +4,36 @@ require 'spec_helper'
|
|
4
4
|
|
5
5
|
describe Spidr do
|
6
6
|
it "should have a VERSION constant" do
|
7
|
-
|
7
|
+
subject.const_defined?('VERSION').should == true
|
8
8
|
end
|
9
9
|
|
10
10
|
describe "proxy" do
|
11
11
|
after(:all) do
|
12
|
-
|
12
|
+
subject.disable_proxy!
|
13
13
|
end
|
14
14
|
|
15
15
|
it "should not have proxy settings by default" do
|
16
|
-
|
16
|
+
subject.proxy[:host].should be_nil
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should allow setting new proxy settings" do
|
20
|
-
|
20
|
+
subject.proxy = {:host => 'example.com', :port => 8010}
|
21
21
|
|
22
|
-
|
23
|
-
|
22
|
+
subject.proxy[:host].should == 'example.com'
|
23
|
+
subject.proxy[:port].should == 8010
|
24
24
|
end
|
25
25
|
|
26
26
|
it "should default the :port option of new proxy settings" do
|
27
|
-
|
27
|
+
subject.proxy = {:host => 'example.com'}
|
28
28
|
|
29
|
-
|
30
|
-
|
29
|
+
subject.proxy[:host].should == 'example.com'
|
30
|
+
subject.proxy[:port].should == Spidr::COMMON_PROXY_PORT
|
31
31
|
end
|
32
32
|
|
33
33
|
it "should allow disabling the proxy" do
|
34
|
-
|
34
|
+
subject.disable_proxy!
|
35
35
|
|
36
|
-
|
36
|
+
subject.proxy[:host].should be_nil
|
37
37
|
end
|
38
38
|
end
|
39
39
|
end
|
data/spidr.gemspec
CHANGED
@@ -1,114 +1,15 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
1
|
# -*- encoding: utf-8 -*-
|
5
2
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
"
|
17
|
-
"LICENSE.txt",
|
18
|
-
"README.md"
|
19
|
-
]
|
20
|
-
s.files = [
|
21
|
-
".gitignore",
|
22
|
-
".specopts",
|
23
|
-
".yardopts",
|
24
|
-
"ChangeLog.md",
|
25
|
-
"Gemfile",
|
26
|
-
"Gemfile.lock",
|
27
|
-
"LICENSE.txt",
|
28
|
-
"README.md",
|
29
|
-
"Rakefile",
|
30
|
-
"lib/spidr.rb",
|
31
|
-
"lib/spidr/actions.rb",
|
32
|
-
"lib/spidr/actions/actions.rb",
|
33
|
-
"lib/spidr/actions/exceptions.rb",
|
34
|
-
"lib/spidr/actions/exceptions/action.rb",
|
35
|
-
"lib/spidr/actions/exceptions/paused.rb",
|
36
|
-
"lib/spidr/actions/exceptions/skip_link.rb",
|
37
|
-
"lib/spidr/actions/exceptions/skip_page.rb",
|
38
|
-
"lib/spidr/agent.rb",
|
39
|
-
"lib/spidr/auth_credential.rb",
|
40
|
-
"lib/spidr/auth_store.rb",
|
41
|
-
"lib/spidr/cookie_jar.rb",
|
42
|
-
"lib/spidr/events.rb",
|
43
|
-
"lib/spidr/extensions.rb",
|
44
|
-
"lib/spidr/extensions/uri.rb",
|
45
|
-
"lib/spidr/filters.rb",
|
46
|
-
"lib/spidr/page.rb",
|
47
|
-
"lib/spidr/rules.rb",
|
48
|
-
"lib/spidr/sanitizers.rb",
|
49
|
-
"lib/spidr/session_cache.rb",
|
50
|
-
"lib/spidr/spidr.rb",
|
51
|
-
"lib/spidr/version.rb",
|
52
|
-
"spec/actions_spec.rb",
|
53
|
-
"spec/agent_spec.rb",
|
54
|
-
"spec/auth_store_spec.rb",
|
55
|
-
"spec/cookie_jar_spec.rb",
|
56
|
-
"spec/extensions/uri_spec.rb",
|
57
|
-
"spec/filters_spec.rb",
|
58
|
-
"spec/helpers/history.rb",
|
59
|
-
"spec/helpers/page.rb",
|
60
|
-
"spec/helpers/wsoc.rb",
|
61
|
-
"spec/page_examples.rb",
|
62
|
-
"spec/page_spec.rb",
|
63
|
-
"spec/rules_spec.rb",
|
64
|
-
"spec/sanitizers_spec.rb",
|
65
|
-
"spec/session_cache.rb",
|
66
|
-
"spec/spec_helper.rb",
|
67
|
-
"spec/spidr_spec.rb",
|
68
|
-
"spidr.gemspec"
|
69
|
-
]
|
70
|
-
s.has_rdoc = %q{yard}
|
71
|
-
s.homepage = %q{http://github.com/postmodern/spidr}
|
72
|
-
s.licenses = ["MIT"]
|
73
|
-
s.require_paths = ["lib"]
|
74
|
-
s.rubygems_version = %q{1.3.7}
|
75
|
-
s.summary = %q{A versatile Ruby web spidering library}
|
76
|
-
s.test_files = [
|
77
|
-
"spec/actions_spec.rb",
|
78
|
-
"spec/agent_spec.rb",
|
79
|
-
"spec/auth_store_spec.rb",
|
80
|
-
"spec/cookie_jar_spec.rb",
|
81
|
-
"spec/extensions/uri_spec.rb",
|
82
|
-
"spec/filters_spec.rb",
|
83
|
-
"spec/helpers/history.rb",
|
84
|
-
"spec/helpers/page.rb",
|
85
|
-
"spec/helpers/wsoc.rb",
|
86
|
-
"spec/page_examples.rb",
|
87
|
-
"spec/page_spec.rb",
|
88
|
-
"spec/rules_spec.rb",
|
89
|
-
"spec/sanitizers_spec.rb",
|
90
|
-
"spec/session_cache.rb",
|
91
|
-
"spec/spec_helper.rb",
|
92
|
-
"spec/spidr_spec.rb"
|
93
|
-
]
|
94
|
-
|
95
|
-
if s.respond_to? :specification_version then
|
96
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
97
|
-
s.specification_version = 3
|
98
|
-
|
99
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
100
|
-
s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
|
101
|
-
s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
|
102
|
-
s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
|
103
|
-
else
|
104
|
-
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
105
|
-
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
106
|
-
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
107
|
-
end
|
108
|
-
else
|
109
|
-
s.add_dependency(%q<rake>, ["~> 0.8.7"])
|
110
|
-
s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
|
111
|
-
s.add_dependency(%q<rspec>, ["~> 1.3.0"])
|
3
|
+
begin
|
4
|
+
Ore::Specification.new do |gemspec|
|
5
|
+
# custom logic here
|
6
|
+
end
|
7
|
+
rescue NameError
|
8
|
+
begin
|
9
|
+
require 'ore/specification'
|
10
|
+
retry
|
11
|
+
rescue LoadError
|
12
|
+
STDERR.puts "The '#{__FILE__}' file requires Ore."
|
13
|
+
STDERR.puts "Run `gem install ore-core` to install Ore."
|
112
14
|
end
|
113
15
|
end
|
114
|
-
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 2
|
8
|
-
- 7
|
9
|
-
version: 0.2.7
|
4
|
+
prerelease:
|
5
|
+
version: 0.3.0
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Postmodern
|
@@ -14,74 +10,59 @@ autorequire:
|
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
12
|
|
17
|
-
date:
|
18
|
-
default_executable:
|
13
|
+
date: 2011-04-14 00:00:00 Z
|
19
14
|
dependencies:
|
20
15
|
- !ruby/object:Gem::Dependency
|
21
|
-
name:
|
16
|
+
name: nokogiri
|
22
17
|
requirement: &id001 !ruby/object:Gem::Requirement
|
23
18
|
none: false
|
24
19
|
requirements:
|
25
20
|
- - ~>
|
26
21
|
- !ruby/object:Gem::Version
|
27
|
-
|
28
|
-
|
29
|
-
- 8
|
30
|
-
- 7
|
31
|
-
version: 0.8.7
|
32
|
-
type: :development
|
22
|
+
version: "1.3"
|
23
|
+
type: :runtime
|
33
24
|
prerelease: false
|
34
25
|
version_requirements: *id001
|
35
26
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
27
|
+
name: bundler
|
37
28
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
29
|
none: false
|
39
30
|
requirements:
|
40
31
|
- - ~>
|
41
32
|
- !ruby/object:Gem::Version
|
42
|
-
|
43
|
-
- 1
|
44
|
-
- 4
|
45
|
-
- 0
|
46
|
-
version: 1.4.0
|
33
|
+
version: 1.0.0
|
47
34
|
type: :development
|
48
35
|
prerelease: false
|
49
36
|
version_requirements: *id002
|
50
37
|
- !ruby/object:Gem::Dependency
|
51
|
-
name:
|
38
|
+
name: yard
|
52
39
|
requirement: &id003 !ruby/object:Gem::Requirement
|
53
40
|
none: false
|
54
41
|
requirements:
|
55
42
|
- - ~>
|
56
43
|
- !ruby/object:Gem::Version
|
57
|
-
|
58
|
-
- 1
|
59
|
-
- 3
|
60
|
-
- 0
|
61
|
-
version: 1.3.0
|
44
|
+
version: 0.6.0
|
62
45
|
type: :development
|
63
46
|
prerelease: false
|
64
47
|
version_requirements: *id003
|
65
48
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
66
|
-
email:
|
49
|
+
email:
|
50
|
+
- postmodern.mod3@gmail.com
|
67
51
|
executables: []
|
68
52
|
|
69
53
|
extensions: []
|
70
54
|
|
71
55
|
extra_rdoc_files:
|
72
|
-
- ChangeLog.md
|
73
|
-
- LICENSE.txt
|
74
56
|
- README.md
|
75
57
|
files:
|
76
|
-
- .
|
77
|
-
- .specopts
|
58
|
+
- .rspec
|
78
59
|
- .yardopts
|
79
60
|
- ChangeLog.md
|
80
61
|
- Gemfile
|
81
|
-
- Gemfile.lock
|
82
62
|
- LICENSE.txt
|
83
63
|
- README.md
|
84
64
|
- Rakefile
|
65
|
+
- gemspec.yml
|
85
66
|
- lib/spidr.rb
|
86
67
|
- lib/spidr/actions.rb
|
87
68
|
- lib/spidr/actions/actions.rb
|
@@ -93,11 +74,14 @@ files:
|
|
93
74
|
- lib/spidr/agent.rb
|
94
75
|
- lib/spidr/auth_credential.rb
|
95
76
|
- lib/spidr/auth_store.rb
|
77
|
+
- lib/spidr/body.rb
|
96
78
|
- lib/spidr/cookie_jar.rb
|
97
79
|
- lib/spidr/events.rb
|
98
80
|
- lib/spidr/extensions.rb
|
99
81
|
- lib/spidr/extensions/uri.rb
|
100
82
|
- lib/spidr/filters.rb
|
83
|
+
- lib/spidr/headers.rb
|
84
|
+
- lib/spidr/links.rb
|
101
85
|
- lib/spidr/page.rb
|
102
86
|
- lib/spidr/rules.rb
|
103
87
|
- lib/spidr/sanitizers.rb
|
@@ -121,7 +105,6 @@ files:
|
|
121
105
|
- spec/spec_helper.rb
|
122
106
|
- spec/spidr_spec.rb
|
123
107
|
- spidr.gemspec
|
124
|
-
has_rdoc: yard
|
125
108
|
homepage: http://github.com/postmodern/spidr
|
126
109
|
licenses:
|
127
110
|
- MIT
|
@@ -135,39 +118,28 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
135
118
|
requirements:
|
136
119
|
- - ">="
|
137
120
|
- !ruby/object:Gem::Version
|
138
|
-
hash: 4533863298463290280
|
139
|
-
segments:
|
140
|
-
- 0
|
141
121
|
version: "0"
|
142
122
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
123
|
none: false
|
144
124
|
requirements:
|
145
125
|
- - ">="
|
146
126
|
- !ruby/object:Gem::Version
|
147
|
-
|
148
|
-
- 0
|
149
|
-
version: "0"
|
127
|
+
version: 1.3.6
|
150
128
|
requirements: []
|
151
129
|
|
152
|
-
rubyforge_project:
|
153
|
-
rubygems_version: 1.
|
130
|
+
rubyforge_project: spidr
|
131
|
+
rubygems_version: 1.7.2
|
154
132
|
signing_key:
|
155
133
|
specification_version: 3
|
156
134
|
summary: A versatile Ruby web spidering library
|
157
135
|
test_files:
|
158
|
-
- spec/actions_spec.rb
|
159
136
|
- spec/agent_spec.rb
|
137
|
+
- spec/actions_spec.rb
|
138
|
+
- spec/rules_spec.rb
|
139
|
+
- spec/extensions/uri_spec.rb
|
160
140
|
- spec/auth_store_spec.rb
|
161
141
|
- spec/cookie_jar_spec.rb
|
162
|
-
- spec/extensions/uri_spec.rb
|
163
142
|
- spec/filters_spec.rb
|
164
|
-
- spec/helpers/history.rb
|
165
|
-
- spec/helpers/page.rb
|
166
|
-
- spec/helpers/wsoc.rb
|
167
|
-
- spec/page_examples.rb
|
168
|
-
- spec/page_spec.rb
|
169
|
-
- spec/rules_spec.rb
|
170
143
|
- spec/sanitizers_spec.rb
|
171
|
-
- spec/session_cache.rb
|
172
|
-
- spec/spec_helper.rb
|
173
144
|
- spec/spidr_spec.rb
|
145
|
+
- spec/page_spec.rb
|