spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,82 @@
1
+ require 'rspec'
2
+
3
+ shared_examples "includes Spidr::Settings::Proxy" do
4
+ let(:proxy_host) { 'proxy.example.com' }
5
+ let(:proxy_port) { 9999 }
6
+ let(:proxy) { Spidr::Proxy.new(host: proxy_host, port: proxy_port) }
7
+
8
+ describe "proxy" do
9
+ context "when @proxy is not set" do
10
+ before do
11
+ subject.instance_variable_set(:"@proxy",nil)
12
+ end
13
+
14
+ it "should return the disabled proxy" do
15
+ expect(subject.proxy).to be_disabled
16
+ end
17
+
18
+ it "should retain the default value" do
19
+ expect(subject.proxy.object_id).to be subject.proxy.object_id
20
+ end
21
+ end
22
+
23
+ context "when @proxy is set" do
24
+ before do
25
+ subject.instance_variable_set(:"@proxy",proxy)
26
+ end
27
+
28
+ it "should return the set @proxy" do
29
+ expect(subject.proxy).to be proxy
30
+ end
31
+ end
32
+ end
33
+
34
+ describe "proxy=" do
35
+ context "when given a Proxy object" do
36
+ let(:proxy) { Proxy.new(host: proxy_host, port: proxy_port) }
37
+
38
+ before do
39
+ subject.proxy = proxy
40
+ end
41
+
42
+ it "should save it" do
43
+ expect(subject.proxy).to be proxy
44
+ end
45
+ end
46
+
47
+ context "when given a Hash" do
48
+ before do
49
+ subject.proxy = {host: proxy_host, port: proxy_port}
50
+ end
51
+
52
+ it "should create a new Proxy object" do
53
+ expect(subject.proxy).to be_kind_of(Proxy)
54
+ expect(subject.proxy[:host]).to be proxy_host
55
+ expect(subject.proxy[:port]).to be proxy_port
56
+ end
57
+ end
58
+
59
+ context "when given nil" do
60
+ before do
61
+ subject.proxy = nil
62
+ end
63
+
64
+ it "should leave an empty proxy" do
65
+ expect(subject.proxy).to be_kind_of(Proxy)
66
+ expect(subject.proxy[:host]).to be_nil
67
+ end
68
+ end
69
+ end
70
+
71
+ describe "disable_proxy!" do
72
+ before do
73
+ subject.proxy = proxy
74
+
75
+ subject.disable_proxy!
76
+ end
77
+
78
+ it "should reset the proxy" do
79
+ expect(subject.proxy).to be_disabled
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,93 @@
1
+ require 'rspec'
2
+
3
+ shared_examples_for "includes Spidr::Settings::Timeouts" do
4
+ describe "read_timeout" do
5
+ context "default value" do
6
+ it { expect(subject.read_timeout).to be_nil }
7
+ end
8
+ end
9
+
10
+ describe "read_timeout=" do
11
+ let(:value) { 5 }
12
+
13
+ before { subject.read_timeout = value }
14
+
15
+ it "should update read_timeout" do
16
+ expect(subject.read_timeout).to be == value
17
+ end
18
+
19
+ after { subject.read_timeout = nil }
20
+ end
21
+
22
+ describe "open_timeout" do
23
+ context "default value" do
24
+ it { expect(subject.open_timeout).to be_nil }
25
+ end
26
+ end
27
+
28
+ describe "open_timeout=" do
29
+ let(:value) { 5 }
30
+
31
+ before { subject.open_timeout = value }
32
+
33
+ it "should update open_timeout" do
34
+ expect(subject.open_timeout).to be == value
35
+ end
36
+
37
+ after { subject.open_timeout = nil }
38
+ end
39
+
40
+ describe "ssl_timeout" do
41
+ context "default value" do
42
+ it { expect(subject.ssl_timeout).to be_nil }
43
+ end
44
+ end
45
+
46
+ describe "ssl_timeout=" do
47
+ let(:value) { 5 }
48
+
49
+ before { subject.ssl_timeout = value }
50
+
51
+ it "should update ssl_timeout" do
52
+ expect(subject.ssl_timeout).to be == value
53
+ end
54
+
55
+ after { subject.ssl_timeout = nil }
56
+ end
57
+
58
+ describe "continue_timeout" do
59
+ context "default value" do
60
+ it { expect(subject.continue_timeout).to be_nil }
61
+ end
62
+ end
63
+
64
+ describe "continue_timeout=" do
65
+ let(:value) { 5 }
66
+
67
+ before { subject.continue_timeout = value }
68
+
69
+ it "should update continue_timeout" do
70
+ expect(subject.continue_timeout).to be == value
71
+ end
72
+
73
+ after { subject.continue_timeout = nil }
74
+ end
75
+
76
+ describe "keep_alive_timeout" do
77
+ context "default value" do
78
+ it { expect(subject.keep_alive_timeout).to be_nil }
79
+ end
80
+ end
81
+
82
+ describe "keep_alive_timeout=" do
83
+ let(:value) { 5 }
84
+
85
+ before { subject.keep_alive_timeout = value }
86
+
87
+ it "should update keep_alive_timeout" do
88
+ expect(subject.keep_alive_timeout).to be == value
89
+ end
90
+
91
+ after { subject.keep_alive_timeout = nil }
92
+ end
93
+ end
@@ -0,0 +1,25 @@
1
+ require 'rspec'
2
+
3
+ shared_examples_for "includes Spidr::Settings::UserAgent" do
4
+ describe "user_agent" do
5
+ context "default value" do
6
+ it { expect(subject.user_agent).to be_nil }
7
+ end
8
+ end
9
+
10
+ describe "user_agent=" do
11
+ let(:user_agent) { 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405' }
12
+
13
+ before do
14
+ subject.user_agent = user_agent
15
+ end
16
+
17
+ it "should update the user_agent" do
18
+ expect(subject.user_agent).to be == user_agent
19
+ end
20
+
21
+ after do
22
+ subject.user_agent = nil
23
+ end
24
+ end
25
+ end
@@ -1,39 +1,16 @@
1
1
  require 'spidr'
2
2
 
3
3
  require 'spec_helper'
4
+ require 'settings/proxy_examples'
5
+ require 'settings/timeouts_examples'
6
+ require 'settings/user_agent_examples'
4
7
 
5
8
  describe Spidr do
6
9
  it "should have a VERSION constant" do
7
10
  expect(subject.const_defined?('VERSION')).to eq(true)
8
11
  end
9
12
 
10
- describe "proxy" do
11
- after(:all) do
12
- Spidr.disable_proxy!
13
- end
14
-
15
- it "should not have proxy settings by default" do
16
- expect(subject.proxy[:host]).to be_nil
17
- end
18
-
19
- it "should allow setting new proxy settings" do
20
- subject.proxy = {host: 'example.com', port: 8010}
21
-
22
- expect(subject.proxy[:host]).to eq('example.com')
23
- expect(subject.proxy[:port]).to eq(8010)
24
- end
25
-
26
- it "should default the :port option of new proxy settings" do
27
- subject.proxy = {host: 'example.com'}
28
-
29
- expect(subject.proxy[:host]).to eq('example.com')
30
- expect(subject.proxy[:port]).to eq(Spidr::COMMON_PROXY_PORT)
31
- end
32
-
33
- it "should allow disabling the proxy" do
34
- subject.disable_proxy!
35
-
36
- expect(subject.proxy[:host]).to be_nil
37
- end
38
- end
13
+ it_should_behave_like "includes Spidr::Settings::Proxy"
14
+ it_should_behave_like "includes Spidr::Settings::Timeouts"
15
+ it_should_behave_like "includes Spidr::Settings::UserAgent"
39
16
  end
@@ -2,130 +2,59 @@
2
2
 
3
3
  require 'yaml'
4
4
 
5
- Gem::Specification.new do |gemspec|
6
- root = File.dirname(__FILE__)
7
- lib_dir = File.join(root,'lib')
8
- files = if File.directory?('.git')
9
- `git ls-files`.split($/)
10
- elsif File.directory?('.hg')
11
- `hg manifest`.split($/)
12
- elsif File.directory?('.svn')
13
- `svn ls -R`.split($/).select { |path| File.file?(path) }
14
- else
15
- Dir['{**/}{.*,*}'].select { |path| File.file?(path) }
16
- end
5
+ Gem::Specification.new do |gem|
6
+ gemspec = YAML.load_file('gemspec.yml')
17
7
 
18
- filter_files = lambda { |paths|
19
- case paths
20
- when Array
21
- (files & paths)
22
- when String
23
- (files & Dir[paths])
24
- end
25
- }
26
-
27
- version = {
28
- :file => 'spidr/version',
29
- :constant => 'Spidr::VERSION'
30
- }
31
-
32
- defaults = {
33
- 'name' => File.basename(root),
34
- 'files' => files,
35
- 'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
36
- 'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
37
- 'extra_doc_files' => filter_files['*.{txt,rdoc,md,markdown,tt,textile}'],
38
- }
8
+ gem.name = gemspec.fetch('name')
9
+ gem.version = gemspec.fetch('version') do
10
+ lib_dir = File.join(File.dirname(__FILE__),'lib')
11
+ $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
39
12
 
40
- metadata = defaults.merge(YAML.load_file('gemspec.yml'))
13
+ require 'spidr/version'
14
+ Spidr::VERSION
15
+ end
41
16
 
42
- gemspec.name = metadata.fetch('name',defaults[:name])
43
- gemspec.version = if metadata['version']
44
- metadata['version']
45
- else
46
- $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
17
+ gem.summary = gemspec['summary']
18
+ gem.description = gemspec['description']
19
+ gem.licenses = Array(gemspec['license'])
20
+ gem.authors = Array(gemspec['authors'])
21
+ gem.email = gemspec['email']
22
+ gem.homepage = gemspec['homepage']
47
23
 
48
- require version[:file]
49
- eval(version[:constant])
50
- end
24
+ glob = lambda { |patterns| gem.files & Dir[*patterns] }
51
25
 
52
- gemspec.summary = metadata.fetch('summary',metadata['description'])
53
- gemspec.description = metadata.fetch('description',metadata['summary'])
26
+ gem.files = `git ls-files`.split($/)
27
+ gem.files = glob[gemspec['files']] if gemspec['files']
54
28
 
55
- case metadata['license']
56
- when Array
57
- gemspec.licenses = metadata['license']
58
- when String
59
- gemspec.license = metadata['license']
29
+ gem.executables = gemspec.fetch('executables') do
30
+ glob['bin/*'].map { |path| File.basename(path) }
60
31
  end
32
+ gem.default_executable = gem.executables.first if Gem::VERSION < '1.7.'
61
33
 
62
- case metadata['authors']
63
- when Array
64
- gemspec.authors = metadata['authors']
65
- when String
66
- gemspec.author = metadata['authors']
67
- end
68
-
69
- gemspec.email = metadata['email']
70
- gemspec.homepage = metadata['homepage']
34
+ gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
35
+ gem.test_files = glob[gemspec['test_files'] || '{test/{**/}*_test.rb']
36
+ gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
71
37
 
72
- case metadata['require_paths']
73
- when Array
74
- gemspec.require_paths = metadata['require_paths']
75
- when String
76
- gemspec.require_path = metadata['require_paths']
77
- end
38
+ gem.require_paths = Array(gemspec.fetch('require_paths') {
39
+ %w[ext lib].select { |dir| File.directory?(dir) }
40
+ })
78
41
 
79
- gemspec.files = filter_files[metadata['files']]
42
+ gem.requirements = Array(gemspec['requirements'])
43
+ gem.required_ruby_version = gemspec['required_ruby_version']
44
+ gem.required_rubygems_version = gemspec['required_rubygems_version']
45
+ gem.post_install_message = gemspec['post_install_message']
80
46
 
81
- gemspec.executables = metadata['executables']
82
- gemspec.extensions = metadata['extensions']
83
-
84
- if Gem::VERSION < '1.7.'
85
- gemspec.default_executable = gemspec.executables.first
86
- end
87
-
88
- gemspec.test_files = filter_files[metadata['test_files']]
89
-
90
- unless gemspec.files.include?('.document')
91
- gemspec.extra_rdoc_files = metadata['extra_doc_files']
92
- end
93
-
94
- gemspec.post_install_message = metadata['post_install_message']
95
- gemspec.requirements = metadata['requirements']
96
-
97
- if gemspec.respond_to?(:required_ruby_version=)
98
- gemspec.required_ruby_version = metadata['required_ruby_version']
99
- end
100
-
101
- if gemspec.respond_to?(:required_rubygems_version=)
102
- gemspec.required_rubygems_version = metadata['required_rubygems_version']
103
- end
104
-
105
- parse_versions = lambda { |versions|
106
- case versions
107
- when Array
108
- versions.map { |v| v.to_s }
109
- when String
110
- versions.split(/,\s*/)
111
- end
112
- }
113
-
114
- if metadata['dependencies']
115
- metadata['dependencies'].each do |name,versions|
116
- gemspec.add_dependency(name,parse_versions[versions])
117
- end
118
- end
47
+ split = lambda { |string| string.split(/,\s*/) }
119
48
 
120
- if metadata['runtime_dependencies']
121
- metadata['runtime_dependencies'].each do |name,versions|
122
- gemspec.add_runtime_dependency(name,parse_versions[versions])
49
+ if gemspec['dependencies']
50
+ gemspec['dependencies'].each do |name,versions|
51
+ gem.add_dependency(name,split[versions])
123
52
  end
124
53
  end
125
54
 
126
- if metadata['development_dependencies']
127
- metadata['development_dependencies'].each do |name,versions|
128
- gemspec.add_development_dependency(name,parse_versions[versions])
55
+ if gemspec['development_dependencies']
56
+ gemspec['development_dependencies'].each do |name,versions|
57
+ gem.add_development_dependency(name,split[versions])
129
58
  end
130
59
  end
131
60
  end
metadata CHANGED
@@ -1,41 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-04 00:00:00.000000000 Z
11
+ date: 2016-08-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: '1.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.0'
41
41
  description: Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -49,9 +49,10 @@ extra_rdoc_files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  files:
52
- - ".gitignore"
53
- - ".rspec"
54
- - ".yardopts"
52
+ - .gitignore
53
+ - .rspec
54
+ - .travis.yml
55
+ - .yardopts
55
56
  - ChangeLog.md
56
57
  - Gemfile
57
58
  - LICENSE.txt
@@ -63,6 +64,7 @@ files:
63
64
  - lib/spidr/agent/actions.rb
64
65
  - lib/spidr/agent/events.rb
65
66
  - lib/spidr/agent/filters.rb
67
+ - lib/spidr/agent/robots.rb
66
68
  - lib/spidr/agent/sanitizers.rb
67
69
  - lib/spidr/auth_credential.rb
68
70
  - lib/spidr/auth_store.rb
@@ -70,11 +72,17 @@ files:
70
72
  - lib/spidr/extensions.rb
71
73
  - lib/spidr/extensions/uri.rb
72
74
  - lib/spidr/page.rb
73
- - lib/spidr/page/body.rb
74
- - lib/spidr/page/headers.rb
75
- - lib/spidr/page/links.rb
75
+ - lib/spidr/page/content_types.rb
76
+ - lib/spidr/page/cookies.rb
77
+ - lib/spidr/page/html.rb
78
+ - lib/spidr/page/status_codes.rb
79
+ - lib/spidr/proxy.rb
76
80
  - lib/spidr/rules.rb
77
81
  - lib/spidr/session_cache.rb
82
+ - lib/spidr/settings.rb
83
+ - lib/spidr/settings/proxy.rb
84
+ - lib/spidr/settings/timeouts.rb
85
+ - lib/spidr/settings/user_agent.rb
78
86
  - lib/spidr/spidr.rb
79
87
  - lib/spidr/version.rb
80
88
  - spec/agent/actions_spec.rb
@@ -83,14 +91,20 @@ files:
83
91
  - spec/agent_spec.rb
84
92
  - spec/auth_store_spec.rb
85
93
  - spec/cookie_jar_spec.rb
94
+ - spec/example_app.rb
95
+ - spec/example_page.rb
86
96
  - spec/extensions/uri_spec.rb
87
- - spec/helpers/history.rb
88
- - spec/helpers/page.rb
89
- - spec/helpers/wsoc.rb
90
- - spec/page_examples.rb
97
+ - spec/page/content_types_spec.rb
98
+ - spec/page/cookies_spec.rb
99
+ - spec/page/html_spec.rb
100
+ - spec/page/status_codes_spec.rb
91
101
  - spec/page_spec.rb
102
+ - spec/proxy_spec.rb
92
103
  - spec/rules_spec.rb
93
104
  - spec/session_cache.rb
105
+ - spec/settings/proxy_examples.rb
106
+ - spec/settings/timeouts_examples.rb
107
+ - spec/settings/user_agent_examples.rb
94
108
  - spec/spec_helper.rb
95
109
  - spec/spidr_spec.rb
96
110
  - spidr.gemspec
@@ -104,28 +118,18 @@ require_paths:
104
118
  - lib
105
119
  required_ruby_version: !ruby/object:Gem::Requirement
106
120
  requirements:
107
- - - ">="
121
+ - - '>='
108
122
  - !ruby/object:Gem::Version
109
- version: 1.9.1
123
+ version: 2.0.0
110
124
  required_rubygems_version: !ruby/object:Gem::Requirement
111
125
  requirements:
112
- - - ">="
126
+ - - '>='
113
127
  - !ruby/object:Gem::Version
114
128
  version: '0'
115
129
  requirements: []
116
130
  rubyforge_project:
117
- rubygems_version: 2.4.7
131
+ rubygems_version: 2.0.14.1
118
132
  signing_key:
119
133
  specification_version: 4
120
134
  summary: A versatile Ruby web spidering library
121
- test_files:
122
- - spec/agent/actions_spec.rb
123
- - spec/agent/filters_spec.rb
124
- - spec/agent/sanitizers_spec.rb
125
- - spec/agent_spec.rb
126
- - spec/auth_store_spec.rb
127
- - spec/cookie_jar_spec.rb
128
- - spec/extensions/uri_spec.rb
129
- - spec/page_spec.rb
130
- - spec/rules_spec.rb
131
- - spec/spidr_spec.rb
135
+ test_files: []