spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,82 @@
1
+ require 'rspec'
2
+
3
+ shared_examples "includes Spidr::Settings::Proxy" do
4
+ let(:proxy_host) { 'proxy.example.com' }
5
+ let(:proxy_port) { 9999 }
6
+ let(:proxy) { Spidr::Proxy.new(host: proxy_host, port: proxy_port) }
7
+
8
+ describe "proxy" do
9
+ context "when @proxy is not set" do
10
+ before do
11
+ subject.instance_variable_set(:"@proxy",nil)
12
+ end
13
+
14
+ it "should return the disabled proxy" do
15
+ expect(subject.proxy).to be_disabled
16
+ end
17
+
18
+ it "should retain the default value" do
19
+ expect(subject.proxy.object_id).to be subject.proxy.object_id
20
+ end
21
+ end
22
+
23
+ context "when @proxy is set" do
24
+ before do
25
+ subject.instance_variable_set(:"@proxy",proxy)
26
+ end
27
+
28
+ it "should return the set @proxy" do
29
+ expect(subject.proxy).to be proxy
30
+ end
31
+ end
32
+ end
33
+
34
+ describe "proxy=" do
35
+ context "when given a Proxy object" do
36
+ let(:proxy) { Proxy.new(host: proxy_host, port: proxy_port) }
37
+
38
+ before do
39
+ subject.proxy = proxy
40
+ end
41
+
42
+ it "should save it" do
43
+ expect(subject.proxy).to be proxy
44
+ end
45
+ end
46
+
47
+ context "when given a Hash" do
48
+ before do
49
+ subject.proxy = {host: proxy_host, port: proxy_port}
50
+ end
51
+
52
+ it "should create a new Proxy object" do
53
+ expect(subject.proxy).to be_kind_of(Proxy)
54
+ expect(subject.proxy[:host]).to be proxy_host
55
+ expect(subject.proxy[:port]).to be proxy_port
56
+ end
57
+ end
58
+
59
+ context "when given nil" do
60
+ before do
61
+ subject.proxy = nil
62
+ end
63
+
64
+ it "should leave an empty proxy" do
65
+ expect(subject.proxy).to be_kind_of(Proxy)
66
+ expect(subject.proxy[:host]).to be_nil
67
+ end
68
+ end
69
+ end
70
+
71
+ describe "disable_proxy!" do
72
+ before do
73
+ subject.proxy = proxy
74
+
75
+ subject.disable_proxy!
76
+ end
77
+
78
+ it "should reset the proxy" do
79
+ expect(subject.proxy).to be_disabled
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,93 @@
1
+ require 'rspec'
2
+
3
+ shared_examples_for "includes Spidr::Settings::Timeouts" do
4
+ describe "read_timeout" do
5
+ context "default value" do
6
+ it { expect(subject.read_timeout).to be_nil }
7
+ end
8
+ end
9
+
10
+ describe "read_timeout=" do
11
+ let(:value) { 5 }
12
+
13
+ before { subject.read_timeout = value }
14
+
15
+ it "should update read_timeout" do
16
+ expect(subject.read_timeout).to be == value
17
+ end
18
+
19
+ after { subject.read_timeout = nil }
20
+ end
21
+
22
+ describe "open_timeout" do
23
+ context "default value" do
24
+ it { expect(subject.open_timeout).to be_nil }
25
+ end
26
+ end
27
+
28
+ describe "open_timeout=" do
29
+ let(:value) { 5 }
30
+
31
+ before { subject.open_timeout = value }
32
+
33
+ it "should update open_timeout" do
34
+ expect(subject.open_timeout).to be == value
35
+ end
36
+
37
+ after { subject.open_timeout = nil }
38
+ end
39
+
40
+ describe "ssl_timeout" do
41
+ context "default value" do
42
+ it { expect(subject.ssl_timeout).to be_nil }
43
+ end
44
+ end
45
+
46
+ describe "ssl_timeout=" do
47
+ let(:value) { 5 }
48
+
49
+ before { subject.ssl_timeout = value }
50
+
51
+ it "should update ssl_timeout" do
52
+ expect(subject.ssl_timeout).to be == value
53
+ end
54
+
55
+ after { subject.ssl_timeout = nil }
56
+ end
57
+
58
+ describe "continue_timeout" do
59
+ context "default value" do
60
+ it { expect(subject.continue_timeout).to be_nil }
61
+ end
62
+ end
63
+
64
+ describe "continue_timeout=" do
65
+ let(:value) { 5 }
66
+
67
+ before { subject.continue_timeout = value }
68
+
69
+ it "should update continue_timeout" do
70
+ expect(subject.continue_timeout).to be == value
71
+ end
72
+
73
+ after { subject.continue_timeout = nil }
74
+ end
75
+
76
+ describe "keep_alive_timeout" do
77
+ context "default value" do
78
+ it { expect(subject.keep_alive_timeout).to be_nil }
79
+ end
80
+ end
81
+
82
+ describe "keep_alive_timeout=" do
83
+ let(:value) { 5 }
84
+
85
+ before { subject.keep_alive_timeout = value }
86
+
87
+ it "should update keep_alive_timeout" do
88
+ expect(subject.keep_alive_timeout).to be == value
89
+ end
90
+
91
+ after { subject.keep_alive_timeout = nil }
92
+ end
93
+ end
@@ -0,0 +1,25 @@
1
+ require 'rspec'
2
+
3
+ shared_examples_for "includes Spidr::Settings::UserAgent" do
4
+ describe "user_agent" do
5
+ context "default value" do
6
+ it { expect(subject.user_agent).to be_nil }
7
+ end
8
+ end
9
+
10
+ describe "user_agent=" do
11
+ let(:user_agent) { 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405' }
12
+
13
+ before do
14
+ subject.user_agent = user_agent
15
+ end
16
+
17
+ it "should update the user_agent" do
18
+ expect(subject.user_agent).to be == user_agent
19
+ end
20
+
21
+ after do
22
+ subject.user_agent = nil
23
+ end
24
+ end
25
+ end
@@ -1,39 +1,16 @@
1
1
  require 'spidr'
2
2
 
3
3
  require 'spec_helper'
4
+ require 'settings/proxy_examples'
5
+ require 'settings/timeouts_examples'
6
+ require 'settings/user_agent_examples'
4
7
 
5
8
  describe Spidr do
6
9
  it "should have a VERSION constant" do
7
10
  expect(subject.const_defined?('VERSION')).to eq(true)
8
11
  end
9
12
 
10
- describe "proxy" do
11
- after(:all) do
12
- Spidr.disable_proxy!
13
- end
14
-
15
- it "should not have proxy settings by default" do
16
- expect(subject.proxy[:host]).to be_nil
17
- end
18
-
19
- it "should allow setting new proxy settings" do
20
- subject.proxy = {host: 'example.com', port: 8010}
21
-
22
- expect(subject.proxy[:host]).to eq('example.com')
23
- expect(subject.proxy[:port]).to eq(8010)
24
- end
25
-
26
- it "should default the :port option of new proxy settings" do
27
- subject.proxy = {host: 'example.com'}
28
-
29
- expect(subject.proxy[:host]).to eq('example.com')
30
- expect(subject.proxy[:port]).to eq(Spidr::COMMON_PROXY_PORT)
31
- end
32
-
33
- it "should allow disabling the proxy" do
34
- subject.disable_proxy!
35
-
36
- expect(subject.proxy[:host]).to be_nil
37
- end
38
- end
13
+ it_should_behave_like "includes Spidr::Settings::Proxy"
14
+ it_should_behave_like "includes Spidr::Settings::Timeouts"
15
+ it_should_behave_like "includes Spidr::Settings::UserAgent"
39
16
  end
@@ -2,130 +2,59 @@
2
2
 
3
3
  require 'yaml'
4
4
 
5
- Gem::Specification.new do |gemspec|
6
- root = File.dirname(__FILE__)
7
- lib_dir = File.join(root,'lib')
8
- files = if File.directory?('.git')
9
- `git ls-files`.split($/)
10
- elsif File.directory?('.hg')
11
- `hg manifest`.split($/)
12
- elsif File.directory?('.svn')
13
- `svn ls -R`.split($/).select { |path| File.file?(path) }
14
- else
15
- Dir['{**/}{.*,*}'].select { |path| File.file?(path) }
16
- end
5
+ Gem::Specification.new do |gem|
6
+ gemspec = YAML.load_file('gemspec.yml')
17
7
 
18
- filter_files = lambda { |paths|
19
- case paths
20
- when Array
21
- (files & paths)
22
- when String
23
- (files & Dir[paths])
24
- end
25
- }
26
-
27
- version = {
28
- :file => 'spidr/version',
29
- :constant => 'Spidr::VERSION'
30
- }
31
-
32
- defaults = {
33
- 'name' => File.basename(root),
34
- 'files' => files,
35
- 'executables' => filter_files['bin/*'].map { |path| File.basename(path) },
36
- 'test_files' => filter_files['{test/{**/}*_test.rb,spec/{**/}*_spec.rb}'],
37
- 'extra_doc_files' => filter_files['*.{txt,rdoc,md,markdown,tt,textile}'],
38
- }
8
+ gem.name = gemspec.fetch('name')
9
+ gem.version = gemspec.fetch('version') do
10
+ lib_dir = File.join(File.dirname(__FILE__),'lib')
11
+ $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
39
12
 
40
- metadata = defaults.merge(YAML.load_file('gemspec.yml'))
13
+ require 'spidr/version'
14
+ Spidr::VERSION
15
+ end
41
16
 
42
- gemspec.name = metadata.fetch('name',defaults[:name])
43
- gemspec.version = if metadata['version']
44
- metadata['version']
45
- else
46
- $LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
17
+ gem.summary = gemspec['summary']
18
+ gem.description = gemspec['description']
19
+ gem.licenses = Array(gemspec['license'])
20
+ gem.authors = Array(gemspec['authors'])
21
+ gem.email = gemspec['email']
22
+ gem.homepage = gemspec['homepage']
47
23
 
48
- require version[:file]
49
- eval(version[:constant])
50
- end
24
+ glob = lambda { |patterns| gem.files & Dir[*patterns] }
51
25
 
52
- gemspec.summary = metadata.fetch('summary',metadata['description'])
53
- gemspec.description = metadata.fetch('description',metadata['summary'])
26
+ gem.files = `git ls-files`.split($/)
27
+ gem.files = glob[gemspec['files']] if gemspec['files']
54
28
 
55
- case metadata['license']
56
- when Array
57
- gemspec.licenses = metadata['license']
58
- when String
59
- gemspec.license = metadata['license']
29
+ gem.executables = gemspec.fetch('executables') do
30
+ glob['bin/*'].map { |path| File.basename(path) }
60
31
  end
32
+ gem.default_executable = gem.executables.first if Gem::VERSION < '1.7.'
61
33
 
62
- case metadata['authors']
63
- when Array
64
- gemspec.authors = metadata['authors']
65
- when String
66
- gemspec.author = metadata['authors']
67
- end
68
-
69
- gemspec.email = metadata['email']
70
- gemspec.homepage = metadata['homepage']
34
+ gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
35
+ gem.test_files = glob[gemspec['test_files'] || '{test/{**/}*_test.rb']
36
+ gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
71
37
 
72
- case metadata['require_paths']
73
- when Array
74
- gemspec.require_paths = metadata['require_paths']
75
- when String
76
- gemspec.require_path = metadata['require_paths']
77
- end
38
+ gem.require_paths = Array(gemspec.fetch('require_paths') {
39
+ %w[ext lib].select { |dir| File.directory?(dir) }
40
+ })
78
41
 
79
- gemspec.files = filter_files[metadata['files']]
42
+ gem.requirements = Array(gemspec['requirements'])
43
+ gem.required_ruby_version = gemspec['required_ruby_version']
44
+ gem.required_rubygems_version = gemspec['required_rubygems_version']
45
+ gem.post_install_message = gemspec['post_install_message']
80
46
 
81
- gemspec.executables = metadata['executables']
82
- gemspec.extensions = metadata['extensions']
83
-
84
- if Gem::VERSION < '1.7.'
85
- gemspec.default_executable = gemspec.executables.first
86
- end
87
-
88
- gemspec.test_files = filter_files[metadata['test_files']]
89
-
90
- unless gemspec.files.include?('.document')
91
- gemspec.extra_rdoc_files = metadata['extra_doc_files']
92
- end
93
-
94
- gemspec.post_install_message = metadata['post_install_message']
95
- gemspec.requirements = metadata['requirements']
96
-
97
- if gemspec.respond_to?(:required_ruby_version=)
98
- gemspec.required_ruby_version = metadata['required_ruby_version']
99
- end
100
-
101
- if gemspec.respond_to?(:required_rubygems_version=)
102
- gemspec.required_rubygems_version = metadata['required_rubygems_version']
103
- end
104
-
105
- parse_versions = lambda { |versions|
106
- case versions
107
- when Array
108
- versions.map { |v| v.to_s }
109
- when String
110
- versions.split(/,\s*/)
111
- end
112
- }
113
-
114
- if metadata['dependencies']
115
- metadata['dependencies'].each do |name,versions|
116
- gemspec.add_dependency(name,parse_versions[versions])
117
- end
118
- end
47
+ split = lambda { |string| string.split(/,\s*/) }
119
48
 
120
- if metadata['runtime_dependencies']
121
- metadata['runtime_dependencies'].each do |name,versions|
122
- gemspec.add_runtime_dependency(name,parse_versions[versions])
49
+ if gemspec['dependencies']
50
+ gemspec['dependencies'].each do |name,versions|
51
+ gem.add_dependency(name,split[versions])
123
52
  end
124
53
  end
125
54
 
126
- if metadata['development_dependencies']
127
- metadata['development_dependencies'].each do |name,versions|
128
- gemspec.add_development_dependency(name,parse_versions[versions])
55
+ if gemspec['development_dependencies']
56
+ gemspec['development_dependencies'].each do |name,versions|
57
+ gem.add_development_dependency(name,split[versions])
129
58
  end
130
59
  end
131
60
  end
metadata CHANGED
@@ -1,41 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-04 00:00:00.000000000 Z
11
+ date: 2016-08-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: '1.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.0'
41
41
  description: Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -49,9 +49,10 @@ extra_rdoc_files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  files:
52
- - ".gitignore"
53
- - ".rspec"
54
- - ".yardopts"
52
+ - .gitignore
53
+ - .rspec
54
+ - .travis.yml
55
+ - .yardopts
55
56
  - ChangeLog.md
56
57
  - Gemfile
57
58
  - LICENSE.txt
@@ -63,6 +64,7 @@ files:
63
64
  - lib/spidr/agent/actions.rb
64
65
  - lib/spidr/agent/events.rb
65
66
  - lib/spidr/agent/filters.rb
67
+ - lib/spidr/agent/robots.rb
66
68
  - lib/spidr/agent/sanitizers.rb
67
69
  - lib/spidr/auth_credential.rb
68
70
  - lib/spidr/auth_store.rb
@@ -70,11 +72,17 @@ files:
70
72
  - lib/spidr/extensions.rb
71
73
  - lib/spidr/extensions/uri.rb
72
74
  - lib/spidr/page.rb
73
- - lib/spidr/page/body.rb
74
- - lib/spidr/page/headers.rb
75
- - lib/spidr/page/links.rb
75
+ - lib/spidr/page/content_types.rb
76
+ - lib/spidr/page/cookies.rb
77
+ - lib/spidr/page/html.rb
78
+ - lib/spidr/page/status_codes.rb
79
+ - lib/spidr/proxy.rb
76
80
  - lib/spidr/rules.rb
77
81
  - lib/spidr/session_cache.rb
82
+ - lib/spidr/settings.rb
83
+ - lib/spidr/settings/proxy.rb
84
+ - lib/spidr/settings/timeouts.rb
85
+ - lib/spidr/settings/user_agent.rb
78
86
  - lib/spidr/spidr.rb
79
87
  - lib/spidr/version.rb
80
88
  - spec/agent/actions_spec.rb
@@ -83,14 +91,20 @@ files:
83
91
  - spec/agent_spec.rb
84
92
  - spec/auth_store_spec.rb
85
93
  - spec/cookie_jar_spec.rb
94
+ - spec/example_app.rb
95
+ - spec/example_page.rb
86
96
  - spec/extensions/uri_spec.rb
87
- - spec/helpers/history.rb
88
- - spec/helpers/page.rb
89
- - spec/helpers/wsoc.rb
90
- - spec/page_examples.rb
97
+ - spec/page/content_types_spec.rb
98
+ - spec/page/cookies_spec.rb
99
+ - spec/page/html_spec.rb
100
+ - spec/page/status_codes_spec.rb
91
101
  - spec/page_spec.rb
102
+ - spec/proxy_spec.rb
92
103
  - spec/rules_spec.rb
93
104
  - spec/session_cache.rb
105
+ - spec/settings/proxy_examples.rb
106
+ - spec/settings/timeouts_examples.rb
107
+ - spec/settings/user_agent_examples.rb
94
108
  - spec/spec_helper.rb
95
109
  - spec/spidr_spec.rb
96
110
  - spidr.gemspec
@@ -104,28 +118,18 @@ require_paths:
104
118
  - lib
105
119
  required_ruby_version: !ruby/object:Gem::Requirement
106
120
  requirements:
107
- - - ">="
121
+ - - '>='
108
122
  - !ruby/object:Gem::Version
109
- version: 1.9.1
123
+ version: 2.0.0
110
124
  required_rubygems_version: !ruby/object:Gem::Requirement
111
125
  requirements:
112
- - - ">="
126
+ - - '>='
113
127
  - !ruby/object:Gem::Version
114
128
  version: '0'
115
129
  requirements: []
116
130
  rubyforge_project:
117
- rubygems_version: 2.4.7
131
+ rubygems_version: 2.0.14.1
118
132
  signing_key:
119
133
  specification_version: 4
120
134
  summary: A versatile Ruby web spidering library
121
- test_files:
122
- - spec/agent/actions_spec.rb
123
- - spec/agent/filters_spec.rb
124
- - spec/agent/sanitizers_spec.rb
125
- - spec/agent_spec.rb
126
- - spec/auth_store_spec.rb
127
- - spec/cookie_jar_spec.rb
128
- - spec/extensions/uri_spec.rb
129
- - spec/page_spec.rb
130
- - spec/rules_spec.rb
131
- - spec/spidr_spec.rb
135
+ test_files: []