spidr 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
data/lib/spidr/page/body.rb
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
|
3
|
-
module Spidr
|
4
|
-
class Page
|
5
|
-
#
|
6
|
-
# The body of the response.
|
7
|
-
#
|
8
|
-
# @return [String]
|
9
|
-
# The body of the response.
|
10
|
-
#
|
11
|
-
def body
|
12
|
-
(response.body || '')
|
13
|
-
end
|
14
|
-
|
15
|
-
#
|
16
|
-
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
|
17
|
-
#
|
18
|
-
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
19
|
-
# The document that represents HTML or XML pages.
|
20
|
-
# Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
|
21
|
-
# the page could not be parsed properly.
|
22
|
-
#
|
23
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
24
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
25
|
-
#
|
26
|
-
def doc
|
27
|
-
unless body.empty?
|
28
|
-
begin
|
29
|
-
if html?
|
30
|
-
@doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
|
31
|
-
elsif (rss? || atom? || xml? || xsl?)
|
32
|
-
@doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
|
33
|
-
end
|
34
|
-
rescue
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
#
|
40
|
-
# Searches the document for XPath or CSS Path paths.
|
41
|
-
#
|
42
|
-
# @param [Array<String>] paths
|
43
|
-
# CSS or XPath expressions to search the document with.
|
44
|
-
#
|
45
|
-
# @return [Array]
|
46
|
-
# The matched nodes from the document.
|
47
|
-
# Returns an empty Array if no nodes were matched, or if the page
|
48
|
-
# is not an HTML or XML document.
|
49
|
-
#
|
50
|
-
# @example
|
51
|
-
# page.search('//a[@href]')
|
52
|
-
#
|
53
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
|
54
|
-
#
|
55
|
-
def search(*paths)
|
56
|
-
if doc
|
57
|
-
doc.search(*paths)
|
58
|
-
else
|
59
|
-
[]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# Searches for the first occurrence an XPath or CSS Path expression.
|
65
|
-
#
|
66
|
-
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
67
|
-
# The first matched node. Returns `nil` if no nodes could be matched,
|
68
|
-
# or if the page is not a HTML or XML document.
|
69
|
-
#
|
70
|
-
# @example
|
71
|
-
# page.at('//title')
|
72
|
-
#
|
73
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
|
74
|
-
#
|
75
|
-
def at(*arguments)
|
76
|
-
if doc
|
77
|
-
doc.at(*arguments)
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
alias / search
|
82
|
-
alias % at
|
83
|
-
|
84
|
-
#
|
85
|
-
# The title of the HTML page.
|
86
|
-
#
|
87
|
-
# @return [String]
|
88
|
-
# The inner-text of the title element of the page.
|
89
|
-
#
|
90
|
-
def title
|
91
|
-
if (node = at('//title'))
|
92
|
-
node.inner_text
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
alias to_s body
|
97
|
-
end
|
98
|
-
end
|
data/spec/helpers/history.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
module Helpers
|
2
|
-
module History
|
3
|
-
def visited_once?(url)
|
4
|
-
return @agent.visited_urls.select { |visited_url|
|
5
|
-
visited_url == url
|
6
|
-
}.length == 1
|
7
|
-
end
|
8
|
-
|
9
|
-
def visited_link?(url)
|
10
|
-
@agent.visited?(url)
|
11
|
-
end
|
12
|
-
|
13
|
-
def visit_failed?(url)
|
14
|
-
@agent.failed?(url)
|
15
|
-
end
|
16
|
-
|
17
|
-
def should_visit_link(url)
|
18
|
-
expect(visited_link?(url)).to eq(true)
|
19
|
-
end
|
20
|
-
|
21
|
-
def should_ignore_link(url)
|
22
|
-
expect(visited_link?(url)).to eq(false)
|
23
|
-
end
|
24
|
-
|
25
|
-
def should_visit_once(url)
|
26
|
-
expect(visited_once?(url)).to eq(true)
|
27
|
-
end
|
28
|
-
|
29
|
-
def should_fail_link(url)
|
30
|
-
expect(visited_link?(url)).to eq(false)
|
31
|
-
expect(visit_failed?(url)).to eq(true)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/spec/helpers/page.rb
DELETED
data/spec/helpers/wsoc.rb
DELETED
@@ -1,83 +0,0 @@
|
|
1
|
-
require 'wsoc/config'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
require 'helpers/history'
|
6
|
-
|
7
|
-
module Helpers
|
8
|
-
module WSOC
|
9
|
-
include History
|
10
|
-
|
11
|
-
SERVER_URL = URI::HTTP.build(
|
12
|
-
host: (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
|
13
|
-
port: (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
|
14
|
-
)
|
15
|
-
|
16
|
-
SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
|
17
|
-
|
18
|
-
COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
|
19
|
-
|
20
|
-
COURSE_METADATA = {}
|
21
|
-
|
22
|
-
def self.included(base)
|
23
|
-
hash = JSON.parse(open(SPECS_URL).read)
|
24
|
-
metadata = hash['metadata']
|
25
|
-
specs = hash['specs']
|
26
|
-
|
27
|
-
if metadata.kind_of?(Hash)
|
28
|
-
COURSE_METADATA.merge!(metadata)
|
29
|
-
end
|
30
|
-
|
31
|
-
if specs.kind_of?(Array)
|
32
|
-
specs.each do |spec|
|
33
|
-
message = spec['message'].dump
|
34
|
-
url = spec['url'].dump
|
35
|
-
|
36
|
-
case spec['behavior']
|
37
|
-
when 'visit'
|
38
|
-
base.module_eval %{
|
39
|
-
it #{message} do
|
40
|
-
should_visit_link(#{url})
|
41
|
-
end
|
42
|
-
}
|
43
|
-
when 'ignore'
|
44
|
-
base.module_eval %{
|
45
|
-
it #{message} do
|
46
|
-
should_ignore_link(#{url})
|
47
|
-
end
|
48
|
-
}
|
49
|
-
when 'fail'
|
50
|
-
base.module_eval %{
|
51
|
-
it #{message} do
|
52
|
-
should_fail_link(#{url})
|
53
|
-
end
|
54
|
-
}
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
def course
|
61
|
-
WSOC::COURSE_METADATA
|
62
|
-
end
|
63
|
-
|
64
|
-
def course_auth_store
|
65
|
-
course['auth_store']
|
66
|
-
end
|
67
|
-
|
68
|
-
def run_course
|
69
|
-
Spidr::Agent.start_at(COURSE_URL) do |agent|
|
70
|
-
course_auth_store.each do |path,auth|
|
71
|
-
agent.authorized.add(
|
72
|
-
COURSE_URL.merge(path),
|
73
|
-
auth['user'],
|
74
|
-
auth['password']
|
75
|
-
)
|
76
|
-
end
|
77
|
-
|
78
|
-
agent.every_failed_url { |url| puts "[FAILED] #{url}" }
|
79
|
-
agent.every_url { |url| puts url }
|
80
|
-
end
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
data/spec/page_examples.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'spidr/page'
|
2
|
-
|
3
|
-
require 'spec_helper'
|
4
|
-
|
5
|
-
shared_examples_for "Page" do
|
6
|
-
it "should have a status code" do
|
7
|
-
expect(@page.code).to be_integer
|
8
|
-
end
|
9
|
-
|
10
|
-
it "should have a body" do
|
11
|
-
expect(@page.body).not_to be_empty
|
12
|
-
end
|
13
|
-
|
14
|
-
it "should provide transparent access to the response headers" do
|
15
|
-
expect(@page.content_type).to eq(@page.response['Content-Type'])
|
16
|
-
end
|
17
|
-
|
18
|
-
it "should allow content-types" do
|
19
|
-
expect(@page.content_types).not_to be_empty
|
20
|
-
end
|
21
|
-
end
|