spidr 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
data/lib/spidr/page/body.rb
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
|
3
|
-
module Spidr
|
4
|
-
class Page
|
5
|
-
#
|
6
|
-
# The body of the response.
|
7
|
-
#
|
8
|
-
# @return [String]
|
9
|
-
# The body of the response.
|
10
|
-
#
|
11
|
-
def body
|
12
|
-
(response.body || '')
|
13
|
-
end
|
14
|
-
|
15
|
-
#
|
16
|
-
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
|
17
|
-
#
|
18
|
-
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
19
|
-
# The document that represents HTML or XML pages.
|
20
|
-
# Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
|
21
|
-
# the page could not be parsed properly.
|
22
|
-
#
|
23
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
24
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
25
|
-
#
|
26
|
-
def doc
|
27
|
-
unless body.empty?
|
28
|
-
begin
|
29
|
-
if html?
|
30
|
-
@doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
|
31
|
-
elsif (rss? || atom? || xml? || xsl?)
|
32
|
-
@doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
|
33
|
-
end
|
34
|
-
rescue
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
#
|
40
|
-
# Searches the document for XPath or CSS Path paths.
|
41
|
-
#
|
42
|
-
# @param [Array<String>] paths
|
43
|
-
# CSS or XPath expressions to search the document with.
|
44
|
-
#
|
45
|
-
# @return [Array]
|
46
|
-
# The matched nodes from the document.
|
47
|
-
# Returns an empty Array if no nodes were matched, or if the page
|
48
|
-
# is not an HTML or XML document.
|
49
|
-
#
|
50
|
-
# @example
|
51
|
-
# page.search('//a[@href]')
|
52
|
-
#
|
53
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
|
54
|
-
#
|
55
|
-
def search(*paths)
|
56
|
-
if doc
|
57
|
-
doc.search(*paths)
|
58
|
-
else
|
59
|
-
[]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# Searches for the first occurrence an XPath or CSS Path expression.
|
65
|
-
#
|
66
|
-
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
67
|
-
# The first matched node. Returns `nil` if no nodes could be matched,
|
68
|
-
# or if the page is not a HTML or XML document.
|
69
|
-
#
|
70
|
-
# @example
|
71
|
-
# page.at('//title')
|
72
|
-
#
|
73
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
|
74
|
-
#
|
75
|
-
def at(*arguments)
|
76
|
-
if doc
|
77
|
-
doc.at(*arguments)
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
alias / search
|
82
|
-
alias % at
|
83
|
-
|
84
|
-
#
|
85
|
-
# The title of the HTML page.
|
86
|
-
#
|
87
|
-
# @return [String]
|
88
|
-
# The inner-text of the title element of the page.
|
89
|
-
#
|
90
|
-
def title
|
91
|
-
if (node = at('//title'))
|
92
|
-
node.inner_text
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
alias to_s body
|
97
|
-
end
|
98
|
-
end
|
data/spec/helpers/history.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
module Helpers
|
2
|
-
module History
|
3
|
-
def visited_once?(url)
|
4
|
-
return @agent.visited_urls.select { |visited_url|
|
5
|
-
visited_url == url
|
6
|
-
}.length == 1
|
7
|
-
end
|
8
|
-
|
9
|
-
def visited_link?(url)
|
10
|
-
@agent.visited?(url)
|
11
|
-
end
|
12
|
-
|
13
|
-
def visit_failed?(url)
|
14
|
-
@agent.failed?(url)
|
15
|
-
end
|
16
|
-
|
17
|
-
def should_visit_link(url)
|
18
|
-
expect(visited_link?(url)).to eq(true)
|
19
|
-
end
|
20
|
-
|
21
|
-
def should_ignore_link(url)
|
22
|
-
expect(visited_link?(url)).to eq(false)
|
23
|
-
end
|
24
|
-
|
25
|
-
def should_visit_once(url)
|
26
|
-
expect(visited_once?(url)).to eq(true)
|
27
|
-
end
|
28
|
-
|
29
|
-
def should_fail_link(url)
|
30
|
-
expect(visited_link?(url)).to eq(false)
|
31
|
-
expect(visit_failed?(url)).to eq(true)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/spec/helpers/page.rb
DELETED
data/spec/helpers/wsoc.rb
DELETED
@@ -1,83 +0,0 @@
|
|
1
|
-
require 'wsoc/config'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
require 'helpers/history'
|
6
|
-
|
7
|
-
module Helpers
|
8
|
-
module WSOC
|
9
|
-
include History
|
10
|
-
|
11
|
-
SERVER_URL = URI::HTTP.build(
|
12
|
-
host: (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
|
13
|
-
port: (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
|
14
|
-
)
|
15
|
-
|
16
|
-
SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
|
17
|
-
|
18
|
-
COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
|
19
|
-
|
20
|
-
COURSE_METADATA = {}
|
21
|
-
|
22
|
-
def self.included(base)
|
23
|
-
hash = JSON.parse(open(SPECS_URL).read)
|
24
|
-
metadata = hash['metadata']
|
25
|
-
specs = hash['specs']
|
26
|
-
|
27
|
-
if metadata.kind_of?(Hash)
|
28
|
-
COURSE_METADATA.merge!(metadata)
|
29
|
-
end
|
30
|
-
|
31
|
-
if specs.kind_of?(Array)
|
32
|
-
specs.each do |spec|
|
33
|
-
message = spec['message'].dump
|
34
|
-
url = spec['url'].dump
|
35
|
-
|
36
|
-
case spec['behavior']
|
37
|
-
when 'visit'
|
38
|
-
base.module_eval %{
|
39
|
-
it #{message} do
|
40
|
-
should_visit_link(#{url})
|
41
|
-
end
|
42
|
-
}
|
43
|
-
when 'ignore'
|
44
|
-
base.module_eval %{
|
45
|
-
it #{message} do
|
46
|
-
should_ignore_link(#{url})
|
47
|
-
end
|
48
|
-
}
|
49
|
-
when 'fail'
|
50
|
-
base.module_eval %{
|
51
|
-
it #{message} do
|
52
|
-
should_fail_link(#{url})
|
53
|
-
end
|
54
|
-
}
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
def course
|
61
|
-
WSOC::COURSE_METADATA
|
62
|
-
end
|
63
|
-
|
64
|
-
def course_auth_store
|
65
|
-
course['auth_store']
|
66
|
-
end
|
67
|
-
|
68
|
-
def run_course
|
69
|
-
Spidr::Agent.start_at(COURSE_URL) do |agent|
|
70
|
-
course_auth_store.each do |path,auth|
|
71
|
-
agent.authorized.add(
|
72
|
-
COURSE_URL.merge(path),
|
73
|
-
auth['user'],
|
74
|
-
auth['password']
|
75
|
-
)
|
76
|
-
end
|
77
|
-
|
78
|
-
agent.every_failed_url { |url| puts "[FAILED] #{url}" }
|
79
|
-
agent.every_url { |url| puts url }
|
80
|
-
end
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
data/spec/page_examples.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'spidr/page'
|
2
|
-
|
3
|
-
require 'spec_helper'
|
4
|
-
|
5
|
-
shared_examples_for "Page" do
|
6
|
-
it "should have a status code" do
|
7
|
-
expect(@page.code).to be_integer
|
8
|
-
end
|
9
|
-
|
10
|
-
it "should have a body" do
|
11
|
-
expect(@page.body).not_to be_empty
|
12
|
-
end
|
13
|
-
|
14
|
-
it "should provide transparent access to the response headers" do
|
15
|
-
expect(@page.content_type).to eq(@page.response['Content-Type'])
|
16
|
-
end
|
17
|
-
|
18
|
-
it "should allow content-types" do
|
19
|
-
expect(@page.content_types).not_to be_empty
|
20
|
-
end
|
21
|
-
end
|