spidr_epg 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +10 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +291 -0
- data/ChangeLog.md~ +291 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +49 -0
- data/Gemfile~ +16 -0
- data/LICENSE.txt +20 -0
- data/README.md +193 -0
- data/README.md~ +190 -0
- data/Rakefile +29 -0
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +83 -0
- data/lib/spidr/actions/exceptions/action.rb +9 -0
- data/lib/spidr/actions/exceptions/paused.rb +11 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/agent.rb +866 -0
- data/lib/spidr/auth_credential.rb +28 -0
- data/lib/spidr/auth_store.rb +161 -0
- data/lib/spidr/body.rb +98 -0
- data/lib/spidr/cookie_jar.rb +202 -0
- data/lib/spidr/events.rb +537 -0
- data/lib/spidr/extensions/uri.rb +52 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/filters.rb +539 -0
- data/lib/spidr/headers.rb +370 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +108 -0
- data/lib/spidr/rules.rb +79 -0
- data/lib/spidr/sanitizers.rb +56 -0
- data/lib/spidr/session_cache.rb +145 -0
- data/lib/spidr/spidr.rb +107 -0
- data/lib/spidr/version.rb +4 -0
- data/lib/spidr/version.rb~ +4 -0
- data/lib/spidr.rb +3 -0
- data/pkg/spidr-1.0.0.gem +0 -0
- data/spec/actions_spec.rb +59 -0
- data/spec/agent_spec.rb +81 -0
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +144 -0
- data/spec/extensions/uri_spec.rb +43 -0
- data/spec/filters_spec.rb +61 -0
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +21 -0
- data/spec/page_spec.rb +125 -0
- data/spec/rules_spec.rb +45 -0
- data/spec/sanitizers_spec.rb +61 -0
- data/spec/session_cache.rb +58 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spidr_spec.rb +39 -0
- data/spidr.gemspec +133 -0
- data/spidr.gemspec~ +131 -0
- metadata +158 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
module Spidr
  #
  # A username/password pair used for HTTP Authentication against a website.
  #
  class AuthCredential

    # The username and the password held by this credential (read-only).
    attr_reader :username, :password

    #
    # Builds a credential from its two parts.
    #
    # @param [String] username
    #   The username for the credential.
    #
    # @param [String] password
    #   The password for the credential.
    #
    def initialize(username,password)
      @username, @password = username, password
    end

  end
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'spidrs/extensions/uri'
|
2
|
+
require 'spidrs/auth_credential'
|
3
|
+
require 'spidrs/page'
|
4
|
+
|
5
|
+
require 'base64'
|
6
|
+
|
7
|
+
module Spidr
  #
  # Stores {AuthCredential} objects organized by a website's scheme,
  # host-name, port and sub-directory.
  #
  # Internally the store is a two-level Hash:
  # `[scheme, host, port] => { path_segments => credential }`, where
  # `path_segments` is the normalized URL path split on `/`.
  #
  class AuthStore

    #
    # Creates a new, empty auth store.
    #
    # @since 0.2.2
    #
    def initialize
      @credentials = {}
    end

    #
    # Given a URL, return the most specific matching auth credential.
    #
    # A stored entry matches when its path segments are a prefix of the
    # URL's path segments; among all matches the one with the longest
    # path wins.
    #
    # @param [URI, #to_s] url
    #   A fully qualified url including optional path.
    #
    # @return [AuthCredential, nil]
    #   Closest matching {AuthCredential} for the URL,
    #   or `nil` if nothing matches.
    #
    # @since 0.2.2
    #
    def [](url)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      paths = @credentials[[url.scheme, url.host, url.port]]
      return nil unless paths

      # directories of the requested path
      path_dirs = URI.expand_path(url.path).split('/')

      # Two distinct stored paths of equal length cannot both be a prefix
      # of the same request path, so the longest match is unambiguous.
      matching = paths.keys.select { |dirs| path_dirs[0,dirs.length] == dirs }
      best     = matching.max_by { |dirs| dirs.length }

      best ? paths[best] : nil
    end

    #
    # Add an auth credential to the store for the supplied base URL.
    #
    # @param [URI, #to_s] url
    #   A URL pattern to associate with a set of auth credentials.
    #
    # @param [AuthCredential] auth
    #   The auth credential for this URL pattern.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def []=(url,auth)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      # normalize the URL path and break it into directories
      path_dirs = URI.expand_path(url.path).split('/')

      site = [url.scheme, url.host, url.port]

      (@credentials[site] ||= {})[path_dirs] = auth
      return auth
    end

    #
    # Convenience method to add username and password credentials
    # for a named URL.
    #
    # @param [URI, #to_s] url
    #   The base URL that requires authorization.
    #
    # @param [String] username
    #   The username required to access the URL.
    #
    # @param [String] password
    #   The password required to access the URL.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def add(url,username,password)
      self[url] = AuthCredential.new(username,password)
    end

    #
    # Returns the base64 encoded authorization string for the URL
    # or `nil` if no authorization exists.
    #
    # The value is encoded with `Base64.strict_encode64`, so it never
    # contains line feeds (`Base64.encode64` would append one, and insert
    # one after every 60 encoded characters, which is not valid inside
    # an HTTP header value).
    #
    # @param [URI, #to_s] url
    #   The url.
    #
    # @return [String, nil]
    #   The base64 encoded authorization string or `nil`.
    #
    # @since 0.2.2
    #
    def for_url(url)
      auth = self[url]
      return nil unless auth

      Base64.strict_encode64("#{auth.username}:#{auth.password}")
    end

    #
    # Clear the contents of the auth store.
    #
    # @return [AuthStore]
    #   The cleared auth store.
    #
    # @since 0.2.2
    #
    def clear!
      @credentials.clear
      return self
    end

    #
    # Size of the current auth store (number of URL paths stored,
    # summed over every scheme/host/port combination).
    #
    # @return [Integer]
    #   The size of the auth store.
    #
    # @since 0.2.2
    #
    def size
      @credentials.values.inject(0) { |total,paths| total + paths.length }
    end

    #
    # Inspects the auth store.
    #
    # @return [String]
    #   The inspected version of the auth store.
    #
    def inspect
      "#<#{self.class}: #{@credentials.inspect}>"
    end

  end
end
|
data/lib/spidr/body.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Spidr
  #
  # Mixin providing access to a response body and its parsed document.
  #
  # The including object is expected to supply `response`, the content-type
  # predicates used below (`html?`, `rss?`, `atom?`, `xml?`, `xsl?`),
  # `content_charset` and an `@url` instance variable.
  #
  module Body
    #
    # The raw body of the response.
    #
    # @return [String]
    #   The response body, or an empty String when the response has none.
    #
    def body
      response.body || ''
    end

    #
    # Lazily parses the body into a document for HTML, XML, RSS and Atom
    # pages. The parsed document is memoized in `@doc`.
    #
    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
    #   The parsed document. Returns `nil` when the body is empty, when the
    #   page is not HTML, XML, XSL, RSS or Atom, or when parsing raised.
    #
    def doc
      return nil if body.empty?

      begin
        if html?
          @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
        elsif rss? || atom? || xml? || xsl?
          @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
        end
      rescue StandardError
        # best-effort: a page that cannot be parsed simply has no document
        nil
      end
    end

    #
    # Searches the document for XPath or CSS Path paths.
    #
    # @param [Array<String>] paths
    #   CSS or XPath expressions to search the document with.
    #
    # @return [Array]
    #   The matched nodes from the document, or an empty Array when there
    #   is no parsed document.
    #
    # @example
    #   page.search('//a[@href]')
    #
    def search(*paths)
      doc ? doc.search(*paths) : []
    end

    #
    # Finds the first node matching an XPath or CSS Path expression.
    #
    # @param [Array] arguments
    #   Arguments forwarded unchanged to the document's `at` method.
    #
    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
    #   The first matched node, or `nil` when nothing matched or there is
    #   no parsed document.
    #
    # @example
    #   page.at('//title')
    #
    def at(*arguments)
      doc.at(*arguments) if doc
    end

    alias / search
    alias % at

    #
    # The title of the HTML page.
    #
    # @return [String, nil]
    #   The inner-text of the title element of the page, or `nil` when no
    #   title element could be found.
    #
    def title
      node = at('//title')

      node.inner_text if node
    end

    alias to_s body
  end
end
|
@@ -0,0 +1,202 @@
|
|
1
|
+
require 'spidrs/page'
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module Spidr
  #
  # Stores HTTP Cookies organized by host-name.
  #
  # Three pieces of state are kept:
  #
  # * `@params`  - host-name => Hash of cookie name/value pairs.
  # * `@cookies` - host-name => cached, pre-encoded `Cookie` header value.
  # * `@dirty`   - host-names whose cached header value must be rebuilt.
  #
  class CookieJar

    include Enumerable

    #
    # Creates a new Cookie Jar object.
    #
    # @since 0.2.2
    #
    def initialize
      @params = {}

      @dirty   = Set[]
      @cookies = {}
    end

    #
    # Enumerates over the host-name and cookie params pairs in the
    # cookie jar.
    #
    # @yield [host, cookie]
    #   If a block is given, it will be passed each host-name and its
    #   cookie params.
    #
    # @yieldparam [String] host
    #   The host-name that the cookies are bound to.
    #
    # @yieldparam [Hash{String => String}] cookie
    #   The cookie params stored for that host.
    #
    # @since 0.2.2
    #
    def each(&block)
      @params.each(&block)
    end

    #
    # Returns the cookie params stored for the named host or domain.
    #
    # Note that looking up a host that has no cookies yet stores (and
    # returns) a new empty Hash for it, so the host will subsequently be
    # counted by {#size} and yielded by {#each}.
    #
    # @param [String] host
    #   Host or domain name for cookies.
    #
    # @return [Hash{String => String}]
    #   The cookie params stored for the host (possibly empty).
    #
    # @since 0.2.2
    #
    def [](host)
      @params[host] ||= {}
    end

    #
    # Add cookies to the jar for a particular domain.
    #
    # The given params are merged into the ones already stored for the
    # host. The host's cached header value is only invalidated when at
    # least one name/value pair actually differs from what is stored.
    #
    # @param [String] host
    #   Host or domain name to associate with the cookie.
    #
    # @param [Hash{String => String}] cookies
    #   Cookie params.
    #
    # @since 0.2.2
    #
    def []=(host,cookies)
      collected = self[host]

      if cookies.any? { |name,value| collected[name] != value }
        collected.merge!(cookies)
        @dirty << host
      end

      return cookies
    end

    #
    # Retrieve cookies for a domain from a page response header.
    #
    # @param [Page] page
    #   The response page from which to extract cookie data.
    #
    # @return [Boolean]
    #   Specifies whether cookies were added from the page.
    #
    # @since 0.2.2
    #
    def from_page(page)
      cookies = page.cookie_params
      return false if cookies.empty?

      self[page.url.host] = cookies
      return true
    end

    #
    # Returns the pre-encoded Cookie for a given host.
    #
    # The encoded value is cached per host and is rebuilt only when
    # cookies have been added for that exact host since the last call.
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [String, nil]
    #   The encoded Cookie (`name=value` pairs joined by `; `), or `nil`
    #   if no value has ever been built for the host.
    #
    # @since 0.2.2
    #
    def for_host(host)
      if @dirty.include?(host)
        pairs = cookies_for_host(host).map { |name,value| "#{name}=#{value}" }

        @cookies[host] = pairs.join('; ')
        @dirty.delete(host)
      end

      return @cookies[host]
    end

    #
    # Returns raw cookie value pairs for a given host. Includes cookies set on
    # parent domain(s); a cookie set directly on the host (or on a closer
    # parent) takes precedence over one with the same name further up.
    #
    # The returned Hash is a fresh copy: the cookies stored for the host
    # are never modified by this method.
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [Hash{String => String}]
    #   Cookie params.
    #
    # @since 0.2.7
    #
    def cookies_for_host(host)
      # Work on a copy, otherwise parent-domain cookies would be written
      # into the host's own stored params and outlive later changes made
      # to the parent domain.
      host_cookies = (@params[host] || {}).dup
      sub_domains  = host.split('.')

      while sub_domains.length > 2
        sub_domains.shift

        if (parent_cookies = @params[sub_domains.join('.')])
          # copy in the parent cookies, only if they haven't been
          # overridden yet.
          host_cookies.merge!(parent_cookies) { |_name,own,_inherited| own }
        end
      end

      return host_cookies
    end

    #
    # Clear out the jar, removing all stored cookies.
    #
    # @return [CookieJar]
    #   The cleared cookie jar.
    #
    # @since 0.2.2
    #
    def clear!
      @params.clear

      @dirty.clear
      @cookies.clear
      return self
    end

    #
    # Size of the current cookie jar store.
    #
    # @return [Integer]
    #   The number of host-names that have an entry in the jar.
    #
    # @since 0.2.2
    #
    def size
      @params.size
    end

    #
    # Inspects the cookie jar.
    #
    # @return [String]
    #   The inspected version of the cookie jar.
    #
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end

  end
end
|