spidr_epg 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +10 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +291 -0
- data/ChangeLog.md~ +291 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +49 -0
- data/Gemfile~ +16 -0
- data/LICENSE.txt +20 -0
- data/README.md +193 -0
- data/README.md~ +190 -0
- data/Rakefile +29 -0
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +83 -0
- data/lib/spidr/actions/exceptions/action.rb +9 -0
- data/lib/spidr/actions/exceptions/paused.rb +11 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/agent.rb +866 -0
- data/lib/spidr/auth_credential.rb +28 -0
- data/lib/spidr/auth_store.rb +161 -0
- data/lib/spidr/body.rb +98 -0
- data/lib/spidr/cookie_jar.rb +202 -0
- data/lib/spidr/events.rb +537 -0
- data/lib/spidr/extensions/uri.rb +52 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/filters.rb +539 -0
- data/lib/spidr/headers.rb +370 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +108 -0
- data/lib/spidr/rules.rb +79 -0
- data/lib/spidr/sanitizers.rb +56 -0
- data/lib/spidr/session_cache.rb +145 -0
- data/lib/spidr/spidr.rb +107 -0
- data/lib/spidr/version.rb +4 -0
- data/lib/spidr/version.rb~ +4 -0
- data/lib/spidr.rb +3 -0
- data/pkg/spidr-1.0.0.gem +0 -0
- data/spec/actions_spec.rb +59 -0
- data/spec/agent_spec.rb +81 -0
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +144 -0
- data/spec/extensions/uri_spec.rb +43 -0
- data/spec/filters_spec.rb +61 -0
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +21 -0
- data/spec/page_spec.rb +125 -0
- data/spec/rules_spec.rb +45 -0
- data/spec/sanitizers_spec.rb +61 -0
- data/spec/session_cache.rb +58 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spidr_spec.rb +39 -0
- data/spidr.gemspec +133 -0
- data/spidr.gemspec~ +131 -0
- metadata +158 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
module Spidr
  #
  # Holds the username / password pair used for HTTP Authentication
  # against a website.
  #
  class AuthCredential

    # The username and the password of the credential (read-only)
    attr_reader :username, :password

    #
    # Builds a new credential from a username and a password.
    #
    # @param [String] username
    #   The username for the credential.
    #
    # @param [String] password
    #   The password for the credential.
    #
    def initialize(username,password)
      @username, @password = username, password
    end

  end
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'spidrs/extensions/uri'
|
2
|
+
require 'spidrs/auth_credential'
|
3
|
+
require 'spidrs/page'
|
4
|
+
|
5
|
+
require 'base64'
|
6
|
+
|
7
|
+
module Spidr
  #
  # Stores {AuthCredential} objects organized by a website's scheme,
  # host-name, port and sub-directory.
  #
  # Internally the store is a Hash keyed by `[scheme, host, port]`; each
  # value is another Hash mapping a path (split on `/` into an Array of
  # directory names) to the credential registered for that path.
  #
  class AuthStore

    #
    # Creates a new, empty auth store.
    #
    # @since 0.2.2
    #
    def initialize
      @credentials = {}
    end

    #
    # Given a URL, return the most specific matching auth credential.
    #
    # @param [URI, #to_s] url
    #   A fully qualified url including optional path. Anything that is
    #   not already a URI is converted with `URI(url.to_s)`.
    #
    # @return [AuthCredential, nil]
    #   Closest matching {AuthCredential} values for the URL,
    #   or `nil` if nothing matches.
    #
    # @since 0.2.2
    #
    def [](url)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      paths = @credentials[[url.scheme, url.host, url.port]]
      return nil unless paths

      # directories of the requested path
      path_dirs = URI.expand_path(url.path).split('/')

      # try the longest (most specific) registered path first; a registered
      # path matches when it is a prefix of the requested directories
      match = paths.keys.sort_by { |dirs| -dirs.length }.find do |dirs|
        path_dirs[0,dirs.length] == dirs
      end

      match && paths[match]
    end

    #
    # Add an auth credential to the store for supplied base URL.
    #
    # @param [URI, #to_s] url
    #   A URL pattern to associate with a set of auth credentials.
    #
    # @param [AuthCredential] auth
    #   The auth credential for this URL pattern.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def []=(url,auth)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      # normalize the URL path
      path = URI.expand_path(url.path)

      key = [url.scheme, url.host, url.port]

      @credentials[key] ||= {}
      @credentials[key][path.split('/')] = auth
      return auth
    end

    #
    # Convenience method to add username and password credentials
    # for a named URL.
    #
    # @param [URI, #to_s] url
    #   The base URL that requires authorization.
    #
    # @param [String] username
    #   The username required to access the URL.
    #
    # @param [String] password
    #   The password required to access the URL.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def add(url,username,password)
      self[url] = AuthCredential.new(username,password)
    end

    #
    # Returns the base64 encoded authorization string for the URL
    # or `nil` if no authorization exists.
    #
    # @param [URI, #to_s] url
    #   The url.
    #
    # @return [String, nil]
    #   The base64 encoded `username:password` string on a single line
    #   (no line feeds), or `nil` when no credential matches the URL.
    #
    # @since 0.2.2
    #
    def for_url(url)
      if (auth = self[url])
        # Base64.encode64 wraps its output every 60 characters and appends
        # a trailing "\n"; line feeds are not valid inside a HTTP header
        # value, so they are removed here.
        Base64.encode64("#{auth.username}:#{auth.password}").delete("\n")
      end
    end

    #
    # Clear the contents of the auth store.
    #
    # @return [AuthStore]
    #   The cleared auth store.
    #
    # @since 0.2.2
    #
    def clear!
      @credentials.clear
      return self
    end

    #
    # Size of the current auth store (number of URL paths stored).
    #
    # @return [Integer]
    #   The total number of paths stored across every scheme/host/port.
    #
    # @since 0.2.2
    #
    def size
      @credentials.values.inject(0) { |total,paths| total + paths.length }
    end

    #
    # Inspects the auth store.
    #
    # @return [String]
    #   The inspected version of the auth store.
    #
    def inspect
      "#<#{self.class}: #{@credentials.inspect}>"
    end

  end
end
|
data/lib/spidr/body.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Spidr
  #
  # Mixin exposing the response body and its parsed document.
  #
  # The methods below call `response`, `content_charset`, the `html?`,
  # `rss?`, `atom?`, `xml?` and `xsl?` predicates and read `@url`; none of
  # them are defined here, so the including class has to supply them.
  #
  module Body
    #
    # The raw body of the response.
    #
    # @return [String]
    #   The response body, or an empty String when the response has none.
    #
    def body
      content = response.body

      content ? content : ''
    end

    #
    # Returns a parsed document object for HTML, XML, RSS and Atom pages.
    # The parsed document is memoized in `@doc`.
    #
    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
    #   The document that represents HTML or XML pages.
    #   Returns `nil` if the body is empty, if the page is neither HTML,
    #   XML, XSL, RSS nor Atom, or if parsing raised an error.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
    #
    def doc
      return nil if body.empty?

      begin
        if html?
          @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
        elsif rss? || atom? || xml? || xsl?
          @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
        end
      rescue
        # best-effort: any error while parsing yields no document
        nil
      end
    end

    #
    # Searches the document for XPath or CSS Path paths.
    #
    # @param [Array<String>] paths
    #   CSS or XPath expressions to search the document with.
    #
    # @return [Array]
    #   The matched nodes from the document.
    #   Returns an empty Array if no document is available.
    #
    # @example
    #   page.search('//a[@href]')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
    #
    def search(*paths)
      document = doc
      return [] unless document

      document.search(*paths)
    end

    #
    # Searches for the first occurrence an XPath or CSS Path expression.
    #
    # @param [Array] arguments
    #   The expressions, forwarded untouched to the document's `at`.
    #
    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
    #   The first matched node. Returns `nil` if no nodes could be matched,
    #   or if no document is available.
    #
    # @example
    #   page.at('//title')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
    #
    def at(*arguments)
      document = doc

      document.at(*arguments) if document
    end

    alias_method :/, :search
    alias_method :%, :at

    #
    # The title of the HTML page.
    #
    # @return [String, nil]
    #   The inner-text of the title element of the page, or `nil` when
    #   no title element could be found.
    #
    def title
      node = at('//title')

      node.inner_text if node
    end

    alias_method :to_s, :body
  end
end
|
@@ -0,0 +1,202 @@
|
|
1
|
+
require 'spidrs/page'
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module Spidr
  #
  # Stores HTTP Cookies organized by host-name.
  #
  # Three pieces of state are kept:
  #
  # * `@params`  - host-name => Hash of cookie name/value pairs.
  # * `@cookies` - host-name => cached, pre-encoded Cookie string.
  # * `@dirty`   - host-names whose cached Cookie string must be rebuilt.
  #
  class CookieJar

    include Enumerable

    #
    # Creates a new Cookie Jar object.
    #
    # @since 0.2.2
    #
    def initialize
      @params = {}

      @dirty   = Set[]
      @cookies = {}
    end

    #
    # Enumerates over the host-name and cookie params pairs in the
    # cookie jar.
    #
    # @yield [host, cookies]
    #   If a block is given, it will be passed each host-name and the
    #   cookie params stored for it.
    #
    # @yieldparam [String] host
    #   The host-name that the cookies are bound to.
    #
    # @yieldparam [Hash{String => String}] cookies
    #   The cookie params of the host.
    #
    # @since 0.2.2
    #
    def each(&block)
      @params.each(&block)
    end

    #
    # Returns the cookie params stored for the named host or domain.
    #
    # @param [String] host
    #   Host or domain name for cookies.
    #
    # @return [Hash{String => String}]
    #   The cookie params of the host. An empty Hash is created and stored
    #   when the host is not yet known, so merely reading an unknown host
    #   adds it to the jar.
    #
    # @since 0.2.2
    #
    def [](host)
      @params[host] ||= {}
    end

    #
    # Add a cookie to the jar for a particular domain.
    #
    # @param [String] host
    #   Host or domain name to associate with the cookie.
    #
    # @param [Hash{String => String}] cookies
    #   Cookie params.
    #
    # @return [Hash{String => String}]
    #   The given cookie params.
    #
    # @since 0.2.2
    #
    def []=(host,cookies)
      collected = self[host]

      # only touch the jar when at least one value actually changes
      if cookies.any? { |name,value| collected[name] != value }
        collected.merge!(cookies)
        @dirty << host

        # cached Cookie strings of sub-domains embed the cookies of this
        # host, so they have to be rebuilt as well
        suffix = ".#{host}"
        @cookies.delete_if { |cached_host,_| cached_host.end_with?(suffix) }
      end

      return cookies
    end

    #
    # Retrieve cookies for a domain from a page response header.
    #
    # @param [Page] page
    #   The response page from which to extract cookie data.
    #
    # @return [Boolean]
    #   Specifies whether cookies were added from the page.
    #
    # @since 0.2.2
    #
    def from_page(page)
      cookies = page.cookie_params

      unless cookies.empty?
        self[page.url.host] = cookies
        return true
      end

      return false
    end

    #
    # Returns the pre-encoded Cookie for a given host, including the
    # cookies inherited from its parent domain(s).
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [String, nil]
    #   The encoded Cookie (`name=value` pairs joined by `; `), or `nil`
    #   if neither the host nor its parent domains have any cookies.
    #
    # @since 0.2.2
    #
    def for_host(host)
      # serve the cached string unless it is missing or out of date
      if @cookies.has_key?(host) && !@dirty.include?(host)
        return @cookies[host]
      end

      values = cookies_for_host(host).map { |name,value| "#{name}=#{value}" }
      @dirty.delete(host)

      if values.empty?
        # nothing to send; do not cache, so unknown hosts cost no memory
        @cookies.delete(host)
        return nil
      end

      @cookies[host] = values.join('; ')
    end

    #
    # Returns raw cookie value pairs for a given host. Includes cookies set on
    # parent domain(s).
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [Hash{String => String}]
    #   Cookie params. The Hash is a fresh copy; changing it does not
    #   change the jar.
    #
    # @since 0.2.7
    #
    def cookies_for_host(host)
      # work on a copy, otherwise the parent cookies merged below would be
      # written back into the params stored for the host itself
      host_cookies = (@params[host] || {}).dup
      sub_domains  = host.split('.')

      # walk up the parent domains, stopping at two labels
      while sub_domains.length > 2
        sub_domains.shift

        if (parent_cookies = @params[sub_domains.join('.')])
          parent_cookies.each do |name,value|
            # copy in the parent cookies, only if they haven't been
            # overridden yet.
            host_cookies[name] = value unless host_cookies.has_key?(name)
          end
        end
      end

      return host_cookies
    end

    #
    # Clear out the jar, removing all stored cookies.
    #
    # @return [CookieJar]
    #   The cleared cookie jar.
    #
    # @since 0.2.2
    #
    def clear!
      @params.clear

      @dirty.clear
      @cookies.clear
      return self
    end

    #
    # Size of the current cookie jar store.
    #
    # @return [Integer]
    #   The number of host-names stored in the jar.
    #
    # @since 0.2.2
    #
    def size
      @params.size
    end

    #
    # Inspects the cookie jar.
    #
    # @return [String]
    #   The inspected version of the cookie jar.
    #
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end

  end
end
|