spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,28 @@
1
module Spidr
  #
  # Represents HTTP Authentication credentials for a website.
  #
  # Instances are simple read-only value holders: both fields are set once
  # in the constructor and exposed through readers only.
  #
  class AuthCredential

    # The username
    #
    # @return [String]
    attr_reader :username

    # The password
    #
    # @return [String]
    attr_reader :password

    #
    # Creates a new credential used for authentication.
    #
    # @param [String] username
    #   The username for the credential.
    #
    # @param [String] password
    #   The password for the credential.
    #
    def initialize(username,password)
      @username = username
      @password = password
    end

  end
end
@@ -0,0 +1,161 @@
1
+ require 'spidrs/extensions/uri'
2
+ require 'spidrs/auth_credential'
3
+ require 'spidrs/page'
4
+
5
+ require 'base64'
6
+
7
module Spidr
  #
  # Stores {AuthCredential} objects organized by a website's scheme,
  # host-name, port and sub-directory.
  #
  # Internally the store is a two-level Hash:
  # `[scheme, host, port] => { path_directories => credential }`, where
  # `path_directories` is the expanded URL path split on `/`.
  #
  class AuthStore

    #
    # Creates a new auth store.
    #
    # @since 0.2.2
    #
    def initialize
      @credentials = {}
    end

    #
    # Given a URL, return the most specific matching auth credential.
    #
    # A stored path matches when its directories are a leading prefix of the
    # directories of the requested URL; among all matches the longest
    # stored path wins.
    #
    # @param [URI, #to_s] url
    #   A fully qualified url including optional path.
    #
    # @return [AuthCredential, nil]
    #   Closest matching {AuthCredential} values for the URL,
    #   or `nil` if nothing matches.
    #
    # @since 0.2.2
    #
    def [](url)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      paths = @credentials[[url.scheme, url.host, url.port]]
      return nil unless paths

      # directories of the requested path
      path_dirs = URI.expand_path(url.path).split('/')

      # every stored path that is a prefix of the requested path ...
      candidates = paths.keys.select do |dirs|
        path_dirs[0,dirs.length] == dirs
      end

      # ... of which the longest one is the most specific
      best = candidates.max_by { |dirs| dirs.length }

      return (best ? paths[best] : nil)
    end

    #
    # Add an auth credential to the store for supplied base URL.
    #
    # @param [URI, #to_s] url
    #   A URL pattern to associate with a set of auth credentials.
    #
    # @param [AuthCredential] auth
    #   The auth credential for this URL pattern.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def []=(url,auth)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      # normalize the URL path and break it into directories
      dirs = URI.expand_path(url.path).split('/')
      key  = [url.scheme, url.host, url.port]

      (@credentials[key] ||= {})[dirs] = auth
      return auth
    end

    #
    # Convenience method to add username and password credentials
    # for a named URL.
    #
    # @param [URI, #to_s] url
    #   The base URL that requires authorization.
    #
    # @param [String] username
    #   The username required to access the URL.
    #
    # @param [String] password
    #   The password required to access the URL.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def add(url,username,password)
      self[url] = AuthCredential.new(username,password)
    end

    #
    # Returns the base64 encoded authorization string for the URL
    # or `nil` if no authorization exists.
    #
    # The string is encoded without any line breaks, so it can be embedded
    # directly into a header value regardless of credential length.
    #
    # @param [URI, #to_s] url
    #   The url.
    #
    # @return [String, nil]
    #   The base64 encoded authorization string or `nil`.
    #
    # @since 0.2.2
    #
    def for_url(url)
      auth = self[url]
      return nil unless auth

      # strict_encode64 never emits "\n"; encode64 would add a trailing
      # newline and wrap the output every 60 characters.
      return Base64.strict_encode64("#{auth.username}:#{auth.password}")
    end

    #
    # Clear the contents of the auth store.
    #
    # @return [AuthStore]
    #   The cleared auth store.
    #
    # @since 0.2.2
    #
    def clear!
      @credentials.clear
      return self
    end

    #
    # Size of the current auth store (number of URL paths stored).
    #
    # @return [Integer]
    #   The size of the auth store.
    #
    # @since 0.2.2
    #
    def size
      @credentials.values.inject(0) { |total,paths| total + paths.length }
    end

    #
    # Inspects the auth store.
    #
    # @return [String]
    #   The inspected version of the auth store.
    #
    def inspect
      "#<#{self.class}: #{@credentials.inspect}>"
    end

  end
end
data/lib/spidr/body.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'nokogiri'
2
+
3
module Spidr
  #
  # Mixin providing access to, and parsing of, a response body.
  #
  # The including object is expected to supply `response`, `html?`, `rss?`,
  # `atom?`, `xml?`, `xsl?` and `content_charset`, and to hold the page URL
  # in `@url`; all of these are used by the methods below.
  #
  module Body
    #
    # The body of the response.
    #
    # @return [String]
    #   The body of the response, or an empty String if the response has
    #   no body.
    #
    def body
      (response.body || '')
    end

    #
    # Returns a parsed document object for HTML, XML, RSS and Atom pages.
    #
    # A successfully parsed document is memoized in `@doc`.
    #
    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
    #   The document that represents HTML or XML pages.
    #   Returns `nil` if the body is empty, if the page is neither HTML,
    #   XML, RSS, Atom nor XSL, or if the page could not be parsed properly.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
    #
    def doc
      return nil if body.empty?

      begin
        if html?
          @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
        elsif (rss? || atom? || xml? || xsl?)
          @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
        end
      rescue StandardError
        # parsing is best-effort: any failure means "no document"
        nil
      end
    end

    #
    # Searches the document for XPath or CSS Path paths.
    #
    # @param [Array<String>] paths
    #   CSS or XPath expressions to search the document with.
    #
    # @return [Array]
    #   The matched nodes from the document.
    #   Returns an empty Array if no nodes were matched, or if the page
    #   is not an HTML or XML document.
    #
    # @example
    #   page.search('//a[@href]')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
    #
    def search(*paths)
      document = doc

      document ? document.search(*paths) : []
    end

    #
    # Searches for the first occurrence of an XPath or CSS Path expression.
    #
    # @param [Array<String>] arguments
    #   CSS or XPath expressions, passed through to the document's `at`.
    #
    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
    #   The first matched node. Returns `nil` if no nodes could be matched,
    #   or if the page is not a HTML or XML document.
    #
    # @example
    #   page.at('//title')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
    #
    def at(*arguments)
      document = doc

      document.at(*arguments) if document
    end

    alias / search
    alias % at

    #
    # The title of the HTML page.
    #
    # @return [String, nil]
    #   The inner-text of the title element of the page, or `nil` if the
    #   page has no parsed document or no title element.
    #
    def title
      node = at('//title')

      node.inner_text if node
    end

    alias to_s body
  end
end
@@ -0,0 +1,202 @@
1
+ require 'spidrs/page'
2
+
3
+ require 'set'
4
+
5
module Spidr
  #
  # Stores HTTP Cookies organized by host-name.
  #
  # Three pieces of state are kept:
  #
  # * `@params`  - host-name => Hash of cookie name/value pairs.
  # * `@cookies` - host-name => cached, pre-encoded `Cookie` string.
  # * `@dirty`   - host-names whose cached string must be rebuilt.
  #
  class CookieJar

    include Enumerable

    #
    # Creates a new Cookie Jar object.
    #
    # @since 0.2.2
    #
    def initialize
      @params = {}

      @dirty   = Set.new
      @cookies = {}
    end

    #
    # Enumerates over the host-name and cookie params pairs in the
    # cookie jar.
    #
    # @yield [host, cookies]
    #   If a block is given, it will be passed each host-name and its
    #   cookie params.
    #
    # @yieldparam [String] host
    #   The host-name that the cookies are bound to.
    #
    # @yieldparam [Hash{String => String}] cookies
    #   The cookie params stored for the host.
    #
    # @since 0.2.2
    #
    def each(&block)
      @params.each(&block)
    end

    #
    # Returns the cookie params stored for the named host or domain.
    #
    # Note that looking up a host that is not in the jar yet creates (and
    # stores) an empty Hash for it.
    #
    # @param [String] host
    #   Host or domain name for cookies.
    #
    # @return [Hash{String => String}]
    #   The cookie params stored directly on the host (parent domains are
    #   not consulted, see {#cookies_for_host}).
    #
    # @since 0.2.2
    #
    def [](host)
      @params[host] ||= {}
    end

    #
    # Add a cookie to the jar for a particular domain.
    #
    # The new params are merged into the existing ones. If anything actually
    # changed, the cached `Cookie` strings of the host and of its
    # sub-domains are invalidated.
    #
    # @param [String] host
    #   Host or domain name to associate with the cookie.
    #
    # @param [Hash{String => String}] cookies
    #   Cookie params.
    #
    # @return [Hash{String => String}]
    #   The given cookie params.
    #
    # @since 0.2.2
    #
    def []=(host,cookies)
      collected = self[host]

      if cookies.any? { |name,value| collected[name] != value }
        collected.merge!(cookies)
        mark_dirty(host)
      end

      return cookies
    end

    #
    # Retrieve cookies for a domain from a page response header.
    #
    # @param [Page] page
    #   The response page from which to extract cookie data.
    #
    # @return [Boolean]
    #   Specifies whether cookies were added from the page.
    #
    # @since 0.2.2
    #
    def from_page(page)
      cookies = page.cookie_params
      return false if cookies.empty?

      self[page.url.host] = cookies
      return true
    end

    #
    # Returns the pre-encoded Cookie for a given host, including cookies
    # inherited from parent domain(s).
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [String, nil]
    #   The encoded Cookie, or `nil` if there are no cookies for the host.
    #
    # @since 0.2.2
    #
    def for_host(host)
      # Rebuild when the host was invalidated, or when nothing is cached
      # for it yet (e.g. a sub-domain that only inherits parent cookies).
      if @dirty.delete?(host) || !@cookies.has_key?(host)
        pairs = cookies_for_host(host).map do |name,value|
          "#{name}=#{value}"
        end

        if pairs.empty?
          # never cache an empty string: callers expect `nil` for "none"
          @cookies.delete(host)
        else
          @cookies[host] = pairs.join('; ')
        end
      end

      return @cookies[host]
    end

    #
    # Returns raw cookie value pairs for a given host. Includes cookies set on
    # parent domain(s).
    #
    # The returned Hash is a fresh copy; modifying it does not affect the
    # jar.
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [Hash{String => String}]
    #   Cookie params. The host's own cookies come first and take
    #   precedence; nearer parent domains take precedence over farther
    #   ones.
    #
    # @since 0.2.7
    #
    def cookies_for_host(host)
      # work on a copy, so that inherited cookies are never written back
      # into the host's own params
      host_cookies = (@params[host] || {}).dup
      sub_domains  = host.split('.')

      # walk up the parent domains, stopping at the two-label domain
      while sub_domains.length > 2
        sub_domains.shift

        parent_cookies = @params[sub_domains.join('.')]
        next unless parent_cookies

        # copy in the parent cookies, only if they haven't been
        # overridden yet.
        host_cookies.merge!(parent_cookies) { |_name,own,_inherited| own }
      end

      return host_cookies
    end

    #
    # Clear out the jar, removing all stored cookies.
    #
    # @return [CookieJar]
    #   The cleared cookie jar.
    #
    # @since 0.2.2
    #
    def clear!
      @params.clear

      @dirty.clear
      @cookies.clear
      return self
    end

    #
    # Size of the current cookie jar store.
    #
    # @return [Integer]
    #   The number of host-names in the jar.
    #
    # @since 0.2.2
    #
    def size
      @params.size
    end

    #
    # Inspects the cookie jar.
    #
    # @return [String]
    #   The inspected version of the cookie jar.
    #
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end

    private

    #
    # Flags the cached Cookie string of a host as outdated, together with
    # the cached strings of all of its sub-domains (which inherit the
    # host's cookies).
    #
    # @param [String] host
    #   The host whose cookie params changed.
    #
    def mark_dirty(host)
      @dirty << host

      suffix = ".#{host}"

      @cookies.each_key do |cached_host|
        @dirty << cached_host if cached_host.end_with?(suffix)
      end
    end

  end
end
+ end