spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,28 @@
1
module Spidr
  #
  # A username/password pair used for HTTP Authentication against a
  # website.
  #
  class AuthCredential

    # The username and the password, exactly as they were passed to
    # {#initialize}. Both are read-only.
    attr_reader :username, :password

    #
    # Builds a credential from a username and a password.
    #
    # @param [String] username
    #   The username for the credential.
    #
    # @param [String] password
    #   The password for the credential.
    #
    def initialize(username,password)
      @username, @password = username, password
    end

  end
end
@@ -0,0 +1,161 @@
1
+ require 'spidrs/extensions/uri'
2
+ require 'spidrs/auth_credential'
3
+ require 'spidrs/page'
4
+
5
+ require 'base64'
6
+
7
module Spidr
  #
  # Stores {AuthCredential} objects organized by a website's scheme,
  # host-name, port and sub-directory.
  #
  class AuthStore

    #
    # Creates a new, empty auth store.
    #
    # @since 0.2.2
    #
    def initialize
      # { [scheme, host, port] => { [path, directories] => credential } }
      @credentials = {}
    end

    #
    # Given a URL, return the most specific matching auth credential.
    #
    # @param [URI, #to_s] url
    #   A fully qualified url including optional path.
    #
    # @return [AuthCredential, nil]
    #   Closest matching {AuthCredential} values for the URL,
    #   or `nil` if nothing matches.
    #
    # @since 0.2.2
    #
    def [](url)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      paths = @credentials[[url.scheme, url.host, url.port]]
      return nil unless paths

      # directories of the requested path
      path_dirs = URI.expand_path(url.path).split('/')

      # every stored path that is a directory-wise prefix of the request;
      # the longest such prefix is the most specific credential
      candidates = paths.keys.select do |dirs|
        path_dirs[0,dirs.length] == dirs
      end
      closest = candidates.max_by { |dirs| dirs.length }

      # an empty Array (the root path) is truthy, so it still matches here
      closest && paths[closest]
    end

    #
    # Add an auth credential to the store for supplied base URL.
    #
    # @param [URI, #to_s] url
    #   A URL pattern to associate with a set of auth credentials.
    #
    # @param [AuthCredential] auth
    #   The auth credential for this URL pattern.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def []=(url,auth)
      # normalize the url
      url = URI(url.to_s) unless url.kind_of?(URI)

      # normalize the URL path and key it by its directories
      path_dirs = URI.expand_path(url.path).split('/')
      key = [url.scheme, url.host, url.port]

      (@credentials[key] ||= {})[path_dirs] = auth
      return auth
    end

    #
    # Convenience method to add username and password credentials
    # for a named URL.
    #
    # @param [URI, #to_s] url
    #   The base URL that requires authorization.
    #
    # @param [String] username
    #   The username required to access the URL.
    #
    # @param [String] password
    #   The password required to access the URL.
    #
    # @return [AuthCredential]
    #   The newly added auth credential.
    #
    # @since 0.2.2
    #
    def add(url,username,password)
      self[url] = AuthCredential.new(username,password)
    end

    #
    # Returns the base64 encoded authorization string for the URL
    # or `nil` if no authorization exists.
    #
    # @param [URI, #to_s] url
    #   The url.
    #
    # @return [String, nil]
    #   The base64 encoded authorization string or `nil`. The string
    #   never contains line feeds.
    #
    # @since 0.2.2
    #
    def for_url(url)
      auth = self[url]
      return nil unless auth

      # strict_encode64 emits no line feeds. encode64 appends "\n" and
      # wraps every 60 characters, which corrupts a header value.
      Base64.strict_encode64("#{auth.username}:#{auth.password}")
    end

    #
    # Clear the contents of the auth store.
    #
    # @return [AuthStore]
    #   The cleared auth store.
    #
    # @since 0.2.2
    #
    def clear!
      @credentials.clear
      return self
    end

    #
    # Size of the current auth store (number of URL paths stored).
    #
    # @return [Integer]
    #   The size of the auth store.
    #
    # @since 0.2.2
    #
    def size
      @credentials.values.inject(0) { |total,paths| total + paths.length }
    end

    #
    # Inspects the auth store.
    #
    # @return [String]
    #   The inspected version of the auth store.
    #
    def inspect
      "#<#{self.class}: #{@credentials.inspect}>"
    end

  end
end
data/lib/spidr/body.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'nokogiri'
2
+
3
module Spidr
  #
  # Body-related helpers for an object that exposes a `response`.
  # Parsing additionally uses `@url`, `content_charset` and the
  # content-type predicates (`html?`, `rss?`, `atom?`, `xml?`, `xsl?`)
  # of the including object.
  #
  module Body
    #
    # The body of the response.
    #
    # @return [String]
    #   The response body, or an empty String when the response has none.
    #
    def body
      response.body || ''
    end

    #
    # Returns a parsed document object for HTML, XML, RSS and Atom pages.
    #
    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
    #   The document that represents HTML or XML pages.
    #   Returns `nil` if the body is empty, if the page is neither HTML,
    #   XML, RSS, Atom nor XSL, or if parsing raised an error.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
    #
    def doc
      return nil if body.empty?

      begin
        if html?
          @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
        elsif rss? || atom? || xml? || xsl?
          @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
        end
      rescue StandardError
        # best-effort: any failure while parsing yields no document
        nil
      end
    end

    #
    # Searches the document for XPath or CSS Path paths.
    #
    # @param [Array<String>] paths
    #   CSS or XPath expressions to search the document with.
    #
    # @return [Array]
    #   The matched nodes from the document.
    #   Returns an empty Array if there is no parsed document.
    #
    # @example
    #   page.search('//a[@href]')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
    #
    def search(*paths)
      doc ? doc.search(*paths) : []
    end

    #
    # Searches for the first occurrence of an XPath or CSS Path expression.
    #
    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
    #   The first matched node. Returns `nil` if nothing matched or if
    #   there is no parsed document.
    #
    # @example
    #   page.at('//title')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
    #
    def at(*arguments)
      doc && doc.at(*arguments)
    end

    alias_method :/, :search
    alias_method :%, :at

    #
    # The title of the HTML page.
    #
    # @return [String, nil]
    #   The inner-text of the title element of the page, or `nil` when
    #   no title element could be found.
    #
    def title
      node = at('//title')
      node && node.inner_text
    end

    alias_method :to_s, :body
  end
end
@@ -0,0 +1,202 @@
1
+ require 'spidrs/page'
2
+
3
+ require 'set'
4
+
5
module Spidr
  #
  # Stores HTTP Cookies organized by host-name.
  #
  class CookieJar

    include Enumerable

    #
    # Creates a new Cookie Jar object.
    #
    # @since 0.2.2
    #
    def initialize
      # host-name => { cookie name => cookie value }
      @params = {}

      # hosts whose encoded Cookie string must be rebuilt
      @dirty = Set[]
      # host-name => encoded Cookie string
      @cookies = {}
    end

    #
    # Enumerates over the host-name and cookie params pairs in the
    # cookie jar.
    #
    # @yield [host, cookies]
    #   If a block is given, it will be passed each host-name and its
    #   cookie params.
    #
    # @yieldparam [String] host
    #   The host-name that the cookies are bound to.
    #
    # @yieldparam [Hash{String => String}] cookies
    #   The cookie params stored for the host.
    #
    # @since 0.2.2
    #
    def each(&block)
      @params.each(&block)
    end

    #
    # Returns the cookie params stored directly for the named host or
    # domain. An empty entry is created for a host that is not yet in
    # the jar.
    #
    # @param [String] host
    #   Host or domain name for cookies.
    #
    # @return [Hash{String => String}]
    #   The cookie params of the host (empty if none were stored).
    #
    # @since 0.2.2
    #
    def [](host)
      @params[host] ||= {}
    end

    #
    # Add cookies to the jar for a particular domain.
    #
    # @param [String] host
    #   Host or domain name to associate with the cookies.
    #
    # @param [Hash{String => String}] cookies
    #   Cookie params.
    #
    # @since 0.2.2
    #
    def []=(host,cookies)
      collected = self[host]

      # only touch the jar (and invalidate the encoded Cookie) when at
      # least one of the given cookies is new or has a different value
      if cookies.any? { |name,value| collected[name] != value }
        collected.merge!(cookies)
        @dirty << host
      end

      return cookies
    end

    #
    # Retrieve cookies for a domain from a page response header.
    #
    # @param [Page] page
    #   The response page from which to extract cookie data.
    #
    # @return [Boolean]
    #   Specifies whether cookies were added from the page.
    #
    # @since 0.2.2
    #
    def from_page(page)
      cookies = page.cookie_params
      return false if cookies.empty?

      self[page.url.host] = cookies
      return true
    end

    #
    # Returns the pre-encoded Cookie for a given host.
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [String, nil]
    #   The encoded Cookie, or `nil` if no cookies were ever stored
    #   directly for the host.
    #
    # @since 0.2.2
    #
    def for_host(host)
      if @dirty.include?(host)
        pairs = cookies_for_host(host).map { |name,value| "#{name}=#{value}" }

        @cookies[host] = pairs.join('; ')
        @dirty.delete(host)
      end

      return @cookies[host]
    end

    #
    # Returns raw cookie value pairs for a given host. Includes cookies set on
    # parent domain(s). The jar itself is not modified.
    #
    # @param [String] host
    #   The name of the host.
    #
    # @return [Hash{String => String}]
    #   Cookie params. The host's own cookies come first and take
    #   precedence over cookies of the same name on a parent domain.
    #
    # @since 0.2.7
    #
    def cookies_for_host(host)
      # work on a copy: merging parent cookies into the stored Hash would
      # permanently pollute the host's own cookie params
      host_cookies = (@params[host] || {}).dup
      sub_domains = host.split('.')

      while sub_domains.length > 2
        sub_domains.shift

        if (parent_cookies = @params[sub_domains.join('.')])
          # copy in the parent cookies, only if they haven't been
          # overridden yet.
          host_cookies.merge!(parent_cookies) { |_name,own,_parent| own }
        end
      end

      return host_cookies
    end

    #
    # Clear out the jar, removing all stored cookies.
    #
    # @return [CookieJar]
    #   The cleared cookie jar.
    #
    # @since 0.2.2
    #
    def clear!
      @params.clear

      @dirty.clear
      @cookies.clear
      return self
    end

    #
    # Size of the current cookie jar store (number of hosts).
    #
    # @return [Integer]
    #   The number of hosts in the jar.
    #
    # @since 0.2.2
    #
    def size
      @params.size
    end

    #
    # Inspects the cookie jar.
    #
    # @return [String]
    #   The inspected version of the cookie jar.
    #
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end

  end
end