spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,370 @@
1
+ require 'set'
2
+
3
module Spidr
  #
  # Predicates and accessors for the HTTP status code, Content-Type and
  # Set-Cookie headers of a response.
  #
  # The including class must provide `response` (responding to `code` and
  # `[]`) and `headers` (a Hash of lower-case header names to Arrays of
  # values).
  #
  module Headers
    # Reserved attribute names (lower-case) used within Cookie strings.
    # Attribute names are compared case-insensitively against this set.
    RESERVED_COOKIE_NAMES = Set[
      'path', 'expires', 'domain', 'max-age', 'secure', 'httponly', 'samesite'
    ].freeze

    #
    # The response code from the page.
    #
    # @return [Integer]
    #   Response code from the page.
    #
    def code
      response.code.to_i
    end

    #
    # Determines if the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `200`.
    #
    def is_ok?
      code == 200
    end

    alias ok? is_ok?

    #
    # Determines if the response code is `408` (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is `408`.
    #
    def timedout?
      # 408 is "Request Timeout"; 308 is "Permanent Redirect"
      code == 408
    end

    #
    # Determines if the response code is `400`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `400`.
    #
    def bad_request?
      code == 400
    end

    #
    # Determines if the response code is `401`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `401`.
    #
    def is_unauthorized?
      code == 401
    end

    alias unauthorized? is_unauthorized?

    #
    # Determines if the response code is `403`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `403`.
    #
    def is_forbidden?
      code == 403
    end

    alias forbidden? is_forbidden?

    #
    # Determines if the response code is `404`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `404`.
    #
    def is_missing?
      code == 404
    end

    alias missing? is_missing?

    #
    # Determines if the response code is `500`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `500`.
    #
    def had_internal_server_error?
      code == 500
    end

    #
    # The Content-Type of the page.
    #
    # @return [String]
    #   The raw Content-Type header, or an empty String if it is absent.
    #
    def content_type
      response['Content-Type'] || ''
    end

    #
    # The content types of the page.
    #
    # @return [Array<String>]
    #   The values within the Content-Type header.
    #
    # @since 0.2.2
    #
    def content_types
      headers['content-type'] || []
    end

    #
    # The charset included in the Content-Type.
    #
    # The `charset` parameter name is matched case-insensitively and
    # surrounding double quotes are removed from the value.
    #
    # @return [String, nil]
    #   The charset of the content, or `nil` if none was declared.
    #
    # @since 0.4.0
    #
    def content_charset
      content_types.each do |value|
        next unless value.include?(';')

        value.split(';').each do |param|
          name, charset = param.strip.split('=', 2)
          next unless name && charset
          next unless name.strip.downcase == 'charset'

          return charset.strip.delete('"')
        end
      end

      nil
    end

    #
    # Determines if any of the content-types of the page include a given
    # type. Media types are compared case-insensitively and parameters
    # (such as `charset`) are ignored.
    #
    # @param [String] type
    #   The content-type to test for.
    #
    # @return [Boolean]
    #   Specifies whether the page includes the given content-type.
    #
    # @example Match the Content-Type
    #   page.is_content_type?('application/json')
    #
    # @example Match the sub-type of the Content-Type
    #   page.is_content_type?('json')
    #
    # @since 0.4.0
    #
    def is_content_type?(type)
      wanted     = type.downcase
      whole_type = type.include?('/')

      content_types.any? do |value|
        # drop any parameters; an empty header value yields an empty String
        media_type = value.split(';', 2).first.to_s.strip.downcase
        next false if media_type.empty?

        # without a '/' in the query, only the sub-type is compared
        media_type = media_type.split('/', 2).last.to_s unless whole_type

        media_type == wanted
      end
    end

    #
    # Determines if the page is plain-text.
    #
    # @return [Boolean]
    #   Specifies whether the page is plain-text.
    #
    def plain_text?
      is_content_type?('text/plain')
    end

    alias txt? plain_text?

    #
    # Determines if the page is a Directory Listing.
    #
    # @return [Boolean]
    #   Specifies whether the page is a Directory Listing.
    #
    # @since 0.3.0
    #
    def directory?
      is_content_type?('text/directory')
    end

    #
    # Determines if the page is HTML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is HTML document.
    #
    def html?
      is_content_type?('text/html')
    end

    #
    # Determines if the page is XML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is XML document.
    #
    def xml?
      is_content_type?('text/xml') ||
        is_content_type?('application/xml')
    end

    #
    # Determines if the page is XML Stylesheet (XSL).
    #
    # @return [Boolean]
    #   Specifies whether the page is XML Stylesheet (XSL).
    #
    def xsl?
      is_content_type?('text/xsl')
    end

    #
    # Determines if the page is JavaScript.
    #
    # @return [Boolean]
    #   Specifies whether the page is JavaScript.
    #
    def javascript?
      is_content_type?('text/javascript') ||
        is_content_type?('application/javascript')
    end

    #
    # Determines if the page is JSON.
    #
    # @return [Boolean]
    #   Specifies whether the page is JSON.
    #
    # @since 0.3.0
    #
    def json?
      is_content_type?('application/json')
    end

    #
    # Determines if the page is a CSS stylesheet.
    #
    # @return [Boolean]
    #   Specifies whether the page is a CSS stylesheet.
    #
    def css?
      is_content_type?('text/css')
    end

    #
    # Determines if the page is a RSS feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is a RSS feed.
    #
    def rss?
      is_content_type?('application/rss+xml') ||
        is_content_type?('application/rdf+xml')
    end

    #
    # Determines if the page is an Atom feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is an Atom feed.
    #
    def atom?
      is_content_type?('application/atom+xml')
    end

    #
    # Determines if the page is a MS Word document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a MS Word document.
    #
    def ms_word?
      is_content_type?('application/msword')
    end

    #
    # Determines if the page is a PDF document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a PDF document.
    #
    def pdf?
      is_content_type?('application/pdf')
    end

    #
    # Determines if the page is a ZIP archive.
    #
    # @return [Boolean]
    #   Specifies whether the page is a ZIP archive.
    #
    def zip?
      is_content_type?('application/zip')
    end

    #
    # The raw Cookie String sent along with the page.
    #
    # @return [String]
    #   The raw Set-Cookie header, or an empty String if it is absent.
    #
    # @since 0.2.7
    #
    def cookie
      response['Set-Cookie'] || ''
    end

    alias raw_cookie cookie

    #
    # The Cookie values sent along with the page.
    #
    # @return [Array<String>]
    #   The Cookies from the response.
    #
    # @since 0.2.2
    #
    def cookies
      headers['set-cookie'] || []
    end

    #
    # The Cookie key -> value pairs returned with the response.
    #
    # Cookie attributes listed in {RESERVED_COOKIE_NAMES} are skipped
    # regardless of their capitalization, as are empty segments.
    #
    # @return [Hash{String => String}]
    #   The cookie keys and values.
    #
    # @since 0.2.2
    #
    def cookie_params
      params = {}

      cookies.each do |raw_cookie|
        raw_cookie.split(';').each do |param|
          name, value = param.strip.split('=', 2)

          # skip empty segments such as the one in "a=b;;c=d"
          next if name.nil? || name.empty?
          # attribute names are case-insensitive ("Path", "path", "PATH")
          next if RESERVED_COOKIE_NAMES.include?(name.downcase)

          params[name] = (value || '')
        end
      end

      params
    end
  end
end
@@ -0,0 +1,229 @@
1
+ require 'spidrs/extensions/uri'
2
+ require 'uri'
3
+
4
module Spidr
  #
  # Extraction of redirects, links and absolute URLs from a page.
  #
  # The including class must provide `url`, `headers`, `doc`, `search`,
  # `html?` and `is_redirect?`.
  #
  module Links
    include Enumerable

    #
    # Enumerates over the meta-redirect links in the page.
    #
    # The `url=` key of the refresh content is matched case-insensitively
    # and optional quotes around the URL are dropped.
    #
    # @yield [link]
    #   If a block is given, it will be passed every meta-redirect link
    #   from the page.
    #
    # @yieldparam [String] link
    #   A meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_meta_redirect
      return enum_for(:each_meta_redirect) unless block_given?
      return unless html? && doc

      search('//meta[@http-equiv and @content]').each do |node|
        next unless node.get_attribute('http-equiv') =~ /refresh/i

        content  = node.get_attribute('content')
        redirect = content.match(/url=\s*['"]?(\S+?)['"]?\s*$/i)

        yield redirect[1] if redirect
      end
    end

    #
    # Returns a boolean indicating whether or not page-level meta
    # redirects are present in this page.
    #
    # @return [Boolean]
    #   Specifies whether the page includes page-level redirects.
    #
    def meta_redirect?
      !each_meta_redirect.first.nil?
    end

    #
    # The meta-redirect links of the page.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @since 0.3.0
    #
    def meta_redirects
      each_meta_redirect.to_a
    end

    #
    # Enumerates over every HTTP or meta-redirect link in the page.
    #
    # @yield [link]
    #   The given block will be passed every redirection link from the page.
    #
    # @yieldparam [String] link
    #   A HTTP or meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_redirect(&block)
      return enum_for(:each_redirect) unless block

      location = headers['location']

      if location.nil?
        # check page-level meta redirects if there isn't a location header
        each_meta_redirect(&block)
      elsif location.kind_of?(Array)
        location.each(&block)
      else
        # usually the location header contains a single String
        yield location
      end
    end

    #
    # URLs that this document redirects to.
    #
    # @return [Array<String>]
    #   The links that this page redirects to (usually found in a
    #   location header or by way of a page-level meta redirect).
    #
    def redirects_to
      each_redirect.to_a
    end

    #
    # Enumerates over every link in the page.
    #
    # Redirect links are yielded first (when the page is a redirect),
    # followed by `a`, `frame`, `iframe`, `link` and `script` URLs.
    #
    # @yield [link]
    #   The given block will be passed every non-empty link in the page.
    #
    # @yieldparam [String] link
    #   A link in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_link
      return enum_for(:each_link) unless block_given?

      # drops nil and empty links before handing them to the caller
      filter = lambda { |link|
        yield link unless link.nil? || link.empty?
      }

      each_redirect(&filter) if is_redirect?

      return unless html? && doc

      # XPath query paired with the attribute that holds the URL
      [
        ['//a[@href]',      'href'],
        ['//frame[@src]',   'src'],
        ['//iframe[@src]',  'src'],
        ['//link[@href]',   'href'],
        ['//script[@src]',  'src']
      ].each do |xpath, attribute|
        doc.search(xpath).each do |node|
          filter.call(node.get_attribute(attribute))
        end
      end
    end

    #
    # The links from within the page.
    #
    # @return [Array<String>]
    #   All links within the HTML page, frame/iframe source URLs and any
    #   links in the `Location` header.
    #
    def links
      each_link.to_a
    end

    #
    # Enumerates over every absolute URL in the page.
    #
    # Links that cannot be converted by {#to_absolute} are skipped.
    #
    # @yield [url]
    #   The given block will be passed every URL in the page.
    #
    # @yieldparam [URI::HTTP] url
    #   An absolute URL in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_url
      return enum_for(:each_url) unless block_given?

      each_link do |link|
        absolute = to_absolute(link)

        yield absolute if absolute
      end
    end

    alias each each_url

    #
    # Absolute URIs from within the page.
    #
    # @return [Array<URI::HTTP>]
    #   The links from within the page, converted to absolute URIs.
    #
    def urls
      each_url.to_a
    end

    #
    # Normalizes and expands a given link into a proper URI.
    #
    # @param [String] link
    #   The link to normalize and expand.
    #
    # @return [URI::HTTP, nil]
    #   The normalized URI, or `nil` if the link could not be merged
    #   with the URL of the page.
    #
    def to_absolute(link)
      begin
        new_url = url.merge(link.to_s)
      rescue StandardError
        # malformed links are dropped; signals and exits still propagate
        return nil
      end

      if (path = new_url.path)
        # ensure that paths begin with a leading '/' for URI::FTP
        if new_url.scheme == 'ftp' && !path.start_with?('/')
          path = "/#{path}"
        end

        # make sure the path does not contain any .. or . directories,
        # since URI::Generic#merge cannot normalize paths such as
        # "/stuff/../"
        new_url.path = URI.expand_path(path)
      end

      new_url
    end
  end
end
data/lib/spidr/page.rb ADDED
@@ -0,0 +1,108 @@
1
+ require 'spidrs/headers'
2
+ require 'spidrs/body'
3
+ require 'spidrs/links'
4
+
5
module Spidr
  #
  # Represents a requested page from a website.
  #
  class Page

    include Headers
    include Body
    include Links

    # URL of the page
    attr_reader :url

    # HTTP Response
    attr_reader :response

    # Headers returned with the body
    attr_reader :headers

    #
    # Creates a new Page object.
    #
    # @param [URI::HTTP] url
    #   The URL of the page.
    #
    # @param [Net::HTTP::Response] response
    #   The response from the request for the page.
    #
    def initialize(url,response)
      @url      = url
      @response = response
      @headers  = response.to_hash
      @doc      = nil
    end

    #
    # The meta-redirect links of the page.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @deprecated
    #   Deprecated in 0.3.0 and will be removed in 0.4.0.
    #   Use {#meta_redirects} instead.
    #
    def meta_redirect
      warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.4.0'
      warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'

      meta_redirects
    end

    #
    # Determines if the response code is `300`, `301`, `302`, `303`,
    # `307` or `308`. Also checks for "soft" redirects added at the page
    # level by a meta refresh tag.
    #
    # @return [Boolean]
    #   Specifies whether the response code is a HTTP Redirect code.
    #
    def is_redirect?
      case code
      when 300..303, 307, 308
        true
      when 200
        # a successful response may still redirect via a meta refresh tag
        meta_redirect?
      else
        false
      end
    end

    alias redirect? is_redirect?

    protected

    #
    # Provides transparent access to the values in {#headers}.
    #
    # Every underscore in the method name is mapped to a dash, so that
    # `x_frame_options` looks up the `x-frame-options` header.
    #
    # @param [Symbol] name
    #   The name of the missing method.
    #
    # @param [Array] arguments
    #   Additional arguments for the missing method.
    #
    # @return [String]
    #   The missing method mapped to a header in {#headers}.
    #
    # @raise [NoMethodError]
    #   The missing method did not map to a header in {#headers}.
    #
    def method_missing(name,*arguments,&block)
      if arguments.empty? && block.nil?
        header_name = name.to_s.tr('_','-')

        return @response[header_name] if @response.key?(header_name)
      end

      super(name,*arguments,&block)
    end

    private

    #
    # Keeps `respond_to?` consistent with {#method_missing}.
    #
    # @param [Symbol] name
    #   The name of the method being queried.
    #
    # @param [Boolean] include_private
    #   Whether private methods should be considered.
    #
    # @return [Boolean]
    #   Specifies whether the name maps to a header of the response.
    #
    def respond_to_missing?(name,include_private=false)
      @response.key?(name.to_s.tr('_','-')) || super
    end

  end
end