spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,370 @@
1
+ require 'set'
2
+
3
module Spidr
  #
  # Helper methods for inspecting the status code, the Content-Type and
  # the Set-Cookie information of a response.
  #
  # The including class must provide `response` (which answers `code`
  # and `[]`) and `headers` (whose `'content-type'` and `'set-cookie'`
  # entries are read as Arrays of Strings).
  #
  module Headers
    # Attribute names used within Set-Cookie strings (stored lower-case;
    # they are matched case-insensitively by {#cookie_params}).
    RESERVED_COOKIE_NAMES = Set[
      'path', 'expires', 'domain', 'max-age', 'secure', 'httponly', 'samesite'
    ].freeze

    #
    # The response code from the page.
    #
    # @return [Integer]
    #   Response code from the page.
    #
    def code
      response.code.to_i
    end

    #
    # Determines if the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `200`.
    #
    def is_ok?
      code == 200
    end

    alias ok? is_ok?

    #
    # Determines if the response code is `408` (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is `408`.
    #
    def timedout?
      # 308 is "Permanent Redirect"; the timeout status is 408
      code == 408
    end

    #
    # Determines if the response code is `400`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `400`.
    #
    def bad_request?
      code == 400
    end

    #
    # Determines if the response code is `401`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `401`.
    #
    def is_unauthorized?
      code == 401
    end

    alias unauthorized? is_unauthorized?

    #
    # Determines if the response code is `403`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `403`.
    #
    def is_forbidden?
      code == 403
    end

    alias forbidden? is_forbidden?

    #
    # Determines if the response code is `404`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `404`.
    #
    def is_missing?
      code == 404
    end

    alias missing? is_missing?

    #
    # Determines if the response code is `500`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `500`.
    #
    def had_internal_server_error?
      code == 500
    end

    #
    # The Content-Type of the page.
    #
    # @return [String]
    #   The raw Content-Type header, or an empty String if it is absent.
    #
    def content_type
      response['Content-Type'] || ''
    end

    #
    # The content types of the page.
    #
    # @return [Array<String>]
    #   The values within the Content-Type header (empty if absent).
    #
    # @since 0.2.2
    #
    def content_types
      headers['content-type'] || []
    end

    #
    # The charset included in the Content-Type.
    #
    # The `charset` parameter name is matched case-insensitively and
    # surrounding quotes are removed from the value.
    #
    # @return [String, nil]
    #   The charset of the content, or `nil` if none was declared.
    #
    # @since 0.4.0
    #
    def content_charset
      content_types.each do |value|
        value.split(';').each do |param|
          name, charset = param.strip.split('=', 2)

          # segments without '=' (e.g. the media type itself) are skipped
          next unless charset && name.strip.downcase == 'charset'

          # the value may be a quoted-string: charset="utf-8"
          return charset.strip.gsub(/\A["']|["']\z/, '')
        end
      end

      nil
    end

    #
    # Determines if any of the content-types of the page include a given
    # type. The comparison ignores case, surrounding whitespace and any
    # parameters that follow the media type.
    #
    # @param [String] type
    #   The content-type to test for.
    #
    # @return [Boolean]
    #   Specifies whether the page includes the given content-type.
    #
    # @example Match the Content-Type
    #   page.is_content_type?('application/json')
    #
    # @example Match the sub-type of the Content-Type
    #   page.is_content_type?('json')
    #
    # @since 0.4.0
    #
    def is_content_type?(type)
      expected   = type.strip.downcase
      # a '/' means the whole media type must match, otherwise only the
      # sub-type (the part after the '/') is compared
      whole_type = expected.include?('/')

      content_types.any? do |value|
        # `first` is nil for an empty header value, hence the `to_s`
        media_type = value.split(';', 2).first.to_s.strip.downcase
        media_type = media_type.split('/', 2).last.to_s unless whole_type

        media_type == expected
      end
    end

    #
    # Determines if the page is plain-text.
    #
    # @return [Boolean]
    #   Specifies whether the page is plain-text.
    #
    def plain_text?
      is_content_type?('text/plain')
    end

    alias txt? plain_text?

    #
    # Determines if the page is a Directory Listing.
    #
    # @return [Boolean]
    #   Specifies whether the page is a Directory Listing.
    #
    # @since 0.3.0
    #
    def directory?
      is_content_type?('text/directory')
    end

    #
    # Determines if the page is HTML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is HTML document.
    #
    def html?
      is_content_type?('text/html')
    end

    #
    # Determines if the page is XML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is XML document.
    #
    def xml?
      is_content_type?('text/xml') ||
        is_content_type?('application/xml')
    end

    #
    # Determines if the page is XML Stylesheet (XSL).
    #
    # @return [Boolean]
    #   Specifies whether the page is XML Stylesheet (XSL).
    #
    def xsl?
      is_content_type?('text/xsl')
    end

    #
    # Determines if the page is JavaScript.
    #
    # @return [Boolean]
    #   Specifies whether the page is JavaScript.
    #
    def javascript?
      is_content_type?('text/javascript') ||
        is_content_type?('application/javascript')
    end

    #
    # Determines if the page is JSON.
    #
    # @return [Boolean]
    #   Specifies whether the page is JSON.
    #
    # @since 0.3.0
    #
    def json?
      is_content_type?('application/json')
    end

    #
    # Determines if the page is a CSS stylesheet.
    #
    # @return [Boolean]
    #   Specifies whether the page is a CSS stylesheet.
    #
    def css?
      is_content_type?('text/css')
    end

    #
    # Determines if the page is a RSS feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is a RSS feed.
    #
    def rss?
      is_content_type?('application/rss+xml') ||
        is_content_type?('application/rdf+xml')
    end

    #
    # Determines if the page is an Atom feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is an Atom feed.
    #
    def atom?
      is_content_type?('application/atom+xml')
    end

    #
    # Determines if the page is a MS Word document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a MS Word document.
    #
    def ms_word?
      is_content_type?('application/msword')
    end

    #
    # Determines if the page is a PDF document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a PDF document.
    #
    def pdf?
      is_content_type?('application/pdf')
    end

    #
    # Determines if the page is a ZIP archive.
    #
    # @return [Boolean]
    #   Specifies whether the page is a ZIP archive.
    #
    def zip?
      is_content_type?('application/zip')
    end

    #
    # The raw Cookie String sent along with the page.
    #
    # @return [String]
    #   The raw Set-Cookie header, or an empty String if it is absent.
    #
    # @since 0.2.7
    #
    def cookie
      response['Set-Cookie'] || ''
    end

    alias raw_cookie cookie

    #
    # The Cookie values sent along with the page.
    #
    # @return [Array<String>]
    #   The Set-Cookie values from the response (empty if absent).
    #
    # @since 0.2.2
    #
    def cookies
      headers['set-cookie'] || []
    end

    #
    # The Cookie key -> value pairs returned with the response.
    #
    # Cookie attributes listed in {RESERVED_COOKIE_NAMES} are left out,
    # regardless of how they are capitalized (`Path`, `path`, ...).
    #
    # @return [Hash{String => String}]
    #   The cookie keys and values. Cookies without a value map to an
    #   empty String.
    #
    # @since 0.2.2
    #
    def cookie_params
      params = {}

      cookies.each do |set_cookie|
        set_cookie.split(';').each do |param|
          name, value = param.strip.split('=', 2)

          # skip empty segments such as the one in "a=1;;b=2"
          next if name.nil? || name.empty?
          next if RESERVED_COOKIE_NAMES.include?(name.downcase)

          params[name] = (value || '')
        end
      end

      params
    end
  end
end
@@ -0,0 +1,229 @@
1
+ require 'spidrs/extensions/uri'
2
+ require 'uri'
3
+
4
module Spidr
  #
  # Link and redirect extraction for a page.
  #
  # The including class must provide `html?`, `doc`, `search`, `headers`,
  # `is_redirect?` and `url`; `URI.expand_path` must also be available.
  # `each` is an alias for {#each_url}, which is what the included
  # `Enumerable` methods iterate over.
  #
  module Links
    include Enumerable

    #
    # Enumerates over the meta-redirect links in the page.
    #
    # The `url=` marker inside the `content` attribute is matched
    # case-insensitively; whitespace around the `=` and quotes around the
    # target are tolerated.
    #
    # @yield [link]
    #   If a block is given, it will be passed every meta-redirect link
    #   from the page.
    #
    # @yieldparam [String] link
    #   A meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_meta_redirect
      return enum_for(:each_meta_redirect) unless block_given?
      return unless html? && doc

      search('//meta[@http-equiv and @content]').each do |node|
        next unless node.get_attribute('http-equiv') =~ /refresh/i

        content = node.get_attribute('content').to_s

        # e.g. `0; url=/next`, `5; URL='http://example.com/'`
        if (redirect = content.match(/url\s*=\s*["']?(\S+?)["']?\s*\z/i))
          yield redirect[1]
        end
      end
    end

    #
    # Returns a boolean indicating whether or not page-level meta
    # redirects are present in this page.
    #
    # @return [Boolean]
    #   Specifies whether the page includes page-level redirects.
    #
    def meta_redirect?
      !each_meta_redirect.first.nil?
    end

    #
    # The meta-redirect links of the page.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @since 0.3.0
    #
    def meta_redirects
      each_meta_redirect.to_a
    end

    #
    # Enumerates over every HTTP or meta-redirect link in the page.
    #
    # @yield [link]
    #   The given block will be passed every redirection link from the page.
    #
    # @yieldparam [String] link
    #   A HTTP or meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_redirect(&block)
      return enum_for(:each_redirect) unless block

      location = headers['location']

      if location.nil?
        # check page-level meta redirects if there isn't a location header
        each_meta_redirect(&block)
      elsif location.kind_of?(Array)
        location.each(&block)
      else
        # the location header may also be a single String
        yield location
      end
    end

    #
    # URLs that this document redirects to.
    #
    # @return [Array<String>]
    #   The links that this page redirects to (usually found in a
    #   location header or by way of a page-level meta redirect).
    #
    def redirects_to
      each_redirect.to_a
    end

    #
    # Enumerates over every link in the page.
    #
    # Redirect targets come first (only when `is_redirect?` is true),
    # followed by the `a`, `frame`, `iframe`, `link` and `script` links of
    # an HTML document, in that order.
    #
    # @yield [link]
    #   The given block will be passed every non-empty link in the page.
    #
    # @yieldparam [String] link
    #   A link in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_link
      return enum_for(:each_link) unless block_given?

      # drops nil and empty links before they reach the caller's block
      filter = lambda { |link|
        yield link unless link.nil? || link.empty?
      }

      each_redirect(&filter) if is_redirect?

      return unless html? && doc

      [
        ['//a[@href]',      'href'],
        ['//frame[@src]',   'src'],
        ['//iframe[@src]',  'src'],
        ['//link[@href]',   'href'],
        ['//script[@src]',  'src']
      ].each do |xpath, attribute|
        doc.search(xpath).each do |node|
          filter.call(node.get_attribute(attribute))
        end
      end
    end

    #
    # The links from within the page.
    #
    # @return [Array<String>]
    #   All links within the HTML page, frame/iframe source URLs and any
    #   links in the `Location` header.
    #
    def links
      each_link.to_a
    end

    #
    # Enumerates over every absolute URL in the page.
    #
    # Links that cannot be converted by {#to_absolute} are skipped.
    #
    # @yield [url]
    #   The given block will be passed every URL in the page.
    #
    # @yieldparam [URI::HTTP] url
    #   An absolute URL in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_url
      return enum_for(:each_url) unless block_given?

      each_link do |link|
        absolute = to_absolute(link)

        yield absolute if absolute
      end
    end

    alias each each_url

    #
    # Absolute URIs from within the page.
    #
    # @return [Array<URI::HTTP>]
    #   The links from within the page, converted to absolute URIs.
    #
    def urls
      each_url.to_a
    end

    #
    # Normalizes and expands a given link into a proper URI.
    #
    # @param [String] link
    #   The link to normalize and expand.
    #
    # @return [URI::HTTP, nil]
    #   The normalized URI, or `nil` if the link could not be merged with
    #   the URL of the page.
    #
    def to_absolute(link)
      begin
        new_url = url.merge(link.to_s)
      rescue StandardError
        # malformed links are ignored; Interrupt, SystemExit and other
        # non-StandardError exceptions are deliberately not swallowed
        return nil
      end

      if new_url.path
        path = new_url.path

        # ensure that paths begin with a leading '/' for URI::FTP
        if new_url.scheme == 'ftp' && path[0, 1] != '/'
          path = "/#{path}"
        end

        # make sure the path does not contain any .. or . directories,
        # since URI::Generic#merge cannot normalize paths such as
        # "/stuff/../"
        new_url.path = URI.expand_path(path)
      end

      new_url
    end
  end
end
data/lib/spidr/page.rb ADDED
@@ -0,0 +1,108 @@
1
+ require 'spidrs/headers'
2
+ require 'spidrs/body'
3
+ require 'spidrs/links'
4
+
5
module Spidr
  #
  # Represents a requested page from a website.
  #
  class Page

    include Headers
    include Body
    include Links

    # URL of the page
    attr_reader :url

    # HTTP Response
    attr_reader :response

    # Headers returned with the body
    attr_reader :headers

    #
    # Creates a new Page object.
    #
    # @param [URI::HTTP] url
    #   The URL of the page.
    #
    # @param [Net::HTTP::Response] response
    #   The response from the request for the page.
    #
    def initialize(url,response)
      @url      = url
      @response = response
      @headers  = response.to_hash
      @doc      = nil
    end

    #
    # The meta-redirect links of the page.
    #
    # Prints a deprecation notice to STDERR on every call.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @deprecated
    #   Deprecated in 0.3.0 and will be removed in 0.4.0.
    #   Use {#meta_redirects} instead.
    #
    def meta_redirect
      STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.4.0'
      STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'

      meta_redirects
    end

    #
    # Determines if the response code is `300`, `301`, `302`, `303`,
    # `307` or `308`. Also checks for "soft" redirects added at the page
    # level by a meta refresh tag when the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is a HTTP Redirect code.
    #
    def is_redirect?
      case code
      when 300..303, 307, 308
        true
      when 200
        meta_redirect?
      else
        false
      end
    end

    alias redirect? is_redirect?

    protected

    #
    # Provides transparent access to the values in {#headers}.
    #
    # Every underscore in the method name is translated to a dash, so
    # `x_frame_options` looks up the `x-frame-options` header.
    #
    # @param [Symbol] name
    #   The name of the missing method.
    #
    # @param [Array] arguments
    #   Additional arguments for the missing method.
    #
    # @return [String]
    #   The missing method mapped to a header in {#headers}.
    #
    # @raise [NoMethodError]
    #   The missing method did not map to a header in {#headers}.
    #
    def method_missing(name,*arguments,&block)
      # only argument-less, block-less calls can be header lookups
      if (arguments.empty? && block.nil?)
        header_name = name.to_s.tr('_','-')

        if @response.key?(header_name)
          return @response[header_name]
        end
      end

      return super(name,*arguments,&block)
    end

    private

    #
    # Keeps `respond_to?` consistent with {#method_missing}.
    #
    # @param [Symbol] name
    #   The method name being queried.
    #
    # @param [Boolean] include_private
    #   Passed through to the default implementation.
    #
    # @return [Boolean]
    #   Specifies whether the name maps to a header of the response.
    #
    def respond_to_missing?(name,include_private=false)
      @response.key?(name.to_s.tr('_','-')) || super
    end

  end
end