raakt 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/raakt.rb CHANGED
@@ -1,101 +1,110 @@
1
- # :title: Ruby Accessibility Analysis Kit
2
- # =Ruby Accessibility Analysis Kit
1
+ # == The Ruby Accessibility Analysis Kit (RAAKT)
2
+ # :title: Ruby Accessibility Analysis Kit (RAAKT)
3
+ # Author:: Peter Krantz (http://www.peterkrantz.com/)
4
+ # License:: See LICENSE file
3
5
  #
4
- # See README for a full explanation of this library.
5
-
6
+ # RAAKT is a toolkit to find accessibility issues in HTML documents. RAAKT can be used as part of an automatic test procedure or as a standalone module for mass validation of all pages in a site.
7
+ #
8
+ # The ambition has been to provide tests that can be fully automated. Currently, none of the included tests should fail for any web page.
9
+ #
10
+ # Many of the tests included here map to tests defined in the Unified Web Evaluation Methodology (UWEM[http://www.wabcluster.org/uwem/tests/]). See note for each test to find the corresponding UWEM test.
11
+ #
12
+ # == Output
13
+ # RAAKT output is in the form of an array of Raakt::ErrorMessage objects.
14
+ #
15
+ # == Contributions
16
+ # Thanks to Derek Perrault for refactoring RAAKT to use Hpricot[http://code.whytheluckystiff.net/hpricot/] while at the same time making the code more readable.
17
+ #
18
+ # == Example usage
19
+ # See the examples folder for a small script that shows how to retrieve a remote web page and perform an accessibility test on it.
6
20
  module Raakt
7
- require 'rubyful_soup'
21
+ require 'hpricot'
8
22
 
9
23
  MESSAGES = {
10
- "missingtitle" => "The title element is missing. Provide a descriptive title for your document.",
11
- "emptytitle" => "The title element is empty. Provide a descriptive title for your document.",
12
- "missingalt" => "Missing alt attribute for image (with src '%s').",
13
- "missingheading" => "Missing first level heading (h1). Provide at least one first level heading describing document content.",
14
- "wronghstructure" => "Document heading structure is wrong.",
15
- "firsthnoth1" => "The first heading is not h1.",
16
- "hasnestedtables" => "You have one or more nested tables.",
17
- "missingsemantics"=> "You have used %s for visual formatting. Use CSS instead.",
18
- "hasflicker" => "You have used <blink> or <marquee>. These may create accessibility issues and should be avoided.",
19
- "missinglanginfo" => "Document language information is missing. Use the lang attribute on the html element.",
20
- "missingth" => "Missing table headings (th) for table #%s.",
21
- "ambiguouslinktext" => "One or more links have the same link text ('%s'). Make sure each link is unambiguous.",
22
- "fieldmissinglabel" => "A field (with id/name '%s') is missing a corresponding label element. Make sure a label exists for all visible fields.",
23
- "missingframetitle" => "Missing title attribute for frame with url %s",
24
- "hasmetarefresh" => "Client side redirect (meta refresh) detected. Use server side redirection instead."
24
+ :missing_title => "The title element is missing. Provide a descriptive title for your document.",
25
+ :empty_title => "The title element is empty. Provide a descriptive title for your document.",
26
+ :missing_alt => "Missing alt attribute for image (with src '%s').",
27
+ :missing_heading => "Missing first level heading (h1). Provide at least one first level heading describing document content.",
28
+ :wrong_h_structure => "Document heading structure is wrong.",
29
+ :first_h_not_h1 => "The first heading is not h1.",
30
+ :has_nested_tables => "You have one or more nested tables.",
31
+ :missing_semantics => "You have used %s for visual formatting. Use CSS instead.",
32
+ :has_flicker => "You have used <blink> and/or <marquee>. These may create accessibility issues and should be avoided.",
33
+ :missing_lang_info => "Document language information is missing. Use the lang attribute on the html element.",
34
+ :missing_th => "Missing table headings (th) for table #%s.",
35
+ :ambiguous_link_text => "One or more links have the same link text ('%s'). Make sure each link is unambiguous.",
36
+ :field_missing_label => "A field (with id/name '%s') is missing a corresponding label element. Make sure a label exists for all visible fields.",
37
+ :missing_frame_title => "Missing title attribute for frame with url %s",
38
+ :has_meta_refresh => "Client side redirect (meta refresh) detected. Use server side redirection instead.",
39
+ :charset_mismatch => "The character set specified in the HTTP headers does not match that specified in the markup.",
40
+ :embed_used => "You have used the embed element. It does not provide a way to express a text representation.",
41
+ :wrong_lang_code => "You have used a language code ('%s') not recognized in the ISO 639 standard.",
42
+ :fieldset_missing_legend => "Missing legend element for fieldset #%s.",
43
+ :missing_input_alt => "Missing alt attribute for image button with id/name '%s'.",
44
+ :missing_input_alt_text => "Missing alt text for image button with id/name '%s'.",
45
+ :missing_area_alt => "Missing alt attribute for area with id/name '%s'.",
46
+ :missing_area_alt_text => "Missing alt text for area with id/name '%s'."
25
47
  }
26
48
 
27
- VERSION = "0.4"
49
+ VERSION = "0.5"
28
50
 
29
51
  class ErrorMessage
30
52
 
31
53
  attr_reader :eid, :text, :note
32
-
54
+
33
55
  def initialize(eid, note=nil)
34
56
  @eid = eid
57
+
35
58
  if note
36
- @text = MESSAGES[eid].sub(/%s/, note)
59
+ @text = MESSAGES[@eid].sub(/%s/, note)
37
60
  else
38
- @text = MESSAGES[eid]
61
+ @text = MESSAGES[@eid]
39
62
  end
40
63
  @note = note
41
64
  end
42
-
65
+
43
66
  def to_s
44
- @eid + ": " + @text
67
+ "#{@eid}: #{@text}"
45
68
  end
46
- end
47
-
48
69
 
70
+ # Return single error message as an xml element.
71
+ def to_xml
72
+ "<message id=\"#{@eid}\">#{@text}</message>"
73
+ end
74
+ end
49
75
 
50
76
 
51
77
 
52
78
  class Test
53
79
 
54
- attr_accessor :soup, :html, :user_agent
55
-
56
- def initialize(html=nil)
80
+ attr_accessor :html, :headers, :user_agent, :ignore_bi
81
+
82
+ def initialize(html=nil, headers=nil)
57
83
  @html = html
58
- @soup = BeautifulSoup.new(@html) if html
59
- @user_agent = "Mozilla/5.0 (RAAKT v#{VERSION}; http://raakt.rubyforge.org; The Ruby Accessibility Analysis Kit)"
60
- end
61
-
62
- def feed(html)
63
- @html = html || ""
64
- if @html.length > 0
65
- @soup = BeautifulSoup.new(@html)
66
- else
67
- raise "You called feed with no data. There is nothing to check."
68
- end
84
+ @headers = headers
85
+ self.doc = @html if html
86
+ self.headers = @headers if headers
87
+ @ignore_bi = false
69
88
  end
70
-
71
-
72
-
73
- def feedurl(url)
74
- if url.length == 0
75
- raise "You called feedurl with a blank url. There is nothing to check."
76
- end
77
-
78
- #Clean the url and make sure protocol and trailing slash is available
79
- url = "http://" + url unless url[0..3] == "http"
80
-
81
- require 'open-uri'
82
89
 
83
- open(url, "User-Agent" => @user_agent) { |f|
84
- @html = f.read || ""
85
- }
86
-
87
- if @html.length == 0
88
- raise "Could not fetch html from the url #{url}. There is nothing to check."
89
- else
90
- @soup = BeautifulSoup.new(@html)
91
- end
92
-
90
+ # Set the HTML used in the test.
91
+ def doc=(html)
92
+ Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
93
+ @doc = Hpricot(html)
93
94
  end
94
95
 
95
-
96
-
96
+ # Set HTML headers to be used in the test. Headers are necessary for some tests (e.g. to check encoding).
97
+ def headers=(headers)
98
+ if headers
99
+ @headers = downcase_hash_keys(headers)
100
+ else
101
+ @headers = nil
102
+ end
103
+ end
104
+
105
+
106
+ # Call all check methods.
97
107
  def all
98
- #Call all check methods
99
108
  messages = []
100
109
 
101
110
  self.methods.each do |method|
@@ -107,69 +116,121 @@ module Raakt
107
116
  return messages
108
117
  end
109
118
 
119
+
120
+ # Verify that all fieldset elements have a legend child element. See UWEM 1.0 Test 12.3_HTML_01.
121
+ def check_fieldset_legend
122
+ messages = []
123
+ fieldsets = (@doc/"fieldset")
124
+ fieldset_instance = 1
125
+ for fieldset in fieldsets
126
+ if (fieldset/"legend").empty?
127
+ messages << ErrorMessage.new(:fieldset_missing_legend, fieldset_instance.to_s)
128
+ end
129
+ fieldset_instance += 1
130
+ end
131
+ messages
132
+ end
133
+
134
+
135
+ # Verify that the embed element isn't used. See UWEM 1.0 Test 1.1_HTML_06.
136
+ def check_embed
137
+ return [ErrorMessage.new(:embed_used)] unless (@doc/'embed').empty?
138
+ []
139
+ end
140
+
110
141
 
142
+ # Verify that the character set specified in the HTTP headers matches that specified in the HTML meta element.
143
+ def check_character_set
144
+ messages = []
145
+ header_charset = meta_charset = ""
146
+ if @headers and @headers.length > 0 then
147
+ if @headers.has_key?("content-type")
148
+ header_charset = parse_charset(@headers["content-type"].to_s)
149
+ end
150
+
151
+ #get meta element charset
152
+ meta_elements = @doc.search("//meta[@http-equiv]")
153
+ for element in meta_elements do
154
+ if element["http-equiv"].downcase == "content-type" then
155
+ meta_charset = parse_charset(element["content"])
156
+ end
157
+ end
158
+
159
+ if header_charset.length > 0 and meta_charset.length > 0
160
+ unless meta_charset == header_charset
161
+ messages << ErrorMessage.new(:charset_mismatch)
162
+ end
163
+ end
164
+ end
165
+
166
+ return messages
167
+
168
+ end
169
+
170
+
171
+ # Verify that all input type=image elements have an alt attribute.
172
+ def check_input_type_img
173
+ #Covers UWEM 1.0 Test 1.1_HTML_01
174
+
175
+ messages = []
176
+ image_input_buttons = @doc.search("input").select { |element| element['type'] =~ /image/i }
177
+ image_input_buttons.map { |element|
178
+ unless element['alt']
179
+ messages << ErrorMessage.new(:missing_input_alt, element['name'] || element['id'] || "")
180
+ else
181
+ if element['alt'].length == 0
182
+ messages << ErrorMessage.new(:missing_input_alt_text, element['name'] || element['id'] || "")
183
+ end
184
+ end
185
+ }
186
+
187
+ messages
188
+ end
189
+
190
+
191
+ # Verify that all img elements have an alt attribute.
111
192
  def check_images
112
- #soup = BeautifulSoup.new(html)
113
- images = @soup.find_all("img")
114
- messages = []
115
-
116
- for image in images:
117
- if image["alt"] == nil:
118
- img_src = image["src"] || ""
119
- messages << ErrorMessage.new("missingalt", img_src)
120
- end
121
- end
122
-
123
- return messages
193
+ no_alt_images = (@doc/"img:not([@alt])")
194
+ no_alt_images.map { |img| ErrorMessage.new(:missing_alt, img['src']) }
124
195
  end
125
196
 
126
197
 
127
- def check_title
128
- title = @soup.find("title")
129
- messages = []
130
-
131
- if title
132
- titletext = normalize_text(title.string)
133
- if titletext.length == 0
134
- messages << ErrorMessage.new("emptytitle")
135
- end
136
- else
137
- messages << ErrorMessage.new("missingtitle")
138
- end
139
-
140
- return messages
141
- end
198
+ # Verify that all area elements have a non-empty alt attribute. See UWEM 1.0 Test 1.1_HTML_01 (together with check_images)
199
+ def check_areas
200
+ messages = []
201
+ area_elements = (@doc/"area")
202
+ area_elements.map { |element|
203
+ unless element['alt']
204
+ messages << ErrorMessage.new(:missing_area_alt, element['name'] || element['id'] || "unknown")
205
+ else
206
+ if element['alt'].length == 0
207
+ messages << ErrorMessage.new(:missing_area_alt_text, element['name'] || element['id'] || "unknown")
208
+ end
209
+ end
210
+ }
142
211
 
143
-
144
- def check_has_heading
145
- messages = []
146
-
147
- if @soup.find_all("h1").length == 0
148
- messages << ErrorMessage.new("missingheading")
149
- end
150
-
151
- return messages
212
+ messages
152
213
  end
153
214
 
154
215
 
155
- def headings
156
- headings = []
157
- headings.push(@soup.find_all("h1")) if @soup.find_all("h1").length > 0
158
- headings.push(@soup.find_all("h2")) if @soup.find_all("h2").length > 0
159
- headings.push(@soup.find_all("h3")) if @soup.find_all("h3").length > 0
160
- headings.push(@soup.find_all("h4")) if @soup.find_all("h4").length > 0
161
- headings.push(@soup.find_all("h5")) if @soup.find_all("h5").length > 0
162
- headings.push(@soup.find_all("h6")) if @soup.find_all("h6").length > 0
163
-
164
- return headings.flatten
216
+
217
+ # Verify that the document has a non-empty title element.
218
+ def check_title
219
+ title = @doc.at('title')
220
+ return [ErrorMessage.new(:missing_title)] unless title
221
+ return [ErrorMessage.new(:empty_title)] if normalize_text(title.inner_html).empty?
222
+ []
165
223
  end
166
224
 
167
-
168
- def level(heading)
169
- Integer(heading[1,1])
225
+
226
+ # Verify that the document has at least one h1 element.
227
+ def check_has_heading
228
+ return [ErrorMessage.new(:missing_heading)] if (@doc/'h1').empty?
229
+ []
170
230
  end
171
231
 
172
-
232
+
233
+ # Verify that heading elements (h1-h6) appear in the correct order (no levels skipped). See UWEM 1.0 Test 3.5_HTML_03.
173
234
  def check_document_structure
174
235
  messages = []
175
236
  currentitem = 0
@@ -178,11 +239,11 @@ module Raakt
178
239
  for heading in docheadings
179
240
  if currentitem == 0
180
241
  if level(heading.name) != 1
181
- messages << ErrorMessage.new("firsthnoth1", "h" + heading.name[1,1])
242
+ messages << ErrorMessage.new(:first_h_not_h1, "h" + heading.name[1,1])
182
243
  end
183
244
  else
184
245
  if level(heading.name) - level(docheadings[currentitem - 1].name) > 1
185
- messages << ErrorMessage.new("wronghstructure")
246
+ messages << ErrorMessage.new(:wrong_h_structure)
186
247
  break
187
248
  end
188
249
  end
@@ -191,122 +252,125 @@ module Raakt
191
252
 
192
253
  end
193
254
 
194
- return messages
255
+ messages
195
256
  end
196
257
 
197
258
 
259
+ # Verify that the document does not have any nested tables. This is indicative of a table-based layout.
198
260
  def check_for_nested_tables
199
261
 
200
262
  messages = []
201
- tables = @soup.find_all("table")
263
+ tables = (@doc/"table")
202
264
 
203
265
  for table in tables
204
- if table.find_all("table").length > 0
205
- messages << ErrorMessage.new("hasnestedtables")
206
- break
266
+ unless (table/"table").empty?
267
+ return messages << ErrorMessage.new(:has_nested_tables)
207
268
  end
208
269
  end
209
270
 
210
- return messages
271
+ messages
211
272
  end
212
273
 
213
274
 
275
+ # Verify that all tables have at least one table header (th) element.
214
276
  def check_tables
215
-
216
277
  messages = []
217
- tables = @soup.find_all("table")
218
- hasth = false
278
+ tables = (@doc/"table")
219
279
  currenttable = 1
220
280
 
221
281
  for table in tables
222
- if table.thead
223
- if table.thead.tr
224
- if table.thead.tr.th
225
- hasth = true
226
- end
227
- end
228
- end
229
-
230
- if table.tr
231
- if table.tr.th
232
- hasth = true
233
- end
234
- end
235
-
236
- unless hasth
237
- messages << ErrorMessage.new("missingth", currenttable.to_s)
238
- end
282
+ hasth = false
283
+ hasth = true unless (table/">tr>th").empty?
284
+ hasth = true unless (table/">thead>tr>th").empty?
239
285
 
286
+ messages << ErrorMessage.new(:missing_th, currenttable.to_s) unless hasth
287
+
240
288
  currenttable += 1
241
289
  end
242
290
 
243
- return messages
291
+ messages
244
292
  end
245
293
 
246
294
 
295
+
296
+ # Verify that no formatting elements have been used. See UWEM 1.0 Test 7.2_HTML_01 and Test 7.3_HTML_01.
247
297
  def check_for_formatting_elements
248
298
 
249
- messages = []
250
- formatting_items = @soup.find_all(%w(font b i u tt small big strike s))
251
- flicker_items = @soup.find_all(["blink", "marquee"])
252
-
253
- formatting_items.each do |element|
254
- messages << ErrorMessage.new("missingsemantics", "<#{element.name}>")
255
- end
299
+ messages = []
256
300
 
257
- if flicker_items.length > 0
258
- messages << ErrorMessage.new("hasflicker")
259
- end
260
-
261
- return messages
301
+ formatting_elements = %w(font b i u tt small big strike s)
302
+ formatting_elements = %w(font u tt small big strike s) if @ignore_bi
303
+
304
+ formatting_items = (@doc/formatting_elements.join('|'))
305
+
306
+ unless formatting_items.empty?
307
+ messages << ErrorMessage.new(:missing_semantics, "#{formatting_items.join(', ')}")
308
+ end
309
+
310
+ flicker_elements = %w(blink marquee)
311
+ flicker_items = (@doc/flicker_elements.join('|'))
312
+
313
+ unless flicker_items.empty?
314
+ messages << ErrorMessage.new(:has_flicker)
315
+ end
316
+
317
+ messages
262
318
  end
263
319
 
264
320
 
321
+ # Verify that the root document html element has a lang attribute.
265
322
  def check_for_language_info
266
- messages = []
267
-
268
- htmlelement = @soup.find("html")
269
-
270
- lang = langinfo(htmlelement) || ""
271
-
272
- unless lang.length > 1
273
- messages << ErrorMessage.new("missinglanginfo")
274
- end
275
-
276
- return messages
323
+ messages = []
324
+ unless (@doc/'html[@lang]').empty?
325
+ lang_code = (@doc/"html").first["lang"].to_s
326
+ if lang_code.length < 2
327
+ messages << ErrorMessage.new(:missing_lang_info)
328
+ end
329
+ else
330
+ messages << ErrorMessage.new(:missing_lang_info)
331
+ end
332
+ messages
277
333
  end
334
+
335
+
336
+ # Verify that the html element has a valid lang code.
337
+ def check_valid_language_code
338
+ messages = []
339
+ unless (@doc/"html[@lang]").empty?
340
+ #load list of valid language codes
341
+ iso_lang_codes = []
342
+ IO.foreach(File.dirname(__FILE__) + "/iso_language_codes.txt") { |code| iso_lang_codes << code.chomp }
343
+
344
+ doc_main_lang_code = (@doc/"html").first["lang"].to_s.downcase
345
+ unless iso_lang_codes.include?(doc_main_lang_code[0..1])
346
+ messages << ErrorMessage.new(:wrong_lang_code, doc_main_lang_code)
347
+ end
348
+ end
349
+
350
+ messages
351
+ end
278
352
 
279
353
 
354
+ # Verify that no link texts are ambiguous. A typical example is the presence of multiple "Read more" links.
280
355
  def check_link_text
281
- messages = []
282
356
  links = get_links
283
- linktexts = links.collect { |el| el[3] }
284
357
 
285
- for link_a in links
286
- #compare to other links in collection
287
- for link_b in links
288
- if link_a[0] != link_b[0]
289
- if is_ambiguous_link(link_a, link_b)
290
- #add message if not added already for link text
291
- unless find_errormsg_with_text(messages, link_a[3])
292
- messages << ErrorMessage.new("ambiguouslinktext", link_a[3])
293
- end
294
- end
295
- end
296
- end
358
+ link = links.find do |link|
359
+ links.find { |cmp_link| is_ambiguous_link(link, cmp_link) }
297
360
  end
298
361
 
299
- return messages
362
+ return [] unless link
363
+ [ErrorMessage.new(:ambiguous_link_text, get_link_text(link))]
300
364
  end
301
365
 
302
366
 
367
+ # Verify that all form fields have a corresponding label element. See UWEM 1.0 Test 12.4_HTML_02.
303
368
  def check_form
304
369
  messages = []
305
370
  labels = get_labels
306
371
  fields = get_editable_fields
307
372
 
308
373
  #make sure all fields have associated labels
309
-
310
374
  label_for_ids = []
311
375
  for label in labels
312
376
  if label["for"]
@@ -320,94 +384,80 @@ module Raakt
320
384
  field_id = (field["id"] || "")
321
385
  field_identifier = (field["id"] || field["name"] || "unknown")
322
386
  if not label_for_ids.include?(field_id)
323
- messages << ErrorMessage.new("fieldmissinglabel", field_identifier)
387
+ messages << ErrorMessage.new(:field_missing_label, field_identifier)
324
388
  end
325
389
  end
326
390
 
327
- return messages
391
+ messages
328
392
  end
329
393
 
330
394
 
395
+ # Verify that all frame elements have a title attribute.
331
396
  def check_frames
332
- #Verify frame titles
333
-
334
- messages = []
335
- if is_frameset
336
- frames = @soup.find_all("frame")
337
- frame_title = ""
397
+ # Covers UWEM Test 12.1_HTML_01
398
+ return [] unless is_frameset
338
399
 
339
- for frame in frames
340
- frame_title = frame["title"] || ""
341
- if normalize_text(frame_title).length == 0
342
- messages << ErrorMessage.new("missingframetitle", frame["src"])
343
- end
344
- end
345
- end
346
-
347
- return messages
400
+ (@doc/"frame").find_all do |frame|
401
+ frame_title = frame['title'] || ''
402
+ normalize_text(frame_title).empty?
403
+ end.map { |frame| ErrorMessage.new(:missing_frame_title, frame['src']) }
348
404
  end
349
405
 
350
406
 
407
+ # Verify that the document does not use meta-refresh to redirect the user away after a period of time.
351
408
  def check_refresh
409
+ meta_elements = (@doc/'meta')
352
410
 
353
- messages = []
354
- meta_elements = @soup.find_all("meta")
355
-
356
- for element in meta_elements
357
- if element["http-equiv"] == "refresh"
358
- messages << ErrorMessage.new("hasmetarefresh")
359
- end
360
- end
361
-
362
- return messages
411
+ meta_elements.find_all do |element|
412
+ element["http-equiv"] == "refresh"
413
+ end.map { ErrorMessage.new(:has_meta_refresh) }
363
414
  end
364
415
 
365
416
 
366
417
  #Utility methods
367
-
368
- def is_ambiguous_link(link_a, link_b)
369
- #Link A and B are ambiguous if:
370
- #1. The url differs
371
- #2. The link text is identical
372
- #3. The title text is identical (if present)
373
- if link_a[1] != link_b[1] and
374
- normalize_text(link_a[2]) == normalize_text(link_b[2]) and
375
- normalize_text(link_a[3]) == normalize_text(link_b[3]) then
376
- return true
418
+
419
+ def headings
420
+ headings = []
421
+ 1.upto(6) do |i|
422
+ headings.push((@doc/"h#{i}")) if (@doc/"h#{i}").length > 0
377
423
  end
378
-
379
- return false
424
+ headings.flatten
380
425
  end
426
+
381
427
 
428
+ def level(heading)
429
+ Integer(heading[1].chr)
430
+ end
431
+
382
432
 
383
- def find_errormsg_with_text(messages, text)
384
- for errormessage in messages
385
- if errormessage.note == text
386
- return errormessage
387
- end
388
- end
433
+ def downcase_hash_keys(a_hash)
434
+ downcased_hash = {}
435
+ a_hash.collect {|key,value| downcased_hash[key.downcase] = value}
436
+ return downcased_hash
437
+ end
438
+
439
+ def parse_charset(contenttype)
440
+ # get charset identifier from content type string
441
+ if contenttype=~/charset=(.*)\w?/ then
442
+ return $1.downcase.strip
443
+ end
444
+
445
+ return ""
446
+ end
447
+
448
+
449
+ def is_ambiguous_link(link_a, link_b)
450
+ return false if links_point_to_same_resource?(link_a, link_b)
451
+ return true if link_text_identical?(link_a, link_b) &&
452
+ link_title_identical?(link_a, link_b)
389
453
 
390
- return nil
454
+ false
391
455
  end
392
456
 
393
-
394
457
  def get_links
395
- linkelements = @soup.find_all("a")
396
- links = []
397
- currentlink = 0
398
-
399
- for element in linkelements
400
- title = normalize_text((element['title'] || "").strip)
401
- linktext = normalize_text((elements_to_text(element) || "").strip)
402
- url = element['href']
403
- links << [currentlink, url, title, linktext]
404
- currentlink += 1
405
- end
406
-
407
- return links
458
+ (@doc/'a')
408
459
  end
409
460
 
410
-
411
461
  def langinfo(element)
412
462
  langval = ""
413
463
 
@@ -423,39 +473,33 @@ module Raakt
423
473
  end
424
474
 
425
475
 
426
- def img_to_text(imgtag)
427
- return (imgtag['alt'] || "")
476
+ def alt_to_text(element)
477
+ if element.kind_of?(Hpricot::Elem) then
478
+ element.has_attribute?("alt") ? element['alt'] : ""
479
+ else
480
+ ""
481
+ end
428
482
  end
429
-
430
-
483
+
431
484
  def elements_to_text(element)
432
- retval = ""
433
-
434
- for el in element.contents
435
- if el.class.to_s == 'NavigableString'
436
- retval += el
437
- else
438
- if el.name == "img"
439
- retval += img_to_text(el)
440
- else
441
- retval += elements_to_text(el)
442
- end
443
- end
485
+ str = ''
486
+ element.traverse_all_element do |elem|
487
+ elem.kind_of?(Hpricot::Text) ? str += "#{elem}" : str += alt_to_text(elem)
444
488
  end
445
489
 
446
- return retval
490
+ str
447
491
  end
448
492
 
449
493
 
450
494
  def normalize_text(text)
451
- text = (text || "")
452
- retval = text.gsub(/&nbsp;/, " ")
453
- retval = retval.gsub(/&#160;/, " ")
454
- retval = retval.gsub(/\n/, "")
455
- retval = retval.gsub(/\r/, "")
456
- retval = retval.gsub(/\t/, "")
495
+ text ||= ''
496
+ retval = text.gsub(/&nbsp;/, ' ')
497
+ retval = retval.gsub(/&#160;/, ' ')
498
+ retval = retval.gsub(/\n/, '')
499
+ retval = retval.gsub(/\r/, '')
500
+ retval = retval.gsub(/\t/, '')
457
501
  while / /.match(retval) do
458
- retval = retval.gsub(/ /, " ")
502
+ retval = retval.gsub(/ /, ' ')
459
503
  end
460
504
 
461
505
  retval = retval.strip
@@ -465,12 +509,12 @@ module Raakt
465
509
 
466
510
 
467
511
  def get_labels
468
- return @soup.find_all("label")
512
+ @doc/'label'
469
513
  end
470
514
 
471
515
 
472
516
  def get_editable_fields
473
- allfields = @soup.find_all(["textarea", "select", "input"])
517
+ allfields = (@doc/"textarea|select|input")
474
518
  fields = []
475
519
  field_type = ""
476
520
 
@@ -487,9 +531,37 @@ module Raakt
487
531
 
488
532
 
489
533
  def is_frameset
490
- return (@soup.find("frameset") != nil)
534
+ (@doc/"frameset").length > 0
491
535
  end
492
536
 
537
+
538
+ def link_text_identical?(link_a, link_b)
539
+ get_link_text(link_a) == get_link_text(link_b)
540
+ end
541
+
542
+ def link_title_identical?(link_a, link_b)
543
+ get_link_title(link_a) == get_link_title(link_b)
544
+ end
545
+
546
+ def links_point_to_same_resource?(link_a, link_b)
547
+ (link_a == link_b) ||
548
+ (get_link_url(link_a) == get_link_url(link_b))
549
+ end
550
+
551
+ def get_link_text(link)
552
+ text = (elements_to_text(link) || '').strip
553
+ normalize_text(text)
554
+ end
555
+
556
+ def get_link_url(link)
557
+ link['href']
558
+ end
559
+
560
+ def get_link_title(link)
561
+ text = (link['title'] || '').strip
562
+ normalize_text(text)
563
+ end
564
+
493
565
  end
494
566
 
495
- end
567
+ end