raakt 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/lib/raakt.rb CHANGED
@@ -1,101 +1,110 @@
1
- # :title: Ruby Accessibility Analysis Kit
2
- # =Ruby Accessibility Analysis Kit
1
+ # == The Ruby Accessibility Analysis Kit (RAAKT)
2
+ # :title: Ruby Accessibility Analysis Kit (RAAKT)
3
+ # Author:: Peter Krantz (http://www.peterkrantz.com/)
4
+ # License:: See LICENSE file
3
5
  #
4
- # See README for a full explanation of this library.
5
-
6
+ # RAAKT is a toolkit to find accessibility issues in HTML documents. RAAKT can be used as part of an automatic test procedure or as a standalone module for mass validation of all pages in a site.
7
+ #
8
+ # The ambition has been to provide tests that can be fully automated. Currently, none of the included tests should fail for any web page.
9
+ #
10
+ # Many of the tests included here map to tests defined in the Unified Web Evaluation Methodology (UWEM[http://www.wabcluster.org/uwem/tests/]). See note for each test to find the corresponding UWEM test.
11
+ #
12
+ # == Output
13
+ # RAAKT output is in the form of an array of Raakt::ErrorMessage objects.
14
+ #
15
+ # == Contributions
16
+ # Thanks to Derek Perrault for refactoring RAAKT to use Hpricot[http://code.whytheluckystiff.net/hpricot/] while at the same time making the code more readable.
17
+ #
18
+ # == Example usage
19
+ # See the examples folder for a small script that shows how to retrieve a remote web page and perform an accessibility test on it.
6
20
  module Raakt
7
- require 'rubyful_soup'
21
+ require 'hpricot'
8
22
 
9
23
  MESSAGES = {
10
- "missingtitle" => "The title element is missing. Provide a descriptive title for your document.",
11
- "emptytitle" => "The title element is empty. Provide a descriptive title for your document.",
12
- "missingalt" => "Missing alt attribute for image (with src '%s').",
13
- "missingheading" => "Missing first level heading (h1). Provide at least one first level heading describing document content.",
14
- "wronghstructure" => "Document heading structure is wrong.",
15
- "firsthnoth1" => "The first heading is not h1.",
16
- "hasnestedtables" => "You have one or more nested tables.",
17
- "missingsemantics"=> "You have used %s for visual formatting. Use CSS instead.",
18
- "hasflicker" => "You have used <blink> or <marquee>. These may create accessibility issues and should be avoided.",
19
- "missinglanginfo" => "Document language information is missing. Use the lang attribute on the html element.",
20
- "missingth" => "Missing table headings (th) for table #%s.",
21
- "ambiguouslinktext" => "One or more links have the same link text ('%s'). Make sure each link is unambiguous.",
22
- "fieldmissinglabel" => "A field (with id/name '%s') is missing a corresponding label element. Make sure a label exists for all visible fields.",
23
- "missingframetitle" => "Missing title attribute for frame with url %s",
24
- "hasmetarefresh" => "Client side redirect (meta refresh) detected. Use server side redirection instead."
24
+ :missing_title => "The title element is missing. Provide a descriptive title for your document.",
25
+ :empty_title => "The title element is empty. Provide a descriptive title for your document.",
26
+ :missing_alt => "Missing alt attribute for image (with src '%s').",
27
+ :missing_heading => "Missing first level heading (h1). Provide at least one first level heading describing document content.",
28
+ :wrong_h_structure => "Document heading structure is wrong.",
29
+ :first_h_not_h1 => "The first heading is not h1.",
30
+ :has_nested_tables => "You have one or more nested tables.",
31
+ :missing_semantics => "You have used %s for visual formatting. Use CSS instead.",
32
+ :has_flicker => "You have used <blink> and/or <marquee>. These may create accessibility issues and should be avoided.",
33
+ :missing_lang_info => "Document language information is missing. Use the lang attribute on the html element.",
34
+ :missing_th => "Missing table headings (th) for table #%s.",
35
+ :ambiguous_link_text => "One or more links have the same link text ('%s'). Make sure each link is unambiguous.",
36
+ :field_missing_label => "A field (with id/name '%s') is missing a corresponding label element. Make sure a label exists for all visible fields.",
37
+ :missing_frame_title => "Missing title attribute for frame with url %s",
38
+ :has_meta_refresh => "Client side redirect (meta refresh) detected. Use server side redirection instead.",
39
+ :charset_mismatch => "The character set specified in the HTTP headers does not match that specified in the markup.",
40
+ :embed_used => "You have used the embed element. It does not provide a way to express a text representation.",
41
+ :wrong_lang_code => "You have used a language code ('%s') not recognized in the ISO 639 standard.",
42
+ :fieldset_missing_legend => "Missing legend element for fieldset #%s.",
43
+ :missing_input_alt => "Missing alt attribute for image button with id/name '%s'.",
44
+ :missing_input_alt_text => "Missing alt text for image button with id/name '%s'.",
45
+ :missing_area_alt => "Missing alt attribute for area with id/name '%s'.",
46
+ :missing_area_alt_text => "Missing alt text for area with id/name '%s'."
25
47
  }
26
48
 
27
- VERSION = "0.4"
49
+ VERSION = "0.5"
28
50
 
29
51
  class ErrorMessage
30
52
 
31
53
  attr_reader :eid, :text, :note
32
-
54
+
33
55
  def initialize(eid, note=nil)
34
56
  @eid = eid
57
+
35
58
  if note
36
- @text = MESSAGES[eid].sub(/%s/, note)
59
+ @text = MESSAGES[@eid].sub(/%s/, note)
37
60
  else
38
- @text = MESSAGES[eid]
61
+ @text = MESSAGES[@eid]
39
62
  end
40
63
  @note = note
41
64
  end
42
-
65
+
43
66
  def to_s
44
- @eid + ": " + @text
67
+ "#{@eid}: #{@text}"
45
68
  end
46
- end
47
-
48
69
 
70
+ # Return single error message as an xml element.
71
+ def to_xml
72
+ "<message id=\"#{@eid}\">#{@text}</message>"
73
+ end
74
+ end
49
75
 
50
76
 
51
77
 
52
78
  class Test
53
79
 
54
- attr_accessor :soup, :html, :user_agent
55
-
56
- def initialize(html=nil)
80
+ attr_accessor :html, :headers, :user_agent, :ignore_bi
81
+
82
+ def initialize(html=nil, headers=nil)
57
83
  @html = html
58
- @soup = BeautifulSoup.new(@html) if html
59
- @user_agent = "Mozilla/5.0 (RAAKT v#{VERSION}; http://raakt.rubyforge.org; The Ruby Accessibility Analysis Kit)"
60
- end
61
-
62
- def feed(html)
63
- @html = html || ""
64
- if @html.length > 0
65
- @soup = BeautifulSoup.new(@html)
66
- else
67
- raise "You called feed with no data. There is nothing to check."
68
- end
84
+ @headers = headers
85
+ self.doc = @html if html
86
+ self.headers = @headers if headers
87
+ @ignore_bi = false
69
88
  end
70
-
71
-
72
-
73
- def feedurl(url)
74
- if url.length == 0
75
- raise "You called feedurl with a blank url. There is nothing to check."
76
- end
77
-
78
- #Clean the url and make sure protocol and trailing slash is available
79
- url = "http://" + url unless url[0..3] == "http"
80
-
81
- require 'open-uri'
82
89
 
83
- open(url, "User-Agent" => @user_agent) { |f|
84
- @html = f.read || ""
85
- }
86
-
87
- if @html.length == 0
88
- raise "Could not fetch html from the url #{url}. There is nothing to check."
89
- else
90
- @soup = BeautifulSoup.new(@html)
91
- end
92
-
90
+ # Set the HTML used in the test.
91
+ def doc=(html)
92
+ Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
93
+ @doc = Hpricot(html)
93
94
  end
94
95
 
95
-
96
-
96
+ # Set HTML headers to be used in the test. Headers are necessary for some tests (e.g. to check encoding).
97
+ def headers=(headers)
98
+ if headers
99
+ @headers = downcase_hash_keys(headers)
100
+ else
101
+ @headers = nil
102
+ end
103
+ end
104
+
105
+
106
+ # Call all check methods.
97
107
  def all
98
- #Call all check methods
99
108
  messages = []
100
109
 
101
110
  self.methods.each do |method|
@@ -107,69 +116,121 @@ module Raakt
107
116
  return messages
108
117
  end
109
118
 
119
+
120
+ # Verify that all fieldset elements have a legend child element. See UWEM 1.0 Test 12.3_HTML_01.
121
+ def check_fieldset_legend
122
+ messages = []
123
+ fieldsets = (@doc/"fieldset")
124
+ fieldset_instance = 1
125
+ for fieldset in fieldsets
126
+ if (fieldset/"legend").empty?
127
+ messages << ErrorMessage.new(:fieldset_missing_legend, fieldset_instance.to_s)
128
+ end
129
+ fieldset_instance += 1
130
+ end
131
+ messages
132
+ end
133
+
134
+
135
+ # Verify that the embed element isn't used. See UWEM 1.0 Test 1.1_HTML_06.
136
+ def check_embed
137
+ return [ErrorMessage.new(:embed_used)] unless (@doc/'embed').empty?
138
+ []
139
+ end
140
+
110
141
 
142
+ # Verify that the charater set specified in HTTP headers match that specidied in the HTML meta element.
143
+ def check_character_set
144
+ messages = []
145
+ header_charset = meta_charset = ""
146
+ if @headers and @headers.length > 0 then
147
+ if @headers.has_key?("content-type")
148
+ header_charset = parse_charset(@headers["content-type"].to_s)
149
+ end
150
+
151
+ #get meta element charset
152
+ meta_elements = @doc.search("//meta[@http-equiv]")
153
+ for element in meta_elements do
154
+ if element["http-equiv"].downcase == "content-type" then
155
+ meta_charset = parse_charset(element["content"])
156
+ end
157
+ end
158
+
159
+ if header_charset.length > 0 and meta_charset.length > 0
160
+ unless meta_charset == header_charset
161
+ messages << ErrorMessage.new(:charset_mismatch)
162
+ end
163
+ end
164
+ end
165
+
166
+ return messages
167
+
168
+ end
169
+
170
+
171
+ # Verify that all input type=image elements have an alt attribute.
172
+ def check_input_type_img
173
+ #Covers UWEM 1.0 Test 1.1_HTML_01
174
+
175
+ messages = []
176
+ image_input_buttons = @doc.search("input").select { |element| element['type'] =~ /image/i }
177
+ image_input_buttons.map { |element|
178
+ unless element['alt']
179
+ messages << ErrorMessage.new(:missing_input_alt, element['name'] || element['id'] || "")
180
+ else
181
+ if element['alt'].length == 0
182
+ messages << ErrorMessage.new(:missing_input_alt_text, element['name'] || element['id'] || "")
183
+ end
184
+ end
185
+ }
186
+
187
+ messages
188
+ end
189
+
190
+
191
+ # Verify that all img elements have an alt attribute.
111
192
  def check_images
112
- #soup = BeautifulSoup.new(html)
113
- images = @soup.find_all("img")
114
- messages = []
115
-
116
- for image in images:
117
- if image["alt"] == nil:
118
- img_src = image["src"] || ""
119
- messages << ErrorMessage.new("missingalt", img_src)
120
- end
121
- end
122
-
123
- return messages
193
+ no_alt_images = (@doc/"img:not([@alt])")
194
+ no_alt_images.map { |img| ErrorMessage.new(:missing_alt, img['src']) }
124
195
  end
125
196
 
126
197
 
127
- def check_title
128
- title = @soup.find("title")
129
- messages = []
130
-
131
- if title
132
- titletext = normalize_text(title.string)
133
- if titletext.length == 0
134
- messages << ErrorMessage.new("emptytitle")
135
- end
136
- else
137
- messages << ErrorMessage.new("missingtitle")
138
- end
139
-
140
- return messages
141
- end
198
+ # Verify that all area elements have a non-empty alt attribute. See UWEM 1.0 Test 1.1_HTML_01 (together with check_images)
199
+ def check_areas
200
+ messages = []
201
+ area_elements = (@doc/"area")
202
+ area_elements.map { |element|
203
+ unless element['alt']
204
+ messages << ErrorMessage.new(:missing_area_alt, element['name'] || element['id'] || "unknown")
205
+ else
206
+ if element['alt'].length == 0
207
+ messages << ErrorMessage.new(:missing_area_alt_text, element['name'] || element['id'] || "unknown")
208
+ end
209
+ end
210
+ }
142
211
 
143
-
144
- def check_has_heading
145
- messages = []
146
-
147
- if @soup.find_all("h1").length == 0
148
- messages << ErrorMessage.new("missingheading")
149
- end
150
-
151
- return messages
212
+ messages
152
213
  end
153
214
 
154
215
 
155
- def headings
156
- headings = []
157
- headings.push(@soup.find_all("h1")) if @soup.find_all("h1").length > 0
158
- headings.push(@soup.find_all("h2")) if @soup.find_all("h2").length > 0
159
- headings.push(@soup.find_all("h3")) if @soup.find_all("h3").length > 0
160
- headings.push(@soup.find_all("h4")) if @soup.find_all("h4").length > 0
161
- headings.push(@soup.find_all("h5")) if @soup.find_all("h5").length > 0
162
- headings.push(@soup.find_all("h6")) if @soup.find_all("h6").length > 0
163
-
164
- return headings.flatten
216
+
217
+ # Verify that the document has a non-empty title element.
218
+ def check_title
219
+ title = @doc.at('title')
220
+ return [ErrorMessage.new(:missing_title)] unless title
221
+ return [ErrorMessage.new(:empty_title)] if normalize_text(title.inner_html).empty?
222
+ []
165
223
  end
166
224
 
167
-
168
- def level(heading)
169
- Integer(heading[1,1])
225
+
226
+ # Verify that the document has at least one h1 element.
227
+ def check_has_heading
228
+ return [ErrorMessage.new(:missing_heading)] if (@doc/'h1').empty?
229
+ []
170
230
  end
171
231
 
172
-
232
+
233
+ # Verify that heading elements (h1-h6) appear in the correct order (no levels skipped). See UWEM 1.0 Test 3.5_HTML_03.
173
234
  def check_document_structure
174
235
  messages = []
175
236
  currentitem = 0
@@ -178,11 +239,11 @@ module Raakt
178
239
  for heading in docheadings
179
240
  if currentitem == 0
180
241
  if level(heading.name) != 1
181
- messages << ErrorMessage.new("firsthnoth1", "h" + heading.name[1,1])
242
+ messages << ErrorMessage.new(:first_h_not_h1, "h" + heading.name[1,1])
182
243
  end
183
244
  else
184
245
  if level(heading.name) - level(docheadings[currentitem - 1].name) > 1
185
- messages << ErrorMessage.new("wronghstructure")
246
+ messages << ErrorMessage.new(:wrong_h_structure)
186
247
  break
187
248
  end
188
249
  end
@@ -191,122 +252,125 @@ module Raakt
191
252
 
192
253
  end
193
254
 
194
- return messages
255
+ messages
195
256
  end
196
257
 
197
258
 
259
+ # Verify that the document does not have any nested tabled. This is indicative of a table-based layout.
198
260
  def check_for_nested_tables
199
261
 
200
262
  messages = []
201
- tables = @soup.find_all("table")
263
+ tables = (@doc/"table")
202
264
 
203
265
  for table in tables
204
- if table.find_all("table").length > 0
205
- messages << ErrorMessage.new("hasnestedtables")
206
- break
266
+ unless (table/"table").empty?
267
+ return messages << ErrorMessage.new(:has_nested_tables)
207
268
  end
208
269
  end
209
270
 
210
- return messages
271
+ messages
211
272
  end
212
273
 
213
274
 
275
+ # Verify that all tables have at least on table header (th) element.
214
276
  def check_tables
215
-
216
277
  messages = []
217
- tables = @soup.find_all("table")
218
- hasth = false
278
+ tables = (@doc/"table")
219
279
  currenttable = 1
220
280
 
221
281
  for table in tables
222
- if table.thead
223
- if table.thead.tr
224
- if table.thead.tr.th
225
- hasth = true
226
- end
227
- end
228
- end
229
-
230
- if table.tr
231
- if table.tr.th
232
- hasth = true
233
- end
234
- end
235
-
236
- unless hasth
237
- messages << ErrorMessage.new("missingth", currenttable.to_s)
238
- end
282
+ hasth = false
283
+ hasth = true unless (table/">tr>th").empty?
284
+ hasth = true unless (table/">thead>tr>th").empty?
239
285
 
286
+ messages << ErrorMessage.new(:missing_th, currenttable.to_s) unless hasth
287
+
240
288
  currenttable += 1
241
289
  end
242
290
 
243
- return messages
291
+ messages
244
292
  end
245
293
 
246
294
 
295
+
296
+ # Verify that no formatting elements have been used. See UWEM 1.0 Test 7.2_HTML_01 and Test 7.3_HTML_01.
247
297
  def check_for_formatting_elements
248
298
 
249
- messages = []
250
- formatting_items = @soup.find_all(%w(font b i u tt small big strike s))
251
- flicker_items = @soup.find_all(["blink", "marquee"])
252
-
253
- formatting_items.each do |element|
254
- messages << ErrorMessage.new("missingsemantics", "<#{element.name}>")
255
- end
299
+ messages = []
256
300
 
257
- if flicker_items.length > 0
258
- messages << ErrorMessage.new("hasflicker")
259
- end
260
-
261
- return messages
301
+ formatting_elements = %w(font b i u tt small big strike s)
302
+ formatting_elements = %w(font u tt small big strike s) if @ignore_bi
303
+
304
+ formatting_items = (@doc/formatting_elements.join('|'))
305
+
306
+ unless formatting_items.empty?
307
+ messages << ErrorMessage.new(:missing_semantics, "#{formatting_items.join(', ')}")
308
+ end
309
+
310
+ flicker_elements = %w(blink marquee)
311
+ flicker_items = (@doc/flicker_elements.join('|'))
312
+
313
+ unless flicker_items.empty?
314
+ messages << ErrorMessage.new(:has_flicker)
315
+ end
316
+
317
+ messages
262
318
  end
263
319
 
264
320
 
321
+ # Verify that the root documet html element as a lang attribute.
265
322
  def check_for_language_info
266
- messages = []
267
-
268
- htmlelement = @soup.find("html")
269
-
270
- lang = langinfo(htmlelement) || ""
271
-
272
- unless lang.length > 1
273
- messages << ErrorMessage.new("missinglanginfo")
274
- end
275
-
276
- return messages
323
+ messages = []
324
+ unless (@doc/'html[@lang]').empty?
325
+ lang_code = (@doc/"html").first["lang"].to_s
326
+ if lang_code.length < 2
327
+ messages << ErrorMessage.new(:missing_lang_info)
328
+ end
329
+ else
330
+ messages << ErrorMessage.new(:missing_lang_info)
331
+ end
332
+ messages
277
333
  end
334
+
335
+
336
+ # Verify that the html element has a valid lang code.
337
+ def check_valid_language_code
338
+ messages = []
339
+ unless (@doc/"html[@lang]").empty?
340
+ #load list of valid language codes
341
+ iso_lang_codes = []
342
+ IO.foreach(File.dirname(__FILE__) + "/iso_language_codes.txt") { |code| iso_lang_codes << code.chomp }
343
+
344
+ doc_main_lang_code = (@doc/"html").first["lang"].to_s.downcase
345
+ unless iso_lang_codes.include?(doc_main_lang_code[0..1])
346
+ messages << ErrorMessage.new(:wrong_lang_code, doc_main_lang_code)
347
+ end
348
+ end
349
+
350
+ messages
351
+ end
278
352
 
279
353
 
354
+ # Verify that no link texts are ambiguous. A typical example is the presence of multiple "Read more" links.
280
355
  def check_link_text
281
- messages = []
282
356
  links = get_links
283
- linktexts = links.collect { |el| el[3] }
284
357
 
285
- for link_a in links
286
- #compare to other links in collection
287
- for link_b in links
288
- if link_a[0] != link_b[0]
289
- if is_ambiguous_link(link_a, link_b)
290
- #add message if not added already for link text
291
- unless find_errormsg_with_text(messages, link_a[3])
292
- messages << ErrorMessage.new("ambiguouslinktext", link_a[3])
293
- end
294
- end
295
- end
296
- end
358
+ link = links.find do |link|
359
+ links.find { |cmp_link| is_ambiguous_link(link, cmp_link) }
297
360
  end
298
361
 
299
- return messages
362
+ return [] unless link
363
+ [ErrorMessage.new(:ambiguous_link_text, get_link_text(link))]
300
364
  end
301
365
 
302
366
 
367
+ # Verify that all form fields have a corresponding label element. See UWEM 1.0 Test 12.4_HTML_02.
303
368
  def check_form
304
369
  messages = []
305
370
  labels = get_labels
306
371
  fields = get_editable_fields
307
372
 
308
373
  #make sure all fields have associated labels
309
-
310
374
  label_for_ids = []
311
375
  for label in labels
312
376
  if label["for"]
@@ -320,94 +384,80 @@ module Raakt
320
384
  field_id = (field["id"] || "")
321
385
  field_identifier = (field["id"] || field["name"] || "unknown")
322
386
  if not label_for_ids.include?(field_id)
323
- messages << ErrorMessage.new("fieldmissinglabel", field_identifier)
387
+ messages << ErrorMessage.new(:field_missing_label, field_identifier)
324
388
  end
325
389
  end
326
390
 
327
- return messages
391
+ messages
328
392
  end
329
393
 
330
394
 
395
+ # Verify that all frame elements have a title atribute.
331
396
  def check_frames
332
- #Verify frame titles
333
-
334
- messages = []
335
- if is_frameset
336
- frames = @soup.find_all("frame")
337
- frame_title = ""
397
+ # Covers UWEM Test 12.1_HTML_01
398
+ return [] unless is_frameset
338
399
 
339
- for frame in frames
340
- frame_title = frame["title"] || ""
341
- if normalize_text(frame_title).length == 0
342
- messages << ErrorMessage.new("missingframetitle", frame["src"])
343
- end
344
- end
345
- end
346
-
347
- return messages
400
+ (@doc/"frame").find_all do |frame|
401
+ frame_title = frame['title'] || ''
402
+ normalize_text(frame_title).empty?
403
+ end.map { |frame| ErrorMessage.new(:missing_frame_title, frame['src']) }
348
404
  end
349
405
 
350
406
 
407
+ # Verify that the document does not use meta-refresh to redirect the user away after a period of time.
351
408
  def check_refresh
409
+ meta_elements = (@doc/'meta')
352
410
 
353
- messages = []
354
- meta_elements = @soup.find_all("meta")
355
-
356
- for element in meta_elements
357
- if element["http-equiv"] == "refresh"
358
- messages << ErrorMessage.new("hasmetarefresh")
359
- end
360
- end
361
-
362
- return messages
411
+ meta_elements.find_all do |element|
412
+ element["http-equiv"] == "refresh"
413
+ end.map { ErrorMessage.new(:has_meta_refresh) }
363
414
  end
364
415
 
365
416
 
366
417
  #Utility methods
367
-
368
- def is_ambiguous_link(link_a, link_b)
369
- #Link A and B are ambiguous if:
370
- #1. The url differs
371
- #2. The link text is identical
372
- #3. The title text is identical (if present)
373
- if link_a[1] != link_b[1] and
374
- normalize_text(link_a[2]) == normalize_text(link_b[2]) and
375
- normalize_text(link_a[3]) == normalize_text(link_b[3]) then
376
- return true
418
+
419
+ def headings
420
+ headings = []
421
+ 1.upto(6) do |i|
422
+ headings.push((@doc/"h#{i}")) if (@doc/"h#{i}").length > 0
377
423
  end
378
-
379
- return false
424
+ headings.flatten
380
425
  end
426
+
381
427
 
428
+ def level(heading)
429
+ Integer(heading[1].chr)
430
+ end
431
+
382
432
 
383
- def find_errormsg_with_text(messages, text)
384
- for errormessage in messages
385
- if errormessage.note == text
386
- return errormessage
387
- end
388
- end
433
+ def downcase_hash_keys(a_hash)
434
+ downcased_hash = {}
435
+ a_hash.collect {|key,value| downcased_hash[key.downcase] = value}
436
+ return downcased_hash
437
+ end
438
+
439
+ def parse_charset(contenttype)
440
+ # get charset identifier from content type string
441
+ if contenttype=~/charset=(.*)\w?/ then
442
+ return $1.downcase.strip
443
+ end
444
+
445
+ return ""
446
+ end
447
+
448
+
449
+ def is_ambiguous_link(link_a, link_b)
450
+ return false if links_point_to_same_resource?(link_a, link_b)
451
+ return true if link_text_identical?(link_a, link_b) &&
452
+ link_title_identical?(link_a, link_b)
389
453
 
390
- return nil
454
+ false
391
455
  end
392
456
 
393
-
394
457
  def get_links
395
- linkelements = @soup.find_all("a")
396
- links = []
397
- currentlink = 0
398
-
399
- for element in linkelements
400
- title = normalize_text((element['title'] || "").strip)
401
- linktext = normalize_text((elements_to_text(element) || "").strip)
402
- url = element['href']
403
- links << [currentlink, url, title, linktext]
404
- currentlink += 1
405
- end
406
-
407
- return links
458
+ (@doc/'a')
408
459
  end
409
460
 
410
-
411
461
  def langinfo(element)
412
462
  langval = ""
413
463
 
@@ -423,39 +473,33 @@ module Raakt
423
473
  end
424
474
 
425
475
 
426
- def img_to_text(imgtag)
427
- return (imgtag['alt'] || "")
476
+ def alt_to_text(element)
477
+ if element.kind_of?(Hpricot::Elem) then
478
+ element.has_attribute?("alt") ? element['alt'] : ""
479
+ else
480
+ ""
481
+ end
428
482
  end
429
-
430
-
483
+
431
484
  def elements_to_text(element)
432
- retval = ""
433
-
434
- for el in element.contents
435
- if el.class.to_s == 'NavigableString'
436
- retval += el
437
- else
438
- if el.name == "img"
439
- retval += img_to_text(el)
440
- else
441
- retval += elements_to_text(el)
442
- end
443
- end
485
+ str = ''
486
+ element.traverse_all_element do |elem|
487
+ elem.kind_of?(Hpricot::Text) ? str += "#{elem}" : str += alt_to_text(elem)
444
488
  end
445
489
 
446
- return retval
490
+ str
447
491
  end
448
492
 
449
493
 
450
494
  def normalize_text(text)
451
- text = (text || "")
452
- retval = text.gsub(/&nbsp;/, " ")
453
- retval = retval.gsub(/&#160;/, " ")
454
- retval = retval.gsub(/\n/, "")
455
- retval = retval.gsub(/\r/, "")
456
- retval = retval.gsub(/\t/, "")
495
+ text ||= ''
496
+ retval = text.gsub(/&nbsp;/, ' ')
497
+ retval = retval.gsub(/&#160;/, ' ')
498
+ retval = retval.gsub(/\n/, '')
499
+ retval = retval.gsub(/\r/, '')
500
+ retval = retval.gsub(/\t/, '')
457
501
  while / /.match(retval) do
458
- retval = retval.gsub(/ /, " ")
502
+ retval = retval.gsub(/ /, ' ')
459
503
  end
460
504
 
461
505
  retval = retval.strip
@@ -465,12 +509,12 @@ module Raakt
465
509
 
466
510
 
467
511
  def get_labels
468
- return @soup.find_all("label")
512
+ @doc/'label'
469
513
  end
470
514
 
471
515
 
472
516
  def get_editable_fields
473
- allfields = @soup.find_all(["textarea", "select", "input"])
517
+ allfields = (@doc/"textarea|select|input")
474
518
  fields = []
475
519
  field_type = ""
476
520
 
@@ -487,9 +531,37 @@ module Raakt
487
531
 
488
532
 
489
533
  def is_frameset
490
- return (@soup.find("frameset") != nil)
534
+ (@doc/"frameset").length > 0
491
535
  end
492
536
 
537
+
538
+ def link_text_identical?(link_a, link_b)
539
+ get_link_text(link_a) == get_link_text(link_b)
540
+ end
541
+
542
+ def link_title_identical?(link_a, link_b)
543
+ get_link_title(link_a) == get_link_title(link_b)
544
+ end
545
+
546
+ def links_point_to_same_resource?(link_a, link_b)
547
+ (link_a == link_b) ||
548
+ (get_link_url(link_a) == get_link_url(link_b))
549
+ end
550
+
551
+ def get_link_text(link)
552
+ text = (elements_to_text(link) || '').strip
553
+ normalize_text(text)
554
+ end
555
+
556
+ def get_link_url(link)
557
+ link['href']
558
+ end
559
+
560
+ def get_link_title(link)
561
+ text = (link['title'] || '').strip
562
+ normalize_text(text)
563
+ end
564
+
493
565
  end
494
566
 
495
- end
567
+ end