raakt 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/lib/raakt.rb +454 -0
  2. data/tests/empty.htm +1 -0
  3. data/tests/emptytitledoc.htm +8 -0
  4. data/tests/fielddoc1.htm +2 -0
  5. data/tests/fielddoc2.htm +11 -0
  6. data/tests/fielddoc3.htm +14 -0
  7. data/tests/flickerdoc1.htm +0 -0
  8. data/tests/framedoc1.htm +22 -0
  9. data/tests/framedoc2.htm +8 -0
  10. data/tests/full_google.htm +17 -0
  11. data/tests/headingsdoc1.htm +17 -0
  12. data/tests/headingsdoc2.htm +14 -0
  13. data/tests/headingsdoc3.htm +6 -0
  14. data/tests/headingsdoc4.htm +9 -0
  15. data/tests/headingsdoc5.htm +9 -0
  16. data/tests/headingsdoc6.htm +6 -0
  17. data/tests/headingsdoc7.htm +8 -0
  18. data/tests/headingsdoc8.htm +12 -0
  19. data/tests/headingsdoc9.htm +20 -0
  20. data/tests/imagedoc1.htm +8 -0
  21. data/tests/imagedoc2.htm +1 -0
  22. data/tests/imagedoc3.htm +11 -0
  23. data/tests/imagedoc4.htm +7 -0
  24. data/tests/invalidelements1.htm +18 -0
  25. data/tests/invalidhtmldoc1.htm +10 -0
  26. data/tests/invalidhtmldoc2.htm +20 -0
  27. data/tests/invalidxhtmldoc1.htm +17 -0
  28. data/tests/linkdoc1.htm +18 -0
  29. data/tests/linkdoc2.htm +12 -0
  30. data/tests/linkdoc3.htm +16 -0
  31. data/tests/linkdoc4.htm +10 -0
  32. data/tests/metarefreshdoc1.htm +10 -0
  33. data/tests/metarefreshdoc2.htm +14 -0
  34. data/tests/metarefreshdoc3.htm +10 -0
  35. data/tests/nestedcomment.htm +7 -0
  36. data/tests/newlinetext.txt +3 -0
  37. data/tests/raakt_test.rb +224 -0
  38. data/tests/scriptdoc1.htm +15 -0
  39. data/tests/scriptdoc2.htm +10 -0
  40. data/tests/tabledoc1.htm +5 -0
  41. data/tests/tabledoc2.htm +9 -0
  42. data/tests/tabledoc3.htm +6 -0
  43. data/tests/tabledoc4.htm +17 -0
  44. data/tests/tabledoc5.htm +11 -0
  45. data/tests/tabledoc6.htm +11 -0
  46. data/tests/tablelayoutdoc.htm +16 -0
  47. data/tests/test_helper.rb +21 -0
  48. data/tests/xhtmldoc1.htm +14 -0
  49. metadata +100 -0
@@ -0,0 +1 @@
1
+ <empty>Blank document</empty>
@@ -0,0 +1,8 @@
1
+ <html lang="en">
2
+ <head>
3
+ <title></title>
4
+ </head>
5
+ <body>
6
+ <p>This is a document with an empty title element.</p>
7
+ </body>
8
+ </html>
@@ -0,0 +1,2 @@
1
+ <label for="myid">My label</label>
2
+ <input type="text" id="myid" />
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <body>
3
+ <h3>First heading</h3>
4
+ <label for="textid">My label</label>
5
+ <p id="pid">This is a minimal <a href="http://www.w3.org/TR/xhtml1/">image document.</a>
6
+ <input type="text" id="textid" />
7
+ <input type="hidden" id="hiddenid"/>
8
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">image document.</a>
9
+ <INPUT TYPE='text' ID='myid'>
10
+ </body>
11
+ </html>
@@ -0,0 +1,14 @@
1
+ <LABEL for='myid' lang='en'>My input
2
+ </LABEL>
3
+ <input
4
+ type="text"
5
+ id="myid"
6
+ >
7
+ <textarea>value</textarea>
8
+ <label for="selectid">
9
+ My select
10
+ </label>
11
+ <select lang='en'
12
+ id='selectid'>
13
+ <option>My value</opiton>
14
+ </select>
File without changes
@@ -0,0 +1,22 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
2
+ "http://www.w3.org/TR/html4/frameset.dtd">
3
+ <HTML>
4
+ <HEAD>
5
+ <TITLE>A simple frameset document</TITLE>
6
+ </HEAD>
7
+ <FRAMESET cols="20%, 80%">
8
+ <FRAMESET rows="100, 200">
9
+ <FRAME src="contents_of_frame1.html">
10
+ <FRAME src="contents_of_frame2.gif">
11
+ </FRAMESET>
12
+ <FRAME src="contents_of_frame3.html">
13
+ <NOFRAMES>
14
+ <P>This frameset document contains:
15
+ <UL>
16
+ <LI><A href="contents_of_frame1.html">Some neat contents</A>
17
+ <LI><IMG src="contents_of_frame2.gif" alt="A neat image">
18
+ <LI><A href="contents_of_frame3.html">Some other neat contents</A>
19
+ </UL>
20
+ </NOFRAMES>
21
+ </FRAMESET>
22
+ </HTML>
@@ -0,0 +1,8 @@
1
+ <frameset
2
+ cols="20%, 80%">
3
+ <FRAMESET rows="100, 200">
4
+ <FRAME src="contents_of_frame1.html" title="menu">
5
+ <FRAME src="contents_of_frame2.gif" title="content">
6
+ </FRAMESET>
7
+ </frameset>
8
+ </HTML>
@@ -0,0 +1,17 @@
1
+ <html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"><title>Google</title><style><!--
2
+ body,td,a,p,.h{font-family:arial,sans-serif;}
3
+ .h{font-size: 20px;}
4
+ .q{color:#0000cc;}
5
+ -->
6
+ </style>
7
+ <script>
8
+ <!--
9
+ function sf(){document.f.q.focus();}
10
+ function asq(event,el,oi,cad,ct,cd,sg){if(window.XMLHttpRequest){if(el.handledFirstTime){el.handledFirstTime=false;return false;}el.handledFirstTime=true;var e = window.encodeURIComponent ? encodeURIComponent : escape;var oi_param="";var cad_param="";if (oi) oi_param="&oi="+e(oi);if (cad) cad_param="&cad="+e(cad);var x=new XMLHttpRequest();x.open("GET","/url?sa=T"+oi_param+cad_param+"&ct="+e(ct)+"&cd="+e(cd)+"&url="+e(el.href).replace(/\+/g,"%2B")+"&ei="+sg,true);var m=event.altKey||event.metaKey;if(!m){x.onreadystatechange=function(){if(x.readyState==4){clearTimeout(timeoutid);el.dispatchEvent(event);}};var timeoutid=setTimeout(function(){x.abort();el.dispatchEvent(event);},2000);}x.send(null);return m;}return true;}
11
+ // -->
12
+ </script>
13
+ </head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onLoad=sf() topmargin=3 marginheight=3><center><table border=0 cellspacing=0 cellpadding=0 width=100%><tr><td align=right nowrap><font size=-1><b>...</b>&nbsp;|&nbsp;<a href="/url?sa=p&pref=ig&pval=3&q=http://www.google.com/" onmousedown="return asq(event,this,'promos','hppphou:def','pro','1','&sig2=')">Personalized Home</a>&nbsp;|&nbsp;<a href="/searchhistory/?hl=en">Search History</a>&nbsp;|&nbsp;<a href="https://www.google.com/accounts/ManageAccount">My Account</a>&nbsp;|&nbsp;<a href="http://www.google.com/accounts/Logout?continue=http://www.google.com/">Sign out</a></font></td></tr><tr height=4><td><img alt="" width=1 height=1></td></tr></table><img src="/intl/en/images/logo.gif" width=276 height=110 alt="Google"><br><br>
14
+ <form action=/search name=f><script><!--
15
+ function qs(el) {if (window.RegExp && window.encodeURIComponent) {var ue=el.href;var qe=encodeURIComponent(document.f.q.value);if(ue.indexOf("q=")!=-1){el.href=ue.replace(new RegExp("q=[^&$]*"),"q="+qe);}else{el.href=ue+"&q="+qe;}}return 1;}
16
+ // -->
17
+ </script><table border=0 cellspacing=0 cellpadding=4><tr><td nowrap><font size=-1><b>Web</b>&nbsp;&nbsp;&nbsp;&nbsp;<a id=1a class=q href="/imghp?hl=en&tab=wi" onClick="return qs(this);">Images</a>&nbsp;&nbsp;&nbsp;&nbsp;<a id=2a class=q href="http://groups.google.com/grphp?hl=en&tab=wg" onClick="return qs(this);">Groups</a>&nbsp;&nbsp;&nbsp;&nbsp;<a id=4a class=q href="http://news.google.com/nwshp?hl=en&tab=wn" onClick="return qs(this);">News</a>&nbsp;&nbsp;&nbsp;&nbsp;<a id=5a class=q href="http://froogle.google.com/frghp?hl=en&tab=wf" onClick="return qs(this);">Froogle</a>&nbsp;&nbsp;&nbsp;&nbsp;<a id=7a class=q href="/maphp?hl=en&tab=wl" onClick="return qs(this);">Maps</a>&nbsp;&nbsp;&nbsp;&nbsp;<b><a href="/intl/en/options/" class=q>more&nbsp;&raquo;</a></b></font></td></tr></table><table cellspacing=0 cellpadding=0><tr><td width=25%>&nbsp;</td><td align=center><input type=hidden name=hl value=en><input maxlength=2048 size=55 name=q value="" title="Google Search"><br><input type=submit value="Google Search" name=btnG><input type=submit value="I'm Feeling Lucky" name=btnI></td><td valign=top nowrap width=25%><font size=-2>&nbsp;&nbsp;<a href=/advanced_search?hl=en>Advanced Search</a><br>&nbsp;&nbsp;<a href=/preferences?hl=en>Preferences</a><br>&nbsp;&nbsp;<a href=/language_tools?hl=en>Language Tools</a></font></td></tr></table></form><br><br><font size=-1><a href="/intl/en/ads/">Advertising&nbsp;Programs</a> - <a href=/intl/en/services/>Business Solutions</a> - <a href=/intl/en/about.html>About Google</a> - <b><a href=http://www.google.se/>Go to Google Sverige</a></b></font><p><font size=-2>&copy;2006 Google</font></p></center></body></html>
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE html
2
+ PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
+ "DTD/xhtml1-strict.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <title>This is the title</title>
7
+ </head>
8
+ <body>
9
+ <h1>First h1 heading</h1>
10
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
11
+ document.</p>
12
+ <h1>Second h1
13
+ heading</h1>
14
+ <table><tr><td>Test table</td></tr></table>
15
+ <H1>Third h1 heading</H1>
16
+ </body>
17
+ </html>
@@ -0,0 +1,14 @@
1
+ <html>
2
+ <head>
3
+ <title>This is the title</title>
4
+ </head>
5
+ <body>
6
+ <h1>First h1 heading</h1>
7
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
8
+ document.</p>
9
+ <h2>First h2
10
+ heading</h2>
11
+ <table><tr><td>Test table</td></tr></table>
12
+ <H3>First h3 heading</H3>
13
+ </body>
14
+ </html>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <body>
3
+ <h3>First heading</h3>
4
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
5
+ </body>
6
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <body>
3
+ <h1>First h1 heading</h1>
4
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
5
+ document.</p>
6
+ <table><tr><td>Test table</td></tr></table>
7
+ <H3>First h3 heading</H3>
8
+ </body>
9
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <body>
3
+ <h2>First h1 heading</h2>
4
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
5
+ document.</p>
6
+ <table><tr><td>Test table</td></tr></table>
7
+ <H4>First h3 heading</H4>
8
+ </body>
9
+ </html>
@@ -0,0 +1,6 @@
1
+ <h1>h1 heading</h1>
2
+ <h2>h2 heading</h2>
3
+ <h1>h1 heading 2</h1>
4
+ <h2>h2 heading 2</h2>
5
+ <h3>h3 heading</h3>
6
+ <h1>h1 heading 3</h1>
@@ -0,0 +1,8 @@
1
+ <html>
2
+ <body>
3
+ <h2></h2>
4
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
5
+ document.</p>
6
+ <table><tr><td>Test table</td></tr></table>
7
+ </body>
8
+ </html>
@@ -0,0 +1,12 @@
1
+ <script type="text/javascript">
2
+ function printpopup(url){
3
+ MyWin = window.open("","","scrollbars=no,resizable=yes,toolbar=no,location=no,directories=no,status=no,menubar=no,width=100,height=100");
4
+ with(MyWin.document){
5
+ open();
6
+ write("<html>\n<head><scr"+"ipt>awidth=(document.layers)?0:8;awidth+=(document.all)?4:0;aheight=(document.layers)?0:29;</scr"+"ipt>\n<title>This is not the title</title>\n</head>\n<body onLoad=\"window.resizeTo(document.images[0].width+awidth,document.images[0].height+aheight)\"marginwidth=0 marginheight=0 leftmargin=0 topmargin=0 rightmargin=0 style=\"overflow:hidden;\">\n");
7
+ write("<h1>This is not a document heading</h1>\n");
8
+ write("</body>\n</html>\n");
9
+ close();
10
+ }
11
+ }
12
+ </script>
@@ -0,0 +1,20 @@
1
+ <html lang="en">
2
+ <head>
3
+ <title>Sample</title>
4
+ </head>
5
+ <body>
6
+ <p>This is a document with an empty title element.</p>
7
+ <script type="text/javascript">
8
+ function printpopup(url){
9
+ MyWin = window.open("","","");
10
+ with(MyWin.document){
11
+ open();
12
+ write("<html><head><title>This is not the title</title>\n</head>\n<body>\n");
13
+ write("<h1>This is not a document heading</h1>\n");
14
+ write("</body>\n</html>\n");
15
+ close();
16
+ }
17
+ }
18
+ </script>
19
+ </body>
20
+ </html>
@@ -0,0 +1,8 @@
1
+ <img
2
+ src="image.png" />
3
+ <img src="image.png" alt="">
4
+ <img src=image.png alt=mytext>
5
+ <IMG
6
+ SRC="image.png"
7
+ ALT="mytext"
8
+ />
@@ -0,0 +1 @@
1
+ <p><img src="image.png" alt=''></p>
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <body>
3
+ <h3>First heading</h3>
4
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">image document.</a>
5
+ <img src="noimagealt.png" border="0">
6
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">image document.</a>
7
+ <img src="/noimagealt2.png">
8
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">image document.</a>
9
+ <img src="../folder/noimagealt3.png" border="0">
10
+ </body>
11
+ </html>
@@ -0,0 +1,7 @@
1
+ <html>
2
+ <body>
3
+ <h3>First heading</h3>
4
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">image document.</a>
5
+ <IMG src="noimagealt.png" border="0">
6
+ </body>
7
+ </html>
@@ -0,0 +1,18 @@
1
+ <html lang="en">
2
+ <head>
3
+ <title>This is the title</title>
4
+ </head>
5
+ <body>
6
+ <h1>First h1 heading</h1>
7
+ <p>This is a <b>minimal</b> <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
8
+ document.</p>
9
+ <p>This is a <font style="bold">bold</font> <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
10
+ document.</p>
11
+ <h1>Second h1
12
+ heading</h1>
13
+ <blink>No blinking text!</blink>
14
+ <table><tr><td>Test table</td></tr></table>
15
+ <H1>Third h1 heading</H1>
16
+ <marquee>Marquees are so nineties!</marquee>
17
+ </body>
18
+ </html>
@@ -0,0 +1,10 @@
1
+ <HTML>
2
+ <head>
3
+ <TITLE>This is the title</title>
4
+ </head>
5
+ <body>
6
+ <H1>Heading</H1>
7
+ <p>S small document</p>
8
+ <!-- comment -->
9
+ </body>
10
+ </HTML>
@@ -0,0 +1,20 @@
1
+ <HTML>
2
+ <head>
3
+ <TITLE>This is the
4
+ title
5
+
6
+ </title>
7
+ <link rel="schema.DC" href="http://purl.org/DC/elements/1.0">
8
+ </head>
9
+ <body>
10
+
11
+ <!-- h3>Heading 3
12
+ </h3>
13
+ <h1>Heading 1</h1 -->
14
+ <p>S small document</p>
15
+ <p> more content </p>
16
+ <!-- second comment
17
+
18
+ -->
19
+ </body>
20
+ </HTML>
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE html
2
+ PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
3
+ "DTD/xhtml1-strict.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <?xml version="1.0" encoding="UTF-8"?>
7
+ <title>This is the title</title>
8
+ <link rel="schema.DC" href="http://purl.org/DC/elements/1.0/" />
9
+ </head>
10
+ <body>
11
+ <h1>First h1 heading</h1>
12
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
13
+ document.</p>
14
+ <h1>Second h1 heading</h1>
15
+ <table><tr><td>Test table</td></tr></table>
16
+ </body>
17
+ </html>
@@ -0,0 +1,18 @@
1
+ <html>
2
+ <head>
3
+ <title>This is a link document</title>
4
+ </head>
5
+ <body>
6
+ <h1>The first heading</h1>
7
+ <h2><a href="/news1">New sitem 1</a></h2>
8
+ <p>This is the text for the first news item. <a href="/news1">Read more </a></p>
9
+ <h2><a href="/news2">New sitem 2</a></h2>
10
+ <p>This is the text for the second news item. <a href="/news2">
11
+ Read more</a></p>
12
+ <h2><a href="/news1">New sitem 3</a></h2>
13
+ <p>This is the text for the third news item. <a href="/news3">Read
14
+ more</a></p>
15
+ <h2><a href="/news1">New sitem 4</a></h2>
16
+ <p>This is the text for the fourth news item. <a href="/news4">Read&nbsp;more</a></p>
17
+ </body>
18
+ </html>
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head>
3
+ <title>This is a link document</title>
4
+ </head>
5
+ <body>
6
+ <h1>The first heading</h1>
7
+ <h2><a href="/news1">New sitem 1</a></h2>
8
+ <p>This is the text for the first news item. <a href="/news1" title="More on item 1">Read more</a></p>
9
+ <h2><a href="/news2">New sitem 2</a></h2>
10
+ <p>This is the text for the second news item. <a href="/news2">Read more</a></p>
11
+ </body>
12
+ </html>
@@ -0,0 +1,16 @@
1
+ <html>
2
+ <head>
3
+ <title>This is a link document</title>
4
+ </head>
5
+ <body>
6
+ <h1>The first heading</h1>
7
+ <h2><a href="/news1">New sitem 1</a></h2>
8
+ <p>This is the text for the first news item.</p>
9
+ <h2><a href="/news2">New sitem 2</a></h2>
10
+ <p>This is the text for the second news item.</p>
11
+ <h2><a href="/news1">New sitem 3</a></h2>
12
+ <p>This is the text for the third news item.</p>
13
+ <h2><a href="/news1">New sitem 4</a></h2>
14
+ <p>This is the text for the fourth news item.</p>
15
+ </body>
16
+ </html>
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <head>
3
+ <title>This is a link document</title>
4
+ </head>
5
+ <body>
6
+ <h1>The first heading</h1>
7
+ <p><a href="/news1"><img src="/example.gif" alt="Read more"></a></p>
8
+ <p><a href="/news2">Read more</a></p>
9
+ </body>
10
+ </html>
@@ -0,0 +1,10 @@
1
+ <html lang="en">
2
+ <head>
3
+ <title>This is the title</title>
4
+ <meta http-equiv="refresh" content="5"/>
5
+ </head>
6
+ <body>
7
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
8
+ document.</p>
9
+ </body>
10
+ </html>
@@ -0,0 +1,14 @@
1
+ <html lang="en">
2
+ <head>
3
+ <title>This is the title</title>
4
+ <meta http-equiv="cache-control" content="no-cache" >
5
+ <meta http-equiv=
6
+ "refresh"
7
+ content=
8
+ "5" />
9
+ </head>
10
+ <body>
11
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
12
+ document.</p>
13
+ </body>
14
+ </html>
@@ -0,0 +1,10 @@
1
+ <html lang="en">
2
+ <head>
3
+ <title>This is the title</title>
4
+ <META HTTP-equiv="refresh" CONTENT="5">
5
+ </head>
6
+ <body>
7
+ <p>This is a minimal <a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a>
8
+ document.</p>
9
+ </body>
10
+ </html>
@@ -0,0 +1,7 @@
1
+ <script language="JavaScript">
2
+ <!-- function BrowserCheck() {
3
+ test = "<!--my" + "broken"
4
+ + " comment" +
5
+ "-->
6
+ -->
7
+ </script>
@@ -0,0 +1,3 @@
1
+ line 1
2
+ line 2
3
+ line 3
@@ -0,0 +1,224 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+ require File.dirname(__FILE__) + '/../lib/raakt'
3
+ require 'rubyful_soup'
4
+
5
+ class RaaktTest < Test::Unit::TestCase
6
+
7
+ def setup
8
+ @raakt = Raakt::Test.new
9
+ end
10
+
11
+ def test_all
12
+ puts @raakt.all(data_full_google)
13
+ end
14
+
15
+ def test_check_images
16
+ assert_equal 1, @raakt.check_images(data_imagedoc1).length
17
+ assert_equal "missingalt", @raakt.check_images(data_imagedoc1)[0].eid
18
+
19
+ assert_equal 0, @raakt.check_images(data_imagedoc2).length
20
+
21
+ assert_equal 3, @raakt.check_images(data_imagedoc3).length
22
+
23
+ assert_equal 1, @raakt.check_images(data_imagedoc4).length
24
+ end
25
+
26
+ def test_check_images_in_blank_doc
27
+ assert_equal 0, @raakt.check_images(data_empty).length
28
+ end
29
+
30
+
31
+ def test_check_title
32
+ assert_equal 0, @raakt.check_title(data_xhtmldoc1).length
33
+ assert_equal 1, @raakt.check_title(data_empty).length
34
+ assert_equal "missingtitle", @raakt.check_title(data_empty)[0].eid
35
+
36
+ assert_equal 1, @raakt.check_title(data_emptytitledoc).length
37
+ assert_equal "emptytitle", @raakt.check_title(data_emptytitledoc)[0].eid
38
+
39
+ assert_equal 0, @raakt.check_title(data_invalidhtmldoc1).length
40
+ assert_equal 0, @raakt.check_title(data_invalidhtmldoc2).length
41
+ end
42
+
43
+
44
+ def test_headings
45
+ assert_equal 3, @raakt.headings(data_headingsdoc1).length
46
+ assert_equal 0, @raakt.headings(data_invalidhtmldoc2).length
47
+ end
48
+
49
+ def test_level
50
+ assert_equal 1, @raakt.level("h1")
51
+ assert_equal 2, @raakt.level("h2")
52
+ assert_equal 6, @raakt.level("h6")
53
+ end
54
+
55
+ def test_check_has_heading
56
+ assert_equal 1, @raakt.check_has_heading(data_empty).length
57
+ assert_equal "missingheading", @raakt.check_has_heading(data_empty)[0].eid
58
+ assert_equal 0, @raakt.check_has_heading(data_headingsdoc1).length
59
+ assert_equal 0, @raakt.check_has_heading(data_headingsdoc9).length
60
+
61
+ assert_equal 1, @raakt.check_has_heading(data_invalidhtmldoc2).length
62
+ assert_equal "missingheading", @raakt.check_has_heading(data_invalidhtmldoc2)[0].eid
63
+ end
64
+
65
+ def test_check_document_structure
66
+ assert_equal 0, @raakt.check_document_structure(data_headingsdoc1).length
67
+ assert_equal 1, @raakt.check_document_structure(data_headingsdoc3).length
68
+ assert_equal "firsthnoth1", @raakt.check_document_structure(data_headingsdoc3)[0].eid
69
+ assert_equal "wronghstructure", @raakt.check_document_structure(data_headingsdoc4)[0].eid
70
+ assert_equal "firsthnoth1", @raakt.check_document_structure(data_headingsdoc5)[0].eid
71
+ assert_equal "wronghstructure", @raakt.check_document_structure(data_headingsdoc5)[1].eid
72
+ assert_equal 0, @raakt.check_document_structure(data_headingsdoc6).length
73
+ assert_equal 0, @raakt.check_document_structure("").length
74
+ end
75
+
76
+
77
+ def test_check_for_nested_tables
78
+ assert_equal 0, @raakt.check_for_nested_tables(data_tabledoc1).length
79
+ assert_equal 0, @raakt.check_for_nested_tables(data_tabledoc2).length
80
+ assert_equal 1, @raakt.check_for_nested_tables(data_tabledoc3).length
81
+ assert_equal 0, @raakt.check_for_nested_tables(data_tabledoc4).length
82
+ assert_equal 1, @raakt.check_for_nested_tables(data_tabledoc5).length
83
+ assert_equal "hasnestedtables", @raakt.check_for_nested_tables(data_tabledoc3)[0].eid
84
+ end
85
+
86
+ def test_check_tables
87
+ puts @raakt.check_tables(data_tabledoc4).to_s
88
+ assert_equal 0, @raakt.check_tables(data_tabledoc4).length
89
+ assert_equal 0, @raakt.check_tables(data_tabledoc1).length
90
+ assert_equal 2, @raakt.check_tables(data_tabledoc2).length
91
+ end
92
+
93
+
94
+ def test_check_for_formatting_elements
95
+ assert_equal 1, @raakt.check_for_formatting_elements(data_invalidelements1).length
96
+ assert_equal "boldused", @raakt.check_for_formatting_elements(data_invalidelements1)[0].eid
97
+ end
98
+
99
+
100
+ def test_check_for_language_info
101
+ assert_equal 0, @raakt.check_for_language_info(data_xhtmldoc1).length
102
+ assert_equal 1, @raakt.check_for_language_info(data_tabledoc2).length
103
+ assert_equal 1, @raakt.check_for_language_info(data_tablelayoutdoc).length
104
+ end
105
+
106
+
107
+ def test_check_link_text
108
+ assert_equal 1, @raakt.check_link_text(data_linkdoc1).length
109
+ assert_equal "ambiguouslinktext", @raakt.check_link_text(data_linkdoc1)[0].eid
110
+ assert_equal 0, @raakt.check_link_text(data_linkdoc3).length
111
+ assert_equal 0, @raakt.check_link_text(data_linkdoc2).length
112
+ assert_equal 1, @raakt.check_link_text(data_linkdoc4).length
113
+ end
114
+
115
+
116
+ def test_get_links
117
+ assert_equal 8, @raakt.get_links(data_linkdoc1).length
118
+ assert_equal 2, @raakt.get_links(data_linkdoc4).length
119
+ assert_equal "Read more", @raakt.get_links(data_linkdoc4)[0][3]
120
+ end
121
+
122
+ def test_img_to_text
123
+ assert_equal "Read more", @raakt.img_to_text(BeautifulSoup.new("<img src='123' alt='Read more' />").img)
124
+ end
125
+
126
+ def test_elements_to_text
127
+ assert_equal "Read more about", @raakt.elements_to_text(BeautifulSoup.new("<a href='rrr'>Read <img src='123' alt='more' /> about</a>").a)
128
+ assert_equal "A sample text here", @raakt.elements_to_text(BeautifulSoup.new("<a href='r'><strong>A</strong> sample <img src='123' alt='text' /> <b>here</b></a>").a)
129
+ end
130
+
131
+ def test_normalize_text
132
+ assert_equal "Read more", @raakt.normalize_text("Read&nbsp;more")
133
+ assert_equal "Read more", @raakt.normalize_text("Read&#160;more")
134
+ assert_equal "Read more", @raakt.normalize_text("Read more")
135
+ assert_equal "Read more", @raakt.normalize_text("Read more")
136
+ assert_equal "Read more", @raakt.normalize_text("Read more")
137
+ assert_equal "Read more", @raakt.normalize_text("Read\n more")
138
+ assert_equal "Läs mer", @raakt.normalize_text("Läs\n mer")
139
+ assert_equal "Läs mer", @raakt.normalize_text("Läs \nmer")
140
+ assert_equal "Read more", @raakt.normalize_text("Read \n\n\nmore")
141
+ assert_equal "Read more", @raakt.normalize_text("Read \tmore")
142
+ assert_equal "Read more", @raakt.normalize_text(" Read more")
143
+ end
144
+
145
+ def test_is_ambiguous_link
146
+ link_a = [1, "/news/1", "", "Read more"]
147
+ link_b = [2, "/news/2", "", "Read more"]
148
+ assert_equal true, @raakt.is_ambiguous_link(link_a, link_b)
149
+
150
+ link_c = [1, "/news/1", "More about first news item", "Read more"]
151
+ link_d = [2, "/news/2", "More about second news item", "Read more"]
152
+ assert_equal false, @raakt.is_ambiguous_link(link_c, link_d)
153
+
154
+ link_a = [1, "/news/1", nil, "Read more"]
155
+ link_b = [2, "/news/2", nil, "Read more"]
156
+ assert_equal true, @raakt.is_ambiguous_link(link_a, link_b)
157
+
158
+ link_g = [1, "/news/1", "", "Read more"]
159
+ link_h = [2, "/news/1", "", "Read more"]
160
+ assert_equal false, @raakt.is_ambiguous_link(link_g, link_h)
161
+
162
+ link_i = [1, "/news/1", "", "Läs mer"]
163
+ link_j = [2, "/news/2", "", "Läs\n mer"]
164
+ assert_equal true, @raakt.is_ambiguous_link(link_i, link_j)
165
+
166
+ link_k = [1, "/news/1", "", "Läs mer"]
167
+ link_l = [2, "/news/2", "", "Läs \nmer"]
168
+ assert_equal true, @raakt.is_ambiguous_link(link_k, link_l)
169
+ end
170
+
171
+
172
+ def test_get_labels
173
+ assert_equal 1, @raakt.get_labels(data_fielddoc1).length
174
+ assert_equal 1, @raakt.get_labels(data_fielddoc2).length
175
+ assert_equal 2, @raakt.get_labels(data_fielddoc3).length
176
+ end
177
+
178
+
179
+ def test_get_editable_fields
180
+ assert_equal 1, @raakt.get_editable_fields(data_fielddoc1).length
181
+ assert_equal 2, @raakt.get_editable_fields(data_fielddoc2).length
182
+ assert_equal 3, @raakt.get_editable_fields(data_fielddoc3).length
183
+ end
184
+
185
+
186
+ def test_check_form
187
+ assert_equal 0, @raakt.check_form(data_fielddoc1).length
188
+ assert_equal 1, @raakt.check_form(data_fielddoc2).length
189
+ assert_equal "fieldmissinglabel", @raakt.check_form(data_fielddoc2)[0].eid
190
+ assert_equal 1, @raakt.check_form(data_fielddoc3).length
191
+ assert_equal "fieldmissinglabel", @raakt.check_form(data_fielddoc3)[0].eid
192
+ end
193
+
194
+
195
+ def test_is_frameset
196
+ assert @raakt.is_frameset(data_framedoc1)
197
+ assert @raakt.is_frameset(data_framedoc2)
198
+ assert !@raakt.is_frameset(data_xhtmldoc1)
199
+ end
200
+
201
+
202
+ def test_check_frames
203
+ assert_equal 3, @raakt.check_frames(data_framedoc1).length
204
+ assert_equal 0, @raakt.check_frames(data_framedoc2).length
205
+ end
206
+
207
+
208
+ def test_check_for_formatting_elements
209
+ invaliderrs = @raakt.check_for_formatting_elements(data_invalidelements1)
210
+ assert_equal 2, invaliderrs.length
211
+ assert_equal "missingsemantics", invaliderrs[0].eid
212
+ assert_equal "hasflicker", invaliderrs[1].eid
213
+ assert_equal 0, @raakt.check_for_formatting_elements(data_xhtmldoc1).length
214
+ end
215
+
216
+
217
+ def test_refresh
218
+ assert_equal 1, @raakt.check_refresh(data_metarefreshdoc1).length
219
+ assert_equal 1, @raakt.check_refresh(data_metarefreshdoc2).length
220
+ assert_equal 1, @raakt.check_refresh(data_metarefreshdoc3).length
221
+ assert_equal 0, @raakt.check_refresh(data_xhtmldoc1).length
222
+ end
223
+
224
+ end