feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/treebuilders'
4
+ require 'html5/html5parser'
5
+
6
+
7
# Tree builders exercised by the parser tests. 'hpricot' is added only when
# the library can be loaded.
$tree_types_to_test = %w(simpletree rexml)

begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # hpricot is unavailable: keep the two default tree builders only.
end

# Holds '-p' when that switch was given on the command line (it is removed
# from ARGV in the process), nil otherwise.
$CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO

puts 'Testing tree builders: ' + $tree_types_to_test.join(', ')
18
+
19
+
20
# Tree-construction tests for the HTML5 parser.
#
# For every test case in every 'tree-construction' data file, and for every
# tree builder named in $tree_types_to_test, a test method is generated that
# parses the input and compares the serialized tree and the number of parse
# errors with the expectations recorded in the data file.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      errors = errors.split("\n")
      # Remove the "| " that follows each newline, then drop the first two
      # characters of the dump (presumably the "| " of the first line).
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do

          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])

          # A 'document-fragment' section switches the test to fragment parsing.
          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

          assert_equal sortattrs(expected), sortattrs(actual_output), [
            '', 'Input:', input,
            '', 'Expected:', expected,
            '', 'Received:', actual_output
          ].join("\n")

          # Only the number of errors is compared; the formatted messages are
          # built solely for the failure report.
          actual_errors = parser.errors.map do |(line, col), message, datavars|
            'Line: %i Col: %i %s' % [line, col, E[message] % datavars]
          end
          assert_equal errors.length, parser.errors.length, [
            '', 'Input', input,
            '', "Expected errors (#{errors.length}):", errors.join("\n"),
            '', "Actual errors (#{actual_errors.length}):",
            actual_errors.join("\n")
          ].join("\n")

        end
      end
    end
  end

end
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), 'preamble')
4
+
5
+ require 'html5/html5parser'
6
+ require 'html5/liberalxmlparser'
7
+ require 'html5/treewalkers'
8
+ require 'html5/serializer'
9
+ require 'html5/sanitizer'
10
+
11
# Sanitizer tests. Every input is run through three pipelines (the HTML
# parser, the XHTML parser and a REXML tree walked by the XHTML serializer)
# and each result is compared with its own expected output.
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Sanitizes +stream+ by parsing it as an XHTML fragment with HTMLSanitizer
  # as the tokenizer; returns the result as a string.
  def sanitize_xhtml stream
    XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Sanitizes +stream+ by parsing it as an HTML fragment with HTMLSanitizer
  # as the tokenizer; returns the result as a string.
  def sanitize_html stream
    HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Sanitizes +stream+ by wrapping it in an XHTML <div>, parsing it with
  # REXML and serializing the tree with :sanitize enabled. The wrapper <div>
  # is removed from the result again. Returns "Ill-formed XHTML!" when REXML
  # cannot parse the wrapped input.
  def sanitize_rexml stream
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :quote_char => "'",
      :inject_meta_charset => false,
      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    return "Ill-formed XHTML!"
  end

  # Asserts the expected output of all three sanitizing pipelines for +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # Allowed elements pass through while the unknown <bad> element is escaped.
  # The branches below adjust the expectations for elements the parsers
  # treat specially (table parts, col, table, image, void elements).
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput  = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # Upper-cased element names are expected to be escaped entirely.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
      check_sanitization(input, output, output, output)
    end
  end

  # Allowed attributes are kept; 'style' is skipped here.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # Upper-cased attribute names are expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output =  "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # The octal escapes below are the UTF-8 bytes of the characters referenced
  # by the numeric entities in the first input.
  def test_should_handle_astral_plane_characters
    input = "<p>&#x1d4b5; &#x1d538;</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

# This affects only NS4. Is it worth fixing?
#  def test_javascript_includes
#    input = %(<div size="&{alert('XSS')}">foo</div>)
#    output = "<div>foo</div>"
#    check_sanitization(input, output, output, output)
#  end

  # Data-driven tests: each JSON entry supplies 'input' and 'output', plus
  # optional 'xhtml' / 'rexml' overrides for the other two pipelines.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the file again, unlike open(filename).read.
    JSON::parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/serializer'
5
+ require 'html5/treewalkers'
6
+
7
+ # Flag for running the serialize error checks. NOTE(review): no code visible in this file reads it; confirm it is still needed.
8
+ checkSerializeErrors = false
9
+
10
# Tree walker over a list of token arrays (as stored in the serializer test
# files). Each entry's first element names the token type; the remaining
# elements are handed to the matching token-builder method of the base class.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields one serializer token per entry of @tree. Character data goes
  # through #text, which may yield more than one token.
  # Raises a RuntimeError for an unrecognised token type.
  def each
    @tree.each do |entry|
      kind = entry[0]
      case kind
      when 'StartTag' then yield start_tag(entry[1], entry[2])
      when 'EndTag'   then yield end_tag(entry[1])
      when 'EmptyTag' then yield empty_tag(entry[1], entry[2])
      when 'Comment'  then yield comment(entry[1])
      when 'Characters', 'SpaceCharacters'
        text(entry[1]) { |text_token| yield text_token }
      when 'Doctype'  then yield doctype(entry[1], entry[2], entry[3])
      else
        raise "Unknown token type: " + kind
      end
    end
  end
end
32
+
33
# Serializer tests driven by the JSON files in the 'serializer' test-data
# directory. One test method is generated per entry of each file's 'tests'
# list; it checks the HTML serializer and, except for the 'optionaltags'
# file, the XHTML serializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase
  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file again, unlike open(filename).read.
    tests = JSON::parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|

      define_method "test_#{test_name}_#{index+1}" do
        # Mirror the JSON string key "encoding" under the :encoding symbol key.
        if test["options"] and test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        result = HTML5::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]
        # 'expected' is a list of acceptable outputs; with exactly one entry
        # assert_equal is used so a failure shows the difference.
        if expected.length == 1
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end

        # The 'optionaltags' tests are not run against the XHTML serializer.
        return if test_name == 'optionaltags'

        result = HTML5::XHTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        # An 'xhtml' entry overrides the expected outputs for XHTML.
        expected = test["xhtml"] || test["expected"]
        if expected.length == 1
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end

      end

    end
  end
end
@@ -0,0 +1,27 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+ require "html5/sniffer"
3
+
4
# Sniffer tests driven by the JSON files in the 'sniffer' test-data
# directory. Each entry supplies an 'input' and the 'type' that
# html_or_feed is expected to report for it.
class TestFeedTypeSniffer < Test::Unit::TestCase
  include HTML5
  include TestSupport
  include Sniffer

  html5_test_files('sniffer').each do |test_file|
    # Strip whatever extension the data file has, so the extension does not
    # end up inside the generated test method names.
    test_name = File.basename(test_file, '.*')

    tests = JSON.parse(File.read(test_file))

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) do
        assert_equal data['type'], html_or_feed(data['input'])
      end
    end
  end
  # each_with_index do |t, i|
  #   define_method "test_#{i}" do
  #     assert_equal t[0], sniff_feed_type(t[1])
  #   end
  # end


end
@@ -0,0 +1,62 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/inputstream'
4
+
5
# Tests for HTMLInputStream: reported character encoding, character-by-
# character reading, byte-order-mark handling and newline/position tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is expected to come back as "\xef\xbf\xbd", the UTF-8
  # encoding of U+FFFD.
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Without an explicit encoding these bytes are expected to be reported as
  # windows-1252 and returned re-encoded as UTF-8.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A leading UTF-8 byte order mark selects utf-8 and is not returned as a
  # character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      # assert_equal actually compares the encoding; assert(value, message)
      # would only check that the value is truthy.
      assert_equal('utf-16-le', stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ',true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # "\r\n" and "\r" are expected to be read back as "\n"; positions are
  # [line, column] pairs.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
@@ -0,0 +1,94 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/tokenizer'
4
+
5
+ require 'tokenizer_test_parser'
6
+
7
# Tokenizer tests driven by the JSON files in the 'tokenizer' test-data
# directory; one test method is generated per entry of each file's 'tests'
# list.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # Asserts that the received token stream matches the expected one.
  #
  # When ignoreErrorOrder is false or nil the two streams must be equal as
  # they are. Otherwise the "ParseError" markers are separated from the
  # other tokens on both sides before comparing, so their number has to
  # agree but not their position in the stream.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    if !ignoreErrorOrder
      # Assert instead of returning the comparison result: the caller does
      # not look at the return value, so a mismatch would go unnoticed.
      assert_equal expectedTokens, receivedTokens, message
    else
      # partition returns [non-parse-error tokens, parse-error tokens].
      expected = expectedTokens.partition { |token| token != "ParseError" }
      received = receivedTokens.partition { |token| token != "ParseError" }
      assert_equal expected, received, message
    end
  end

  # True when +token+ is not the "ParseError" marker and its first element
  # equals +token_name+.
  def type_of?(token_name, token)
    token != 'ParseError' && token_name == token.first
  end

  # Replaces the attribute list (third element) of every StartTag token with
  # a Hash, modifying the tokens in place. The list is reversed before the
  # Hash is built, so when a key occurs twice the earlier pair wins.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.inject([]) do |converted, token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      converted << token
    end
  end

  # Merges each run of adjacent Character tokens into a single token by
  # appending the text to the first token of the run (which is modified).
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      if type_of?('Character', token) and merged.any? and type_of?('Character', merged.last)
        merged.last[1] = merged.last[1] + token[1]
        next merged
      end
      merged << token
    end
  end

  # Runs one test-data entry once per content model flag it lists (:PCDATA
  # when it lists none) and compares the produced tokens with the entry's
  # 'output'.
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'])

        tokenizer.content_model_flag = content_model_flag.to_sym

        # Seed the tokenizer with a start tag when the entry names one.
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
94
+