feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166)
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/treebuilders'
4
+ require 'html5/html5parser'
5
+
6
+
7
# Names of the tree builders the parser tests are generated for.
$tree_types_to_test = %w[simpletree rexml]

begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # hpricot could not be loaded; only the builders listed above are tested.
end

# Truthy when '-p' was given on the command line (the flag is removed from ARGV).
$CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO

puts "Testing tree builders: #{$tree_types_to_test.join(', ')}"
18
+
19
+
20
# Tree-construction tests, generated while the class body is evaluated.
#
# For every case in every 'tree-construction' fixture file, one test method is
# defined per entry in $tree_types_to_test. Each test parses the fixture input
# (with parse_fragment when the fixture has a document-fragment section,
# otherwise with parse), compares the dumped tree with the fixture's expected
# dump, and then compares the number of parse errors with the fixture's list.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      errors = errors.split("\n")
      # Remove the "| " that follows each newline, then drop the first two
      # characters of the dump (presumably the first line's own "| " prefix).
      expected = expected.gsub("\n| ", "\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [test_name, index + 1, tree_name] do
          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])

          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

          assert_equal sortattrs(expected), sortattrs(actual_output), [
            '', 'Input:', input,
            '', 'Expected:', expected,
            '', 'Received:', actual_output
          ].join("\n")

          actual_errors = parser.errors.map do |(line, col), message, datavars|
            'Line: %i Col: %i %s' % [line, col, E[message] % datavars]
          end
          # Only the error counts are compared; the rendered messages are part
          # of the failure output to help diagnose a mismatch.
          assert_equal errors.length, parser.errors.length, [
            '', 'Input:', input,
            '', "Expected errors (#{errors.length}):", errors.join("\n"),
            '', "Actual errors (#{actual_errors.length}):",
            actual_errors.join("\n")
          ].join("\n")
        end
      end
    end
  end
end
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), 'preamble')
4
+
5
+ require 'html5/html5parser'
6
+ require 'html5/liberalxmlparser'
7
+ require 'html5/treewalkers'
8
+ require 'html5/serializer'
9
+ require 'html5/sanitizer'
10
+
11
# Sanitizer tests.
#
# Every input is pushed through three paths and each result is compared with
# its own expected string: HTMLParser with the HTMLSanitizer tokenizer,
# XHTMLParser with the HTMLSanitizer tokenizer, and REXML followed by
# XHTMLSerializer with :sanitize enabled. Most test methods are generated from
# the sanitizer's ALLOWED_* lists and from the JSON fixtures in the 'sanitizer'
# test-data directory.
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Options handed to both parsers. A fresh hash is built on every call so the
  # two parser paths never share an options object.
  def sanitizing_parser_options
    {
      :tokenizer => HTMLSanitizer,
      :encoding => 'utf-8',
      :lowercase_element_name => false,
      :lowercase_attr_name => false
    }
  end
  private :sanitizing_parser_options

  # Sanitizes +stream+ as an XHTML fragment and returns the result as a String.
  def sanitize_xhtml(stream)
    XHTMLParser.parse_fragment(stream, sanitizing_parser_options).to_s
  end

  # Sanitizes +stream+ as an HTML fragment and returns the result as a String.
  def sanitize_html(stream)
    HTMLParser.parse_fragment(stream, sanitizing_parser_options).to_s
  end

  # Wraps +stream+ in an XHTML <div>, parses it with REXML, serializes it with
  # sanitizing enabled, and strips the wrapper <div> again.
  # Returns "Ill-formed XHTML!" when REXML rejects the wrapped input.
  def sanitize_rexml(stream)
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :quote_char => "'",
      :inject_meta_charset => false,
      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    return "Ill-formed XHTML!"
  end

  # Asserts the expected output of each of the three sanitizing paths for +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # Allowed elements survive; the unknown <bad> child is escaped. Several
  # elements have path-specific expectations, listed branch by branch below.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        # += builds a new string, so xhtmloutput keeps the value without '<br/>'.
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # The upper-cased spelling of an allowed element is expected to be escaped
  # entirely (the parsers are configured not to lowercase element names).
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
      check_sanitization(input, output, output, output)
    end
  end

  # Allowed attributes are kept; 'style' is skipped here.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # The upper-cased spelling of an allowed attribute is expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  def test_should_handle_astral_plane_characters
    input = "<p>&#x1d4b5; &#x1d538;</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  # def test_javascript_includes
  #   input = %(<div size="&{alert('XSS')}">foo</div>)
  #   output = "<div>foo</div>"
  #   check_sanitization(input, output, output, output)
  # end

  # Fixture-driven tests: 'xhtml' and 'rexml' fall back to 'output' when a
  # fixture does not give a path-specific expectation.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the fixture file; open(filename).read left it open.
    JSON.parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/serializer'
5
+ require 'html5/treewalkers'
6
+
7
+ # Toggle for running the serializer error checks; nothing else in this test file reads it.
8
+ checkSerializeErrors = false
9
+
10
# Tree walker whose @tree is a list of array-shaped token entries, where
# entry[0] names the token type and the following slots carry its arguments.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields the walker token(s) built for each entry of @tree, in order.
  # Raises on an entry whose type name is not one of those handled below.
  def each
    @tree.each do |entry|
      kind = entry[0]
      case kind
      when 'Characters', 'SpaceCharacters'
        # text may produce several tokens, so it reports them through a block.
        text(entry[1]) { |chunk| yield chunk }
      when 'StartTag' then yield start_tag(entry[1], entry[2])
      when 'EndTag' then yield end_tag(entry[1])
      when 'EmptyTag' then yield empty_tag(entry[1], entry[2])
      when 'Comment' then yield comment(entry[1])
      when 'Doctype' then yield doctype(entry[1], entry[2], entry[3])
      else
        raise "Unknown token type: " + kind
      end
    end
  end
end
32
+
33
# Serializer tests, generated from the JSON fixtures in the 'serializer'
# test-data directory: one test method per entry of each file's 'tests' list.
#
# Each test serializes the fixture's token list with HTMLSerializer and, except
# for the 'optionaltags' fixture file, with XHTMLSerializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase
  # Checks +result+ against the fixture's list of acceptable serializations.
  # A single expectation is compared with assert_equal (using +description+ as
  # the message); with several, +result+ only has to be one of them.
  def assert_serialized(expected, result, description)
    if expected.length == 1
      assert_equal(expected[0], result, description)
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end
  private :assert_serialized

  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the fixture file; open(filename).read left it open.
    tests = JSON.parse(File.read(filename))

    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index + 1}" do
        # Mirror the JSON string key under the :encoding symbol key.
        if test["options"] and test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        result = HTML5::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialized(test["expected"], result, test["description"])

        # The XHTML serialization is not checked for the 'optionaltags' file.
        unless test_name == 'optionaltags'
          result = HTML5::XHTMLSerializer.
            serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
          assert_serialized(test["xhtml"] || test["expected"], result, test["description"])
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+ require "html5/sniffer"
3
+
4
# Sniffer tests, generated from the JSON fixtures in the 'sniffer' test-data
# directory: for every fixture entry, html_or_feed(entry['input']) must equal
# entry['type'].
class TestFeedTypeSniffer < Test::Unit::TestCase
  include HTML5
  include TestSupport
  include Sniffer

  html5_test_files('sniffer').each do |test_file|
    # Strip whatever extension the fixture has, so a file such as
    # htmlOrFeed.json does not leave '.json' inside the generated method names.
    test_name = File.basename(test_file, File.extname(test_file))

    tests = JSON.parse(File.read(test_file))

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) do
        assert_equal data['type'], html_or_feed(data['input'])
      end
    end
  end

  # Earlier, disabled form of the generated tests, kept for reference:
  # each_with_index do |t, i|
  #   define_method "test_#{i}" do
  #     assert_equal t[0], sniff_feed_type(t[1])
  #   end
  # end
end
@@ -0,0 +1,62 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/inputstream'
4
+
5
# Tests for HTMLInputStream: the reported character encoding, the characters
# handed back by #char, BOM handling, and newline/position tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is expected to come back as the bytes EF BF BD
  # (UTF-8 for U+FFFD, the replacement character).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # With no encoding given, these bytes are expected to be detected as
  # windows-1252 and returned one character at a time as UTF-8 byte sequences.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A leading UTF-8 BOM selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      # Was assert(stream.char_encoding, 'utf-16-le'), which uses the second
      # argument as the failure message and passes for any truthy encoding.
      assert_equal('utf-16-le', stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ',true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # "\r\n" and a lone "\r" are both expected to read back as "\n", with
  # [line, column] positions and recorded line lengths updated to match.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
@@ -0,0 +1,94 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/tokenizer'
4
+
5
+ require 'tokenizer_test_parser'
6
+
7
# Tokenizer tests, generated from the JSON fixtures in the 'tokenizer'
# test-data directory: one test method per entry of each file's 'tests' list.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # Asserts that the received tokens match the expected ones.
  #
  # When +ignoreErrorOrder+ is falsy the two lists must be equal as they are.
  # Otherwise each list is split into [non-ParseError tokens, ParseError
  # tokens] and the pairs are compared, so only the position of the
  # "ParseError" entries relative to the other tokens is ignored.
  #
  # The strict branch used to return the comparison result without asserting
  # anything, so such tests could never fail.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    if ignoreErrorOrder
      expected = expectedTokens.partition { |token| token != "ParseError" }
      received = receivedTokens.partition { |token| token != "ParseError" }
      assert_equal expected, received, message
    else
      assert_equal expectedTokens, receivedTokens, message
    end
  end

  # True when +token+ is not the bare "ParseError" marker and its first slot
  # equals +token_name+.
  def type_of?(token_name, token)
    token != 'ParseError' && token_name == token.first
  end

  # Replaces the attribute list in slot 2 of every StartTag token with a Hash.
  # The tokens are modified in place; a new array holding them is returned.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.map do |token|
      # Hash[] keeps the last value given for a key, so the list is reversed
      # first - presumably so the first occurrence of a repeated attribute wins.
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      token
    end
  end

  # Returns a new token list in which each run of adjacent Character tokens is
  # merged into a single Character token.
  #
  # Tokens are copied before being collected, so the caller's tokens are left
  # untouched. The fixture's expected output is passed through here once per
  # content model flag; merging in place would append the same characters
  # again on every later pass.
  def concatenate_consecutive_characters(tokens)
    tokens.each_with_object([]) do |token, merged|
      if type_of?('Character', token) and merged.any? and type_of?('Character', merged.last)
        merged.last[1] = merged.last[1] + token[1]
      else
        merged << token.dup
      end
    end
  end

  # Runs one fixture entry once per content model flag it lists (PCDATA when it
  # lists none) and compares the produced tokens with the fixture's 'output'.
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'])

        tokenizer.content_model_flag = content_model_flag.to_sym

        # Seed the tokenizer with the fixture's last start tag when it has one.
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
94
+