html5 0.1.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. data/History.txt +9 -2
  2. data/Manifest.txt +61 -2
  3. data/README +41 -5
  4. data/Rakefile.rb +22 -6
  5. data/{parse.rb → bin/html5} +11 -11
  6. data/lib/core_ext/string.rb +17 -0
  7. data/lib/html5/constants.rb +228 -0
  8. data/lib/html5/filters/iso639codes.rb +752 -0
  9. data/lib/html5/filters/rfc2046.rb +30 -0
  10. data/lib/html5/filters/rfc3987.rb +89 -0
  11. data/lib/html5/filters/validator.rb +830 -0
  12. data/lib/html5/html5parser.rb +25 -25
  13. data/lib/html5/html5parser/after_body_phase.rb +3 -3
  14. data/lib/html5/html5parser/after_frameset_phase.rb +3 -4
  15. data/lib/html5/html5parser/after_head_phase.rb +6 -6
  16. data/lib/html5/html5parser/before_head_phase.rb +1 -1
  17. data/lib/html5/html5parser/in_body_phase.rb +54 -48
  18. data/lib/html5/html5parser/in_caption_phase.rb +7 -6
  19. data/lib/html5/html5parser/in_cell_phase.rb +3 -3
  20. data/lib/html5/html5parser/in_column_group_phase.rb +1 -1
  21. data/lib/html5/html5parser/in_frameset_phase.rb +5 -5
  22. data/lib/html5/html5parser/in_head_phase.rb +10 -10
  23. data/lib/html5/html5parser/in_row_phase.rb +4 -2
  24. data/lib/html5/html5parser/in_select_phase.rb +7 -6
  25. data/lib/html5/html5parser/in_table_body_phase.rb +8 -5
  26. data/lib/html5/html5parser/in_table_phase.rb +12 -7
  27. data/lib/html5/html5parser/initial_phase.rb +5 -6
  28. data/lib/html5/html5parser/phase.rb +5 -9
  29. data/lib/html5/html5parser/root_element_phase.rb +1 -2
  30. data/lib/html5/html5parser/trailing_end_phase.rb +3 -3
  31. data/lib/html5/inputstream.rb +25 -31
  32. data/lib/html5/liberalxmlparser.rb +2 -2
  33. data/lib/html5/sanitizer.rb +6 -6
  34. data/lib/html5/serializer/htmlserializer.rb +2 -3
  35. data/lib/html5/sniffer.rb +45 -0
  36. data/lib/html5/tokenizer.rb +57 -59
  37. data/lib/html5/treebuilders/rexml.rb +7 -6
  38. data/lib/html5/treebuilders/simpletree.rb +1 -1
  39. data/lib/html5/treewalkers/base.rb +8 -0
  40. data/lib/html5/version.rb +3 -0
  41. data/testdata/encoding/chardet/test_big5.txt +51 -0
  42. data/testdata/encoding/test-yahoo-jp.dat +10 -0
  43. data/testdata/encoding/tests1.dat +394 -0
  44. data/testdata/encoding/tests2.dat +81 -0
  45. data/testdata/sanitizer/tests1.dat +416 -0
  46. data/testdata/serializer/core.test +104 -0
  47. data/testdata/serializer/injectmeta.test +65 -0
  48. data/testdata/serializer/optionaltags.test +900 -0
  49. data/testdata/serializer/options.test +60 -0
  50. data/testdata/serializer/whitespace.test +51 -0
  51. data/testdata/sites/google-results.htm +1 -0
  52. data/testdata/sites/python-ref-import.htm +1 -0
  53. data/testdata/sites/web-apps-old.htm +1 -0
  54. data/testdata/sites/web-apps.htm +34275 -0
  55. data/testdata/sniffer/htmlOrFeed.json +43 -0
  56. data/testdata/tokenizer/contentModelFlags.test +48 -0
  57. data/testdata/tokenizer/entities.test +2339 -0
  58. data/testdata/tokenizer/escapeFlag.test +21 -0
  59. data/testdata/tokenizer/test1.test +172 -0
  60. data/testdata/tokenizer/test2.test +129 -0
  61. data/testdata/tokenizer/test3.test +367 -0
  62. data/testdata/tokenizer/test4.test +198 -0
  63. data/testdata/tree-construction/tests1.dat +1950 -0
  64. data/testdata/tree-construction/tests2.dat +773 -0
  65. data/testdata/tree-construction/tests3.dat +270 -0
  66. data/testdata/tree-construction/tests4.dat +60 -0
  67. data/testdata/tree-construction/tests5.dat +175 -0
  68. data/testdata/tree-construction/tests6.dat +196 -0
  69. data/testdata/validator/attributes.test +1035 -0
  70. data/testdata/validator/base-href-attribute.test +787 -0
  71. data/testdata/validator/base-target-attribute.test +35 -0
  72. data/testdata/validator/blockquote-cite-attribute.test +7 -0
  73. data/testdata/validator/classattribute.test +152 -0
  74. data/testdata/validator/contenteditableattribute.test +59 -0
  75. data/testdata/validator/contextmenuattribute.test +115 -0
  76. data/testdata/validator/dirattribute.test +59 -0
  77. data/testdata/validator/draggableattribute.test +63 -0
  78. data/testdata/validator/html-xmlns-attribute.test +23 -0
  79. data/testdata/validator/idattribute.test +115 -0
  80. data/testdata/validator/inputattributes.test +2795 -0
  81. data/testdata/validator/irrelevantattribute.test +63 -0
  82. data/testdata/validator/langattribute.test +5579 -0
  83. data/testdata/validator/li-value-attribute.test +7 -0
  84. data/testdata/validator/link-href-attribute.test +7 -0
  85. data/testdata/validator/link-hreflang-attribute.test +7 -0
  86. data/testdata/validator/link-rel-attribute.test +271 -0
  87. data/testdata/validator/ol-start-attribute.test +7 -0
  88. data/testdata/validator/starttags.test +375 -0
  89. data/testdata/validator/style-scoped-attribute.test +7 -0
  90. data/testdata/validator/tabindexattribute.test +79 -0
  91. data/tests/preamble.rb +7 -17
  92. data/tests/test_encoding.rb +1 -1
  93. data/tests/test_lxp.rb +16 -0
  94. data/tests/test_parser.rb +2 -2
  95. data/tests/test_sniffer.rb +27 -0
  96. data/tests/test_treewalkers.rb +41 -22
  97. data/tests/test_validator.rb +31 -0
  98. metadata +65 -6
@@ -0,0 +1,270 @@
1
+ #data
2
+ <head></head><style></style>
3
+ #errors
4
+ No DOCTYPE
5
+ <style> in after-head mode
6
+ #document
7
+ | <html>
8
+ | <head>
9
+ | <style>
10
+ | <body>
11
+
12
+ #data
13
+ <head></head><script></script>
14
+ #errors
15
+ No DOCTYPE
16
+ <script> in after-head mode
17
+ #document
18
+ | <html>
19
+ | <head>
20
+ | <script>
21
+ | <body>
22
+
23
+ #data
24
+ <head></head><!-- --><style></style><!-- --><script></script>
25
+ #errors
26
+ No DOCTYPE
27
+ <style> in after-head mode
28
+ #document
29
+ | <html>
30
+ | <head>
31
+ | <style>
32
+ | <script>
33
+ | <!-- -->
34
+ | <!-- -->
35
+ | <body>
36
+
37
+ #data
38
+ <head></head><!-- -->x<style></style><!-- --><script></script>
39
+ #errors
40
+ No DOCTYPE
41
+ #document
42
+ | <html>
43
+ | <head>
44
+ | <!-- -->
45
+ | <body>
46
+ | "x"
47
+ | <style>
48
+ | <!-- -->
49
+ | <script>
50
+
51
+ #data
52
+ <!DOCTYPE htML><html><head></head><body><pre>
53
+ </pre></body></html>
54
+ #errors
55
+ #document
56
+ | <!DOCTYPE htML>
57
+ | <html>
58
+ | <head>
59
+ | <body>
60
+ | <pre>
61
+
62
+ #data
63
+ <!DOCTYPE htML><html><head></head><body><pre>
64
+ foo</pre></body></html>
65
+ #errors
66
+ #document
67
+ | <!DOCTYPE htML>
68
+ | <html>
69
+ | <head>
70
+ | <body>
71
+ | <pre>
72
+ | "foo"
73
+
74
+ #data
75
+ <!DOCTYPE htML><html><head></head><body><pre>
76
+
77
+ foo</pre></body></html>
78
+ #errors
79
+ #document
80
+ | <!DOCTYPE htML>
81
+ | <html>
82
+ | <head>
83
+ | <body>
84
+ | <pre>
85
+ | "
86
+ foo"
87
+
88
+ #data
89
+ <!DOCTYPE htML><html><head></head><body><pre>
90
+ foo
91
+ </pre></body></html>
92
+ #errors
93
+ #document
94
+ | <!DOCTYPE htML>
95
+ | <html>
96
+ | <head>
97
+ | <body>
98
+ | <pre>
99
+ | "foo
100
+ "
101
+
102
+ #data
103
+ <!DOCTYPE htML><html><head></head><body><pre>x</pre><span>
104
+ </span></body></html>
105
+ #errors
106
+ #document
107
+ | <!DOCTYPE htML>
108
+ | <html>
109
+ | <head>
110
+ | <body>
111
+ | <pre>
112
+ | "x"
113
+ | <span>
114
+ | "
115
+ "
116
+
117
+ #data
118
+ <!DOCTYPE htML><html><head></head><body><pre>x
119
+ y</pre></body></html>
120
+ #errors
121
+ #document
122
+ | <!DOCTYPE htML>
123
+ | <html>
124
+ | <head>
125
+ | <body>
126
+ | <pre>
127
+ | "x
128
+ y"
129
+
130
+ #data
131
+ <!DOCTYPE htML><html><head></head><body><pre>x<div>
132
+ y</pre></body></html>
133
+ #errors
134
+ End tag <pre> seen too early. Expected other end tag.
135
+ #document
136
+ | <!DOCTYPE htML>
137
+ | <html>
138
+ | <head>
139
+ | <body>
140
+ | <pre>
141
+ | "x"
142
+ | <div>
143
+ | "
144
+ y"
145
+
146
+ #data
147
+ <!DOCTYPE htML><HTML><META><HEAD></HEAD></HTML>
148
+ #errors
149
+ Unexpected start tag HEAD in HEAD. Ignored.
150
+ #document
151
+ | <!DOCTYPE htML>
152
+ | <html>
153
+ | <head>
154
+ | <meta>
155
+ | <body>
156
+
157
+ #data
158
+ <!DOCTYPE htML><HTML><HEAD><head></HEAD></HTML>
159
+ #errors
160
+ Unexpected start tag HEAD in HEAD. Ignored.
161
+ #document
162
+ | <!DOCTYPE htML>
163
+ | <html>
164
+ | <head>
165
+ | <body>
166
+
167
+ #data
168
+ <textarea>foo<span>bar</span><i>baz
169
+ #errors
170
+ Unexpected start tag. Expected DOCTYPE.
171
+ Unexpected end of file.
172
+ #document
173
+ | <html>
174
+ | <head>
175
+ | <body>
176
+ | <textarea>
177
+ | "foo<span>bar</span><i>baz"
178
+
179
+ #data
180
+ <title>foo<span>bar</em><i>baz
181
+ #errors
182
+ Unexpected start tag. Expected DOCTYPE.
183
+ Unexpected end of file.
184
+ #document
185
+ | <html>
186
+ | <head>
187
+ | <title>
188
+ | "foo<span>bar</em><i>baz"
189
+ | <body>
190
+
191
+ #data
192
+ <!DOCTYPE htML><textarea>
193
+ </textarea>
194
+ #errors
195
+ #document
196
+ | <!DOCTYPE htML>
197
+ | <html>
198
+ | <head>
199
+ | <body>
200
+ | <textarea>
201
+
202
+ #data
203
+ <!DOCTYPE htML><textarea>
204
+ foo</textarea>
205
+ #errors
206
+ #document
207
+ | <!DOCTYPE htML>
208
+ | <html>
209
+ | <head>
210
+ | <body>
211
+ | <textarea>
212
+ | "foo"
213
+
214
+ #data
215
+ <!DOCTYPE htML><textarea>
216
+
217
+ foo</textarea>
218
+ #errors
219
+ #document
220
+ | <!DOCTYPE htML>
221
+ | <html>
222
+ | <head>
223
+ | <body>
224
+ | <textarea>
225
+ | "
226
+ foo"
227
+
228
+ #data
229
+ <!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
230
+ #errors
231
+ Missing end tag (div)
232
+ #document
233
+ | <!DOCTYPE htML>
234
+ | <html>
235
+ | <head>
236
+ | <body>
237
+ | <ul>
238
+ | <li>
239
+ | <div>
240
+ | <p>
241
+ | <li>
242
+
243
+ #data
244
+ <!doctype html><nobr><nobr><nobr>
245
+ #errors
246
+ Unexpected <nobr> tag.
247
+ Unexpected <nobr> tag.
248
+ Unexpected end of file.
249
+ #document
250
+ | <!DOCTYPE html>
251
+ | <html>
252
+ | <head>
253
+ | <body>
254
+ | <nobr>
255
+ | <nobr>
256
+ | <nobr>
257
+
258
+ #data
259
+ <!doctype html><nobr><nobr></nobr><nobr>
260
+ #errors
261
+ Unexpected <nobr> tag.
262
+ Unexpected end of file.
263
+ #document
264
+ | <!DOCTYPE html>
265
+ | <html>
266
+ | <head>
267
+ | <body>
268
+ | <nobr>
269
+ | <nobr>
270
+ | <nobr>
@@ -0,0 +1,60 @@
1
+ #data
2
+ direct div content
3
+ #errors
4
+ #document-fragment
5
+ div
6
+ #document
7
+ | "direct div content"
8
+
9
+ #data
10
+ direct textarea content
11
+ #errors
12
+ #document-fragment
13
+ textarea
14
+ #document
15
+ | "direct textarea content"
16
+
17
+ #data
18
+ textarea content with <em>pseudo</em> <foo>markup
19
+ #errors
20
+ #document-fragment
21
+ textarea
22
+ #document
23
+ | "textarea content with <em>pseudo</em> <foo>markup"
24
+
25
+ #data
26
+ this is &#x0043;DATA inside a <style> element
27
+ #errors
28
+ #document-fragment
29
+ style
30
+ #document
31
+ | "this is &#x0043;DATA inside a <style> element"
32
+
33
+ #data
34
+ </plaintext>
35
+ #errors
36
+ #document-fragment
37
+ plaintext
38
+ #document
39
+ | "</plaintext>"
40
+
41
+ #data
42
+ setting html's innerHTML
43
+ #errors
44
+ XXX innerHTML EOF
45
+ #document-fragment
46
+ html
47
+ #document
48
+ | <head>
49
+ | <body>
50
+ | "setting html's innerHTML"
51
+
52
+ #data
53
+ <title>setting head's innerHTML</title>
54
+ #errors
55
+ Unexpected title element that belongs in head.
56
+ #document-fragment
57
+ head
58
+ #document
59
+ | <title>
60
+ | "setting head's innerHTML"
@@ -0,0 +1,175 @@
1
+ #data
2
+ <style> <!-- </style>x
3
+ #errors
4
+ No DOCTYPE
5
+ Unexpected end of file
6
+ #document
7
+ | <html>
8
+ | <head>
9
+ | <style>
10
+ | " <!-- </style>x"
11
+ | <body>
12
+
13
+ #data
14
+ <style> <!-- </style> --> </style>x
15
+ #errors
16
+ No DOCTYPE
17
+ #document
18
+ | <html>
19
+ | <head>
20
+ | <style>
21
+ | " <!-- </style> --> "
22
+ | <body>
23
+ | "x"
24
+
25
+ #data
26
+ <style> <!--> </style>x
27
+ #errors
28
+ No DOCTYPE
29
+ #document
30
+ | <html>
31
+ | <head>
32
+ | <style>
33
+ | " <!--> "
34
+ | <body>
35
+ | "x"
36
+
37
+ #data
38
+ <style> <!---> </style>x
39
+ #errors
40
+ No DOCTYPE
41
+ #document
42
+ | <html>
43
+ | <head>
44
+ | <style>
45
+ | " <!---> "
46
+ | <body>
47
+ | "x"
48
+
49
+ #data
50
+ <iframe> <!---> </iframe>x
51
+ #errors
52
+ No DOCTYPE
53
+ #document
54
+ | <html>
55
+ | <head>
56
+ | <body>
57
+ | <iframe>
58
+ | " <!---> "
59
+ | "x"
60
+
61
+ #data
62
+ <iframe> <!--- </iframe>->x</iframe> --> </iframe>x
63
+ #errors
64
+ No DOCTYPE
65
+ #document
66
+ | <html>
67
+ | <head>
68
+ | <body>
69
+ | <iframe>
70
+ | " <!--- </iframe>->x</iframe> --> "
71
+ | "x"
72
+
73
+ #data
74
+ <script> <!-- </script> --> </script>x
75
+ #errors
76
+ No DOCTYPE
77
+ #document
78
+ | <html>
79
+ | <head>
80
+ | <script>
81
+ | " <!-- </script> --> "
82
+ | <body>
83
+ | "x"
84
+
85
+ #data
86
+ <title> <!-- </title> --> </title>x
87
+ #errors
88
+ No DOCTYPE
89
+ #document
90
+ | <html>
91
+ | <head>
92
+ | <title>
93
+ | " <!-- </title> --> "
94
+ | <body>
95
+ | "x"
96
+
97
+ #data
98
+ <textarea> <!--- </textarea>->x</textarea> --> </textarea>x
99
+ #errors
100
+ No DOCTYPE
101
+ #document
102
+ | <html>
103
+ | <head>
104
+ | <body>
105
+ | <textarea>
106
+ | " <!--- </textarea>->x</textarea> --> "
107
+ | "x"
108
+
109
+ #data
110
+ <style> <!</-- </style>x
111
+ #errors
112
+ No DOCTYPE
113
+ #document
114
+ | <html>
115
+ | <head>
116
+ | <style>
117
+ | " <!</-- "
118
+ | <body>
119
+ | "x"
120
+
121
+ #data
122
+ <xmp> <!-- > --> </xmp>
123
+ #errors
124
+ No DOCTYPE
125
+ #document
126
+ | <html>
127
+ | <head>
128
+ | <body>
129
+ | <xmp>
130
+ | " <!-- > --> "
131
+
132
+ #data
133
+ <title>&amp;</title>
134
+ #errors
135
+ No DOCTYPE
136
+ #document
137
+ | <html>
138
+ | <head>
139
+ | <title>
140
+ | "&"
141
+ | <body>
142
+
143
+ #data
144
+ <title><!--&amp;--></title>
145
+ #errors
146
+ No DOCTYPE
147
+ #document
148
+ | <html>
149
+ | <head>
150
+ | <title>
151
+ | "<!--&amp;-->"
152
+ | <body>
153
+
154
+ #data
155
+ <title><!--</title>
156
+ #errors
157
+ No DOCTYPE
158
+ Unexpected EOF
159
+ #document
160
+ | <html>
161
+ | <head>
162
+ | <title>
163
+ | "<!--</title>"
164
+ | <body>
165
+
166
+ #data
167
+ <noscript><!--</noscript>--></noscript>
168
+ #errors
169
+ No DOCTYPE
170
+ #document
171
+ | <html>
172
+ | <head>
173
+ | <noscript>
174
+ | "<!--</noscript>-->"
175
+ | <body>