html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,10 @@
1
+ require 'delegate'
2
+ require 'enumerator'
3
+
4
module HTML5
  module Filters
    # Common superclass of the token-stream filters.
    #
    # A filter wraps another object (its token source) via SimpleDelegator,
    # so every message the filter does not define itself is forwarded to the
    # wrapped source, which stays reachable through +__getobj__+.
    #
    # Enumerable is mixed in so that a filter offers the usual collection
    # helpers (+map+, +select+, +to_a+, ...). Those helpers are driven by
    # +each+; subclasses define +each+ to transform the stream, while a bare
    # Base has no +each+ of its own and therefore delegates it to the source.
    class Base < SimpleDelegator
      include Enumerable
    end
  end
end
@@ -0,0 +1,82 @@
1
+ require 'html5/filters/base'
2
+
3
module HTML5
  module Filters
    # Token-stream filter that makes the document declare +encoding+.
    #
    # * Every <meta> empty tag carrying a +charset+ attribute has that
    #   attribute rewritten to the given encoding.
    # * Every <meta http-equiv="content-type" content="..."> without a
    #   +charset+ attribute has its +content+ rewritten to
    #   "text/html; charset=<encoding>".
    # * If the head contains no such declaration, a
    #   <meta charset="<encoding>"> empty tag is emitted right after the
    #   head start tag (or inside a synthesized start/end pair when the head
    #   arrives as a single :EmptyTag token).
    #
    # Tokens are hashes with :type, :name and :data keys; :data of a tag is
    # treated as an array of [name, value] pairs.
    class InjectMetaCharset < Base
      # source   - the token stream to wrap (anything responding to +each+).
      # encoding - the encoding name to declare, or nil to inject nothing.
      def initialize(source, encoding)
        super(source)
        @encoding = encoding
      end

      # Yields the tokens of the wrapped stream with the encoding
      # declaration fixed up or inserted as described on the class.
      def each
        state = :pre_head
        # With no encoding there is nothing to inject, so behave as if a
        # declaration had already been seen.
        meta_found = @encoding.nil?
        pending = []

        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            state = :in_head if token[:name].downcase == "head"

          when :EmptyTag
            tag = token[:name].downcase
            if tag == "meta"
              # Decided per token: an earlier declaration must not stop a
              # later pragma from being rewritten, otherwise the output would
              # advertise two different encodings.
              meta_found = true if rewrite_meta(token[:data])

            elsif tag == "head" && !meta_found
              # insert meta into empty head
              yield(:type => :StartTag, :name => "head", :data => token[:data])
              yield(:type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]])
              yield(:type => :EndTag, :name => "head")
              meta_found = true
              next
            end

          when :EndTag
            if token[:name].downcase == "head" && pending.any?
              # insert meta into head (if necessary) and flush pending queue;
              # the first pending token is the head start tag itself.
              yield pending.shift
              yield(:type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]]) if !meta_found
              yield pending.shift while pending.any?
              meta_found = true
              state = :post_head
            end
          end

          # Head content is held back until </head> so that a missing meta
          # can still be placed directly after the head start tag.
          if state == :in_head
            pending << token
          else
            yield token
          end
        end
      end

      private

      # Rewrites the encoding declaration carried by one <meta> tag, mutating
      # +attributes+ (an array of [name, value] pairs) in place.
      #
      # Returns true when the tag now declares @encoding, false when it
      # carries no encoding declaration at all.
      #
      # NOTE(review): as before, a +charset+ attribute is rewritten even when
      # @encoding is nil (its value becomes nil); confirm whether callers
      # rely on that.
      def rewrite_meta(attributes)
        has_http_equiv_content_type = false
        content_index = nil

        attributes.each_with_index do |(name, value), i|
          if name.downcase == 'charset'
            attributes[i] = ['charset', @encoding]
            return true
          elsif name == 'http-equiv' and value.downcase == 'content-type'
            has_http_equiv_content_type = true
          elsif name == 'content'
            content_index = i
          end
        end

        # No charset attribute: fall back to the content-type pragma. Left
        # untouched when there is no encoding to declare.
        return false unless @encoding && has_http_equiv_content_type && content_index

        attributes[content_index][1] = 'text/html; charset=%s' % @encoding
        true
      end
    end
  end
end
@@ -0,0 +1,198 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
module HTML5
  module Filters

    # Token-stream filter that drops start and end tags which HTML allows to
    # be omitted, judging each tag by the token before and after it.
    #
    # Tokens are hashes with at least :type and, for tags, :name and :data.
    class OptionalTagFilter < Base
      # Walks the wrapped stream with one token of look-behind and one of
      # look-ahead, yielding (previous, current, next) triples. +previous+ is
      # nil for the first token and +next+ is nil for the last one.
      #
      # An empty stream yields nothing; previously it yielded a single
      # (nil, nil, nil) triple, which made +each+ raise on nil[:type].
      def slider
        previous1 = previous2 = nil
        __getobj__.each do |token|
          yield previous2, previous1, token unless previous1.nil?
          previous2 = previous1
          previous1 = token
        end
        # Flush the final token, which has no successor.
        yield previous2, previous1, nil unless previous1.nil?
      end

      # Yields every token of the wrapped stream except omissible tags.
      # A start tag is only ever dropped when it has no attributes.
      def each
        slider do |previous, token, nexttok|
          case token[:type]
          when :StartTag
            yield token unless token[:data].empty? && is_optional_start(token[:name], previous, nexttok)
          when :EndTag
            yield token unless is_optional_end(token[:name], nexttok)
          else
            yield token
          end
        end
      end

      # Returns true when the start tag of element +tagname+ may be omitted,
      # given the token before it (+previous+) and after it (+nexttok+);
      # either may be nil at the edges of the stream.
      def is_optional_start(tagname, previous, nexttok)
        type = nexttok ? nexttok[:type] : nil
        case tagname
        when 'html'
          # An html element's start tag may be omitted if the first thing
          # inside the html element is not a space character or a comment.
          !space_or_comment?(type)
        when 'head'
          # A head element's start tag may be omitted if the first thing
          # inside the head element is an element.
          type == :StartTag
        when 'body'
          # A body element's start tag may be omitted if the first thing
          # inside the body element is not a space character or a comment,
          # except if the first thing inside the body element is a script
          # or style element and the node immediately preceding the body
          # element is a head element whose end tag has been omitted.
          if space_or_comment?(type)
            false
          elsif type == :StartTag
            # XXX: we do not look at the preceding event, so we never omit
            # the body element's start tag if it's followed by a script or
            # a style element.
            !%w[script style].include?(nexttok[:name])
          else
            true
          end
        when 'colgroup'
          # A colgroup element's start tag may be omitted if the first thing
          # inside the colgroup element is a col element, and if the element
          # is not immediately preceded by another colgroup element whose
          # end tag has been omitted.
          # XXX: we do not look at the preceding event; instead
          # is_optional_end never omits a colgroup end tag that is
          # immediately followed by another colgroup start tag.
          type == :StartTag && nexttok[:name] == "col"
        when 'tbody'
          # A tbody element's start tag may be omitted if the first thing
          # inside the tbody element is a tr element, and if the element is
          # not immediately preceded by a tbody, thead, or tfoot element
          # whose end tag has been omitted.
          if type != :StartTag
            false
          elsif previous && previous[:type] == :EndTag && %w(tbody thead tfoot).include?(previous[:name])
            # That end tag may itself get omitted (see is_optional_end), so
            # this start tag has to stay.
            false
          else
            nexttok[:name] == 'tr'
          end
        else
          false
        end
      end

      # Returns true when the end tag of element +tagname+ may be omitted,
      # given the token that follows it (+nexttok+, nil at end of stream).
      def is_optional_end(tagname, nexttok)
        type = nexttok ? nexttok[:type] : nil
        case tagname
        when 'html', 'head', 'body'
          # The end tag may be omitted if the element is not immediately
          # followed by a space character or a comment.
          !space_or_comment?(type)
        when 'li', 'optgroup', 'option', 'tr'
          # The end tag may be omitted if the element is immediately followed
          # by another element of the same kind, or if there is no more
          # content in the parent element.
          if type == :StartTag
            nexttok[:name] == tagname
          else
            no_more_content?(type)
          end
        when 'dt', 'dd'
          # A dt element's end tag may be omitted if the dt element is
          # immediately followed by another dt element or a dd element.
          # A dd element's end tag may be omitted if the dd element is
          # immediately followed by another dd element or a dt element,
          # or if there is no more content in the parent element.
          if type == :StartTag
            %w(dt dd).include?(nexttok[:name])
          elsif tagname == 'dd'
            no_more_content?(type)
          else
            false
          end
        when 'p'
          # A p element's end tag may be omitted if the p element is
          # immediately followed by an address, blockquote, dl, fieldset,
          # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
          # or ul element, or if there is no more content in the parent
          # element.
          if type == :StartTag
            %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
               h6 hr menu ol p pre table ul).include?(nexttok[:name])
          else
            no_more_content?(type)
          end
        when 'colgroup'
          # A colgroup element's end tag may be omitted if the colgroup
          # element is not immediately followed by a space character or
          # a comment.
          if space_or_comment?(type)
            false
          elsif type == :StartTag
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            nexttok[:name] != 'colgroup'
          else
            true
          end
        when 'thead', 'tbody'
          # A thead element's end tag may be omitted if the thead element
          # is immediately followed by a tbody or tfoot element.
          # A tbody element's end tag may be omitted if the tbody element
          # is immediately followed by a tbody or tfoot element, or if
          # there is no more content in the parent element.
          # is_optional_start keeps a tbody start tag that directly follows
          # one of these end tags, so omitting the end tag stays unambiguous.
          if type == :StartTag
            %w(tbody tfoot).include?(nexttok[:name])
          elsif tagname == 'tbody'
            no_more_content?(type)
          else
            false
          end
        when 'tfoot'
          # A tfoot element's end tag may be omitted if the tfoot element
          # is immediately followed by a tbody element, or if there is no
          # more content in the parent element.
          if type == :StartTag
            nexttok[:name] == 'tbody'
          else
            no_more_content?(type)
          end
        when 'td', 'th'
          # A td or th element's end tag may be omitted if the element is
          # immediately followed by a td or th element, or if there is
          # no more content in the parent element.
          if type == :StartTag
            %w(td th).include?(nexttok[:name])
          else
            no_more_content?(type)
          end
        else
          false
        end
      end

      private

      # True when the neighbouring token is a comment or whitespace.
      def space_or_comment?(type)
        [:Comment, :SpaceCharacters].include?(type)
      end

      # True when the element is the last thing in its parent: the next
      # token closes an element, or the stream has ended.
      def no_more_content?(type)
        type == :EndTag || type.nil?
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'html5/filters/base'
2
+ require 'html5/sanitizer'
3
+
4
module HTML5
  module Filters
    # Filter that runs every token of the wrapped stream through
    # +sanitize_token+ (mixed in from HTMLSanitizeModule) and yields whatever
    # that method returns.
    class HTMLSanitizeFilter < Base
      include HTMLSanitizeModule

      # Yields the sanitized counterpart of each source token, in order.
      def each
        __getobj__.each { |tok| yield(sanitize_token(tok)) }
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
module HTML5
  module Filters
    # Token-stream filter that collapses runs of whitespace to a single
    # space, except inside elements whose whitespace is significant.
    class WhitespaceFilter < Base

      # Elements inside which whitespace is passed through untouched.
      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
      # Matches one run of consecutive space characters.
      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m

      # Yields every token of the wrapped stream. Outside preserving
      # elements the :data of :SpaceCharacters tokens becomes a single space
      # and every whitespace run in :Characters tokens is collapsed.
      # Tokens are modified in place.
      def each
        # Nesting depth counted from the outermost open preserving element;
        # 0 means whitespace may be collapsed.
        preserve = 0
        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            # Once inside a preserving element every nested start tag is
            # counted too, so its end tag does not end preservation early.
            if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
              preserve += 1
            end

          when :EndTag
            preserve -= 1 if preserve > 0

          when :SpaceCharacters
            token[:data] = " " if preserve == 0 && token[:data]

          when :Characters
            # gsub, not sub: every run in the text must be collapsed, not
            # just the first one.
            token[:data] = token[:data].gsub(SPACES, ' ') if preserve == 0
          end

          yield token
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,46 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Handlers for the parser's "after body" phase. Anything other than a
  # comment or </html> is reported as a parse error and then reprocessed by
  # the :inBody phase.
  class AfterBodyPhase < Phase

    # Presumably routes </html> to endTagHtml -- the dispatch itself lives in
    # Phase, which is defined elsewhere.
    handle_end 'html'

    # Comments seen here are appended to the first open element (the <html>
    # element) and not to whatever is currently open.
    def processComment(data)
      html_element = @tree.open_elements.first
      @tree.insert_comment(data, html_element)
    end

    def processCharacters(data)
      parse_error(_('Unexpected non-space characters in the after body phase.'))
      back_to_in_body.processCharacters(data)
    end

    def processStartTag(name, attributes)
      parse_error(_("Unexpected start tag token (#{name}) in the after body phase."))
      back_to_in_body.processStartTag(name, attributes)
    end

    def endTagHtml(name)
      return parse_error if @parser.inner_html

      # XXX: This may need to be done, not sure
      # Don't set last_phase to the current phase but to the inBody phase
      # instead. No need for extra parse errors if there's something after </html>.
      # Try "<!doctype html>X</html>X" for instance.
      @parser.last_phase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    def endTagOther(name)
      parse_error(_("Unexpected end tag token (#{name}) in the after body phase."))
      back_to_in_body.processEndTag(name)
    end

    private

    # Switches the parser to the :inBody phase and returns the parser's
    # now-current phase so the offending token can be handed to it.
    def back_to_in_body
      @parser.phase = @parser.phases[:inBody]
      @parser.phase
    end

  end
end
@@ -0,0 +1,34 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Handlers for the parser's "after frameset" phase: <noframes> is passed to
  # the :inBody phase, </html> moves on to the :trailingEnd phase, and all
  # other characters and tags are reported as parse errors and ignored.
  class AfterFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    # Tag-name registrations; the dispatch to the matching startTag*/endTag*
    # methods is implemented by Phase (defined elsewhere).
    handle_start 'html', 'noframes'

    handle_end 'html'

    def processCharacters(data)
      parse_error(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
    end

    # Handled by the :inBody phase without switching the current phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    def startTagOther(name, attributes)
      parse_error(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
    end

    # Remembers the phase being left, then enters :trailingEnd.
    def endTagHtml(name)
      leaving = @parser.phase
      @parser.last_phase = leaving
      @parser.phase = @parser.phases[:trailingEnd]
    end

    def endTagOther(name)
      parse_error(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
    end

  end
end
@@ -0,0 +1,50 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Handlers for the parser's "after head" phase. <body> and <frameset> open
  # their element and enter the matching phase; elements that belong in the
  # head are reported and handed to the :inHead phase; everything else
  # implies a <body> and is reprocessed by the :inBody phase.
  class AfterHeadPhase < Phase

    # Tag-name registrations; presumably the hash form maps each listed name
    # to startTagFromHead -- the dispatch is implemented by Phase (defined
    # elsewhere).
    handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'

    def process_eof
      imply_body.process_eof
    end

    def processCharacters(data)
      imply_body.processCharacters(data)
    end

    def startTagBody(name, attributes)
      open_element_and_enter(:inBody, name, attributes)
    end

    def startTagFrameset(name, attributes)
      open_element_and_enter(:inFrameset, name, attributes)
    end

    def startTagFromHead(name, attributes)
      parse_error(_("Unexpected start tag (#{name}) that can be in head. Moved."))
      @parser.phase = @parser.phases[:inHead]
      @parser.phase.processStartTag(name, attributes)
    end

    def startTagOther(name, attributes)
      imply_body.processStartTag(name, attributes)
    end

    def processEndTag(name)
      imply_body.processEndTag(name)
    end

    # Inserts an attribute-less <body> element and enters the :inBody phase.
    def anythingElse
      @tree.insert_element('body', {})
      @parser.phase = @parser.phases[:inBody]
    end

    private

    # Runs anythingElse and returns the parser's now-current phase so the
    # pending token can be handed to it.
    def imply_body
      anythingElse
      @parser.phase
    end

    # Inserts the element for the tag being processed, then makes the phase
    # registered under +phase_key+ the current one.
    def open_element_and_enter(phase_key, name, attributes)
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[phase_key]
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Handlers for the parser's "before head" phase. A <head> start tag opens
  # the head element; end of file, characters, other start tags and a few end
  # tags imply an attribute-less <head> first and are then reprocessed by the
  # phase that follows.
  class BeforeHeadPhase < Phase

    # Tag-name registrations; presumably the hash form maps each listed end
    # tag to endTagImplyHead -- the dispatch is implemented by Phase (defined
    # elsewhere).
    handle_start 'html', 'head'

    handle_end %w( html head body br p ) => 'ImplyHead'

    def process_eof
      imply_head.process_eof
    end

    def processCharacters(data)
      imply_head.processCharacters(data)
    end

    # Inserts the head element, records it (the last open element) as the
    # tree's head pointer and enters the :inHead phase.
    def startTagHead(name, attributes)
      @tree.insert_element(name, attributes)
      @tree.head_pointer = @tree.open_elements[-1]
      @parser.phase = @parser.phases[:inHead]
    end

    def startTagOther(name, attributes)
      imply_head.processStartTag(name, attributes)
    end

    def endTagImplyHead(name)
      imply_head.processEndTag(name)
    end

    def endTagOther(name)
      parse_error(_("Unexpected end tag (#{name}) after the (implied) root element."))
    end

    private

    # Opens an implied, attribute-less <head> and returns the parser's
    # now-current phase so the pending token can be handed to it.
    def imply_head
      startTagHead('head', {})
      @parser.phase
    end

  end
end
+ end