html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,134 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase that is active before a DOCTYPE has been seen.
  #
  # A DOCTYPE is inserted into the tree; comments are attached to the
  # document; whitespace is dropped. Any other token (and EOF) is reported as
  # a parse error and re-dispatched to the :rootElement phase.
  #
  # This phase deals with error handling as well which is currently not
  # covered in the specification. The error handling is typically known as
  # "quirks mode". It is expected that a future version of HTML5 will define
  # this. Quirks mode is detected here but not acted upon yet.
  class InitialPhase < Phase

    # Public identifiers, in lowercase, that select quirks mode regardless
    # of the system identifier.
    QUIRKS_PUBLIC_IDS = [
      "+//silmaril//dtd html pro v0r11 19970101//en",
      "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
      "-//as//dtd html 3.0 aswedit + extensions//en",
      "-//ietf//dtd html 2.0 level 1//en",
      "-//ietf//dtd html 2.0 level 2//en",
      "-//ietf//dtd html 2.0 strict level 1//en",
      "-//ietf//dtd html 2.0 strict level 2//en",
      "-//ietf//dtd html 2.0 strict//en",
      "-//ietf//dtd html 2.0//en",
      "-//ietf//dtd html 2.1e//en",
      "-//ietf//dtd html 3.0//en",
      "-//ietf//dtd html 3.0//en//",
      "-//ietf//dtd html 3.2 final//en",
      "-//ietf//dtd html 3.2//en",
      "-//ietf//dtd html 3//en",
      "-//ietf//dtd html level 0//en",
      "-//ietf//dtd html level 0//en//2.0",
      "-//ietf//dtd html level 1//en",
      "-//ietf//dtd html level 1//en//2.0",
      "-//ietf//dtd html level 2//en",
      "-//ietf//dtd html level 2//en//2.0",
      "-//ietf//dtd html level 3//en",
      "-//ietf//dtd html level 3//en//3.0",
      "-//ietf//dtd html strict level 0//en",
      "-//ietf//dtd html strict level 0//en//2.0",
      "-//ietf//dtd html strict level 1//en",
      "-//ietf//dtd html strict level 1//en//2.0",
      "-//ietf//dtd html strict level 2//en",
      "-//ietf//dtd html strict level 2//en//2.0",
      "-//ietf//dtd html strict level 3//en",
      "-//ietf//dtd html strict level 3//en//3.0",
      "-//ietf//dtd html strict//en",
      "-//ietf//dtd html strict//en//2.0",
      "-//ietf//dtd html strict//en//3.0",
      "-//ietf//dtd html//en",
      "-//ietf//dtd html//en//2.0",
      "-//ietf//dtd html//en//3.0",
      "-//metrius//dtd metrius presentational//en",
      "-//microsoft//dtd internet explorer 2.0 html strict//en",
      "-//microsoft//dtd internet explorer 2.0 html//en",
      "-//microsoft//dtd internet explorer 2.0 tables//en",
      "-//microsoft//dtd internet explorer 3.0 html strict//en",
      "-//microsoft//dtd internet explorer 3.0 html//en",
      "-//microsoft//dtd internet explorer 3.0 tables//en",
      "-//netscape comm. corp.//dtd html//en",
      "-//netscape comm. corp.//dtd strict html//en",
      "-//o'reilly and associates//dtd html 2.0//en",
      "-//o'reilly and associates//dtd html extended 1.0//en",
      "-//spyglass//dtd html 2.0 extended//en",
      "-//sq//dtd html 2.0 hotmetal + extensions//en",
      "-//sun microsystems corp.//dtd hotjava html//en",
      "-//sun microsystems corp.//dtd hotjava strict html//en",
      "-//w3c//dtd html 3 1995-03-24//en",
      "-//w3c//dtd html 3.2 draft//en",
      "-//w3c//dtd html 3.2 final//en",
      "-//w3c//dtd html 3.2//en",
      "-//w3c//dtd html 3.2s draft//en",
      "-//w3c//dtd html 4.0 frameset//en",
      "-//w3c//dtd html 4.0 transitional//en",
      "-//w3c//dtd html experimental 19960712//en",
      "-//w3c//dtd html experimental 970421//en",
      "-//w3c//dtd w3 html//en",
      "-//w3o//dtd w3 html 3.0//en",
      "-//w3o//dtd w3 html 3.0//en//",
      "-//w3o//dtd w3 html strict 3.0//en//",
      "-//webtechs//dtd mozilla html 2.0//en",
      "-//webtechs//dtd mozilla html//en",
      "-/w3c/dtd html 4.0 transitional/en",
      "html"
    ].freeze

    # Public identifiers, in lowercase, that select quirks mode only when
    # the DOCTYPE carries no system identifier.
    QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID = [
      "-//w3c//dtd html 4.01 frameset//en",
      "-//w3c//dtd html 4.01 transitional//en"
    ].freeze

    # System identifier that selects quirks mode on its own.
    QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    # EOF before any DOCTYPE: report it and let the root element phase
    # finish the document.
    def process_eof
      parse_error(_('Unexpected End of file. Expected DOCTYPE.'))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.process_eof
    end

    # Comments seen before the DOCTYPE become children of the document.
    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Inserts the DOCTYPE into the tree and moves on to :rootElement.
    #
    # name     - DOCTYPE name; anything other than "html" (compared
    #            case-insensitively) is a parse error.
    # publicId - public identifier or nil; a non-nil value is a parse error.
    # systemId - system identifier or nil; a non-nil value is a parse error.
    # correct  - accepted for interface compatibility; not used here.
    def processDoctype(name, publicId, systemId, correct)
      # to_s keeps a missing (nil) name from raising; it simply fails the
      # "html" comparison below.
      doctype_name = name.to_s.downcase

      if doctype_name != 'html' || publicId || systemId
        parse_error(_('Erroneous DOCTYPE.'))
      end
      # XXX need to update DOCTYPE tokens
      @tree.insertDoctype(name, publicId, systemId)

      if doctype_name != 'html' || quirks_doctype?(publicId, systemId)
        # XXX quirks mode
      end

      @parser.phase = @parser.phases[:rootElement]
    end

    # Whitespace before the DOCTYPE is ignored.
    def processSpaceCharacters(data)
    end

    # Text before the DOCTYPE: report it and re-dispatch to :rootElement.
    def processCharacters(data)
      parse_error(_('Unexpected non-space characters. Expected DOCTYPE.'))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.processCharacters(data)
    end

    # Start tag before the DOCTYPE: report it and re-dispatch to :rootElement.
    def processStartTag(name, attributes)
      parse_error(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.processStartTag(name, attributes)
    end

    # End tag before the DOCTYPE: report it and re-dispatch to :rootElement.
    def processEndTag(name)
      parse_error(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.processEndTag(name)
    end

    private

    # True when the identifiers of an "html" DOCTYPE select quirks mode.
    # The public identifier is compared in lowercase because the lookup
    # tables above are lowercase; the system identifier is compared as given.
    def quirks_doctype?(publicId, systemId)
      public_id = publicId.to_s.downcase

      QUIRKS_PUBLIC_IDS.include?(public_id) ||
        (systemId.nil? && QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID.include?(public_id)) ||
        systemId == QUIRKS_SYSTEM_ID
    end

  end
end
@@ -0,0 +1,158 @@
1
require 'forwardable'

module HTML5
  # Base class for helper objects that implement each phase of processing.
  #
  # A phase receives tokens through the process* methods. Start and end tags
  # are dispatched by tag name to handler methods registered per class with
  # handle_start / handle_end; unregistered names go to startTagOther /
  # endTagOther, which subclasses are expected to provide.
  #
  # Handler methods should be in the following order (they can be omitted):
  #
  #   * EOF
  #   * Comment
  #   * Doctype
  #   * SpaceCharacters
  #   * Characters
  #   * StartTag
  #     - startTag* methods
  #   * EndTag
  #     - endTag* methods
  #
  class Phase

    extend Forwardable
    def_delegators :@parser, :parse_error

    # Builds a tag-name => handler-method-name hash.
    #
    # prefix - string prepended to every handler name ('startTag', 'endTag').
    # tags   - tag names or arrays of tag names; an optional trailing Hash
    #          maps a name (or array of names) to an explicit handler suffix.
    #
    # The following example call:
    #
    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
    #
    # ...would return a hash equal to this:
    #
    #   { 'html' => 'startTagHtml',
    #     'base' => 'startTagBaseLinkMeta',
    #     'link' => 'startTagBaseLinkMeta',
    #     'meta' => 'startTagBaseLinkMeta',
    #     'li'   => 'startTagListItem',
    #     'dt'   => 'startTagListItem',
    #     'dd'   => 'startTagListItem' }
    #
    def self.tag_handlers(prefix, *tags)
      mapping = {}
      if tags.last.is_a?(Hash)
        tags.pop.each do |names, handler_method_suffix|
          handler_method = prefix + handler_method_suffix
          Array(names).each { |name| mapping[name] = handler_method }
        end
      end
      tags.each do |names|
        names = Array(names)
        # The handler name is derived from the capitalized tag names.
        handler_method = prefix + names.map { |name| name.capitalize }.join
        names.each { |name| mapping[name] = handler_method }
      end
      mapping
    end

    # Start tag dispatch table of this class; unknown tags map to
    # 'startTagOther'. Stored per class, so subclasses do not share entries.
    def self.start_tag_handlers
      @start_tag_handlers ||= Hash.new('startTagOther')
    end

    # Declare what start tags this Phase handles. Can be called more than once.
    #
    # Example usage:
    #
    #   handle_start 'html'
    #   # html start tags will be handled by a method named 'startTagHtml'
    #
    #   handle_start %w( base link meta )
    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
    #
    #   handle_start %w( li dt dd ) => 'ListItem'
    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
    #
    def self.handle_start(*tags)
      start_tag_handlers.update tag_handlers('startTag', *tags)
    end

    # End tag dispatch table of this class; unknown tags map to 'endTagOther'.
    def self.end_tag_handlers
      @end_tag_handlers ||= Hash.new('endTagOther')
    end

    # Declare what end tags this Phase handles. Behaves like handle_start.
    #
    def self.handle_end(*tags)
      end_tag_handlers.update tag_handlers('endTag', *tags)
    end

    # parser - object that receives parse_error and holds the parser state.
    # tree   - tree builder this phase operates on.
    def initialize(parser, tree)
      @parser, @tree = parser, tree
    end

    # Default EOF handling: generate implied end tags, then report a parse
    # error if elements other than the expected ones are still open.
    def process_eof
      @tree.generateImpliedEndTags

      if @tree.open_elements.length > 2
        parse_error(_('Unexpected end of file. Missing closing tags.'))
      elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
        # This happens for framesets or something?
        parse_error(_("Unexpected end of file. Expected end tag (#{@tree.open_elements[1].name}) first."))
      elsif @parser.inner_html and @tree.open_elements.length > 1
        # XXX This is not what the specification says. Not sure what to do here.
        parse_error(_('XXX inner_html EOF'))
      end
      # Betting ends.
    end

    def processComment(data)
      # For most phases the following is correct. Where it's not it will be
      # overridden.
      @tree.insert_comment(data, @tree.open_elements.last)
    end

    # A DOCTYPE is only expected in the initial phase; elsewhere it is
    # reported and ignored.
    def processDoctype(name, publicId, systemId, correct)
      parse_error(_('Unexpected DOCTYPE. Ignored.'))
    end

    def processSpaceCharacters(data)
      @tree.insertText(data)
    end

    # Dispatches to the handler registered for +name+ on this class.
    def processStartTag(name, attributes)
      send self.class.start_tag_handlers[name], name, attributes
    end

    # Handles an <html> start tag by copying attributes that the root element
    # (first open element) does not have yet; existing attributes win.
    def startTagHtml(name, attributes)
      if @parser.first_start_tag == false and name == 'html'
        parse_error(_('html needs to be the first start tag.'))
      end
      # XXX Need a check here to see if the first start tag token emitted is
      # this token... If it's not, invoke parse_error.
      attributes.each do |attr, value|
        unless @tree.open_elements.first.attributes.has_key?(attr)
          @tree.open_elements.first.attributes[attr] = value
        end
      end
      @parser.first_start_tag = false
    end

    # Dispatches to the handler registered for +name+ on this class.
    def processEndTag(name)
      send self.class.end_tag_handlers[name], name
    end

    # Message translation hook; currently returns the string unchanged.
    def _(string)
      string
    end

    # Raises AssertionError unless +value+ is truthy.
    def assert(value)
      # raise, not throw: throw is for catch/throw control flow and would not
      # surface as an AssertionError.
      raise AssertionError unless value
    end

    def in_scope?(*args)
      @tree.elementInScope(*args)
    end

    # Pops open elements until one matches, and returns that element.
    #
    # name - when given, stop at the first popped element with this name;
    #        when nil, stop when the block returns true for the popped element.
    #
    # Returns nil if the stack runs empty before a match is found.
    def remove_open_elements_until(name=nil)
      element = nil
      loop do
        element = @tree.open_elements.pop
        # An exhausted stack ends the search instead of calling methods on nil.
        break if element.nil?
        break if name.nil? ? yield(element) : element.name == name
      end
      element
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase that is active until the root <html> element exists.
  #
  # Comments are attached to the document and whitespace is dropped. Every
  # other token (and EOF) first creates the html element, which also switches
  # the parser to :beforeHead, and is then handed to that new phase.
  class RootElementPhase < Phase

    def process_eof
      hand_over(:process_eof)
    end

    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Whitespace before the root element is ignored.
    def processSpaceCharacters(data)
    end

    def processCharacters(data)
      hand_over(:processCharacters, data)
    end

    def processStartTag(name, attributes)
      # Record that an explicit <html> tag was the first start tag before the
      # token is re-dispatched.
      if name == 'html'
        @parser.first_start_tag = true
      end
      hand_over(:processStartTag, name, attributes)
    end

    def processEndTag(name)
      hand_over(:processEndTag, name)
    end

    # Creates the html element, pushes it on the stack of open elements,
    # appends it to the document and switches the parser to :beforeHead.
    def insert_html_element
      root = @tree.createElement('html', {})
      @tree.open_elements.push(root)
      @tree.document.appendChild(root)
      @parser.phase = @parser.phases[:beforeHead]
    end

    private

    # Creates the root element, then passes the token to whichever phase the
    # parser is in afterwards.
    def hand_over(handler, *arguments)
      insert_html_element
      @parser.phase.send(handler, *arguments)
    end

  end
end
@@ -0,0 +1,35 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase in which only EOF, comments and whitespace are expected.
  #
  # Comments are attached to the document. Whitespace is forwarded to
  # @parser.last_phase without leaving this phase. Any other token is a parse
  # error: the parser switches back to @parser.last_phase and the token is
  # re-dispatched there.
  class TrailingEndPhase < Phase

    # Nothing left to do at EOF.
    def process_eof
    end

    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    def processSpaceCharacters(data)
      @parser.last_phase.processSpaceCharacters(data)
    end

    def processCharacters(data)
      parse_error(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.last_phase
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      # Double quotes are required for #{name} to be interpolated.
      parse_error(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.last_phase
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      # Double quotes are required for #{name} to be interpolated.
      parse_error(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.last_phase
      @parser.phase.processEndTag(name)
    end
  end
end
@@ -0,0 +1,248 @@
1
+ require 'html5/constants'
2
+ require 'html5/tokenizer'
3
+ require 'html5/treebuilders/rexml'
4
+
5
+ Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
6
+ require 'html5/html5parser/' + File.basename(path)
7
+ end
8
+
9
module HTML5

  # Error in parsed document. Raised by HTMLParser#parse_error in strict mode.
  # NOTE(review): inherits from Exception rather than StandardError, so a bare
  # `rescue` does not catch it; left unchanged to keep rescue behaviour stable.
  class ParseError < Exception; end

  # Raised when an internal assertion of the parser fails.
  class AssertionError < Exception; end

  # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
  #
  class HTMLParser

    attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table

    attr_reader :phases, :tokenizer, :tree, :errors

    # Convenience wrapper: builds a parser from +options+ and parses +stream+
    # as a document. The :encoding option is passed to #parse; all other
    # options go to #initialize. The caller's hash is not modified.
    def self.parse(stream, options = {})
      options = options.dup
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Convenience wrapper: builds a parser from +options+ and parses +stream+
    # as a fragment. The :container (default 'div') and :encoding options are
    # passed to #parse_fragment; all other options go to #initialize. The
    # caller's hash is not modified.
    def self.parse_fragment(stream, options = {})
      options = options.dup
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parse_fragment(stream, container, encoding)
    end

    # Phase names; each maps to the class HTML5::<Name>Phase.
    @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    #         returned. Built in treebuilders can be accessed through
    #         HTML5::TreeBuilders[treeType]
    #
    # Every option is stored in the instance variable of the same name.
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each { |name, value| instance_variable_set("@#{name}", value) }
      @lowercase_attr_name = nil unless instance_variable_defined?(:@lowercase_attr_name)
      @lowercase_element_name = nil unless instance_variable_defined?(:@lowercase_element_name)

      @tree = @tree.new

      @phases = @@phases.inject({}) do |phases, phase_name|
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Runs the tokenizer over +stream+ and feeds every token to the current
    # phase, then signals EOF.
    #
    # inner_html - true when parsing a fragment.
    # encoding   - passed to the tokenizer.
    # container  - name of the fragment's context element; nil means 'div'.
    #              Only used when inner_html is true.
    def _parse(stream, inner_html, encoding, container = 'div')
      @tree.reset
      @first_start_tag = false
      @errors = []

      # On a second run @tokenizer holds the previous instance; go back to
      # its class so a fresh tokenizer can be created.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream, :encoding => encoding,
        :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)

      if inner_html
        @inner_html = (container || 'div').downcase

        case @inner_html
        when 'title', 'textarea'
          @tokenizer.content_model_flag = :RCDATA
        when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
          @tokenizer.content_model_flag = :CDATA
        when 'plaintext'
          @tokenizer.content_model_flag = :PLAINTEXT
        else
          # content_model_flag already is PCDATA
          #@tokenizer.content_model_flag = :PCDATA
        end

        @phase = @phases[:rootElement]
        @phase.insert_html_element
        reset_insertion_mode
      else
        @inner_html = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @last_phase = nil

      # XXX This is temporary for the moment so there isn't any other
      # changes needed for the parser to work with the iterable tokenizer
      @tokenizer.each do |token|
        token = normalize_token(token)

        method = 'process%s' % token[:type]

        case token[:type]
        when :Characters, :SpaceCharacters, :Comment
          @phase.send method, token[:data]
        when :StartTag
          @phase.send method, token[:name], token[:data]
        when :EndTag
          @phase.send method, token[:name]
        when :Doctype
          @phase.send method, token[:name], token[:publicId],
            token[:systemId], token[:correct]
        else
          # Any other token type is recorded as a parse error.
          parse_error(token[:data])
        end
      end

      # When the loop finishes it's EOF
      @phase.process_eof
    end

    # Parse a HTML document into a well-formed tree
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse(stream, encoding=nil)
      _parse(stream, false, encoding)
      @tree.get_document
    end

    # Parse a HTML fragment into a well-formed tree fragment
    #
    # container - name of the element we're setting the inner_html property
    #             if set to nil, default to 'div'
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse_fragment(stream, container='div', encoding=nil)
      _parse(stream, true, encoding, container)
      @tree.get_fragment
    end

    # Records a parse error as [stream position, data] in #errors. In strict
    # mode it then raises ParseError carrying the same message.
    def parse_error(data = 'XXX ERROR MESSAGE NEEDED')
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, data])
      raise ParseError, data.to_s if @strict
    end

    # HTML5 specific normalizations to the token stream
    #
    # Modifies and returns +token+: :EmptyTag becomes :StartTag, tag names are
    # lowercased, and a non-empty start tag attribute list becomes a Hash.
    def normalize_token(token)

      if token[:type] == :EmptyTag
        # A solidus (/) inside a tag is only tolerated on void elements;
        # on any other element it is reported as a parse error. Either way
        # the token is then treated as an ordinary start tag.

        unless VOID_ELEMENTS.include?(token[:name])
          parse_error(_('Solidus (/) incorrectly placed in tag.'))
        end

        token[:type] = :StartTag
      end

      if token[:type] == :StartTag
        token[:name] = token[:name].downcase

        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
        # (pairs are reversed first so that the earliest occurrence wins).

        unless token[:data].empty?
          data = token[:data].reverse.map { |attr, value| [attr.downcase, value] }
          token[:data] = Hash[*data.flatten]
        end

      elsif token[:type] == :EndTag
        parse_error(_('End tag contains unexpected attributes.')) unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      token
    end

    # Element name => phase selected by reset_insertion_mode.
    @@new_modes = {
      'select' => :inSelect,
      'td' => :inCell,
      'th' => :inCell,
      'tr' => :inRow,
      'tbody' => :inTableBody,
      'thead' => :inTableBody,
      'tfoot' => :inTableBody,
      'caption' => :inCaption,
      'colgroup' => :inColumnGroup,
      'table' => :inTable,
      'head' => :inBody,
      'body' => :inBody,
      'frameset' => :inFrameset
    }

    # Chooses @phase from the stack of open elements, looking from the most
    # recently opened element towards the root.
    def reset_insertion_mode
      # The name of this method is mostly historical. (It's also used in the
      # specification.)
      last = false

      @tree.open_elements.reverse.each do |node|
        node_name = node.name

        if node == @tree.open_elements.first
          last = true
          unless ['td', 'th'].include?(node_name)
            # XXX
            # assert @inner_html
            # For the bottom-most element the fragment's container name is
            # used in place of the element's own name.
            node_name = @inner_html
          end
        end

        # XXX 'select', 'colgroup', 'head' and 'frameset' should only be
        # reachable here in the inner_html case:
        # assert @inner_html

        if @@new_modes.has_key?(node_name)
          @phase = @phases[@@new_modes[node_name]]
        elsif node_name == 'html'
          @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
        elsif last
          @phase = @phases[:inBody]
        else
          next
        end

        break
      end
    end

    # Message translation hook; currently returns the string unchanged.
    def _(string); string; end
  end

end