html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,134 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class InitialPhase < Phase

  # This phase deals with error handling as well which is currently not
  # covered in the specification. The error handling is typically known as
  # "quirks mode". It is expected that a future version of HTML5 will define this.

  # Public identifiers (all lowercase) that select quirks mode regardless of
  # the system identifier. Incoming identifiers are lowercased before lookup.
  QUIRKS_PUBLIC_IDS = [
    "+//silmaril//dtd html pro v0r11 19970101//en",
    "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
    "-//as//dtd html 3.0 aswedit + extensions//en",
    "-//ietf//dtd html 2.0 level 1//en",
    "-//ietf//dtd html 2.0 level 2//en",
    "-//ietf//dtd html 2.0 strict level 1//en",
    "-//ietf//dtd html 2.0 strict level 2//en",
    "-//ietf//dtd html 2.0 strict//en",
    "-//ietf//dtd html 2.0//en",
    "-//ietf//dtd html 2.1e//en",
    "-//ietf//dtd html 3.0//en",
    "-//ietf//dtd html 3.0//en//",
    "-//ietf//dtd html 3.2 final//en",
    "-//ietf//dtd html 3.2//en",
    "-//ietf//dtd html 3//en",
    "-//ietf//dtd html level 0//en",
    "-//ietf//dtd html level 0//en//2.0",
    "-//ietf//dtd html level 1//en",
    "-//ietf//dtd html level 1//en//2.0",
    "-//ietf//dtd html level 2//en",
    "-//ietf//dtd html level 2//en//2.0",
    "-//ietf//dtd html level 3//en",
    "-//ietf//dtd html level 3//en//3.0",
    "-//ietf//dtd html strict level 0//en",
    "-//ietf//dtd html strict level 0//en//2.0",
    "-//ietf//dtd html strict level 1//en",
    "-//ietf//dtd html strict level 1//en//2.0",
    "-//ietf//dtd html strict level 2//en",
    "-//ietf//dtd html strict level 2//en//2.0",
    "-//ietf//dtd html strict level 3//en",
    "-//ietf//dtd html strict level 3//en//3.0",
    "-//ietf//dtd html strict//en",
    "-//ietf//dtd html strict//en//2.0",
    "-//ietf//dtd html strict//en//3.0",
    "-//ietf//dtd html//en",
    "-//ietf//dtd html//en//2.0",
    "-//ietf//dtd html//en//3.0",
    "-//metrius//dtd metrius presentational//en",
    "-//microsoft//dtd internet explorer 2.0 html strict//en",
    "-//microsoft//dtd internet explorer 2.0 html//en",
    "-//microsoft//dtd internet explorer 2.0 tables//en",
    "-//microsoft//dtd internet explorer 3.0 html strict//en",
    "-//microsoft//dtd internet explorer 3.0 html//en",
    "-//microsoft//dtd internet explorer 3.0 tables//en",
    "-//netscape comm. corp.//dtd html//en",
    "-//netscape comm. corp.//dtd strict html//en",
    "-//o'reilly and associates//dtd html 2.0//en",
    "-//o'reilly and associates//dtd html extended 1.0//en",
    "-//spyglass//dtd html 2.0 extended//en",
    "-//sq//dtd html 2.0 hotmetal + extensions//en",
    "-//sun microsystems corp.//dtd hotjava html//en",
    "-//sun microsystems corp.//dtd hotjava strict html//en",
    "-//w3c//dtd html 3 1995-03-24//en",
    "-//w3c//dtd html 3.2 draft//en",
    "-//w3c//dtd html 3.2 final//en",
    "-//w3c//dtd html 3.2//en",
    "-//w3c//dtd html 3.2s draft//en",
    "-//w3c//dtd html 4.0 frameset//en",
    "-//w3c//dtd html 4.0 transitional//en",
    "-//w3c//dtd html experimental 19960712//en",
    "-//w3c//dtd html experimental 970421//en",
    "-//w3c//dtd w3 html//en",
    "-//w3o//dtd w3 html 3.0//en",
    "-//w3o//dtd w3 html 3.0//en//",
    "-//w3o//dtd w3 html strict 3.0//en//",
    "-//webtechs//dtd mozilla html 2.0//en",
    "-//webtechs//dtd mozilla html//en",
    "-/w3c/dtd html 4.0 transitional/en",
    "html"
  ].freeze

  # Public identifiers (all lowercase) that select quirks mode only when the
  # DOCTYPE carries no system identifier.
  QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID = [
    "-//w3c//dtd html 4.01 frameset//en",
    "-//w3c//dtd html 4.01 transitional//en"
  ].freeze

  # System identifier (lowercase) that selects quirks mode on its own.
  QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

  # End of input before any DOCTYPE: report it, then let the root element
  # phase deal with the end of file.
  def process_eof
    parse_error(_('Unexpected End of file. Expected DOCTYPE.'))
    @parser.phase = @parser.phases[:rootElement]
    @parser.phase.process_eof
  end

  # Comments seen in this phase are attached directly to the document.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Records the DOCTYPE in the tree and moves on to the root element phase.
  #
  # name     - doctype name; anything other than "html" (case-insensitive) is
  #            reported as a parse error
  # publicId - public identifier, or nil
  # systemId - system identifier, or nil
  # correct  - accepted for the token interface; not used by this method
  def processDoctype(name, publicId, systemId, correct)
    if name.to_s.downcase != 'html' || publicId || systemId
      parse_error(_('Erroneous DOCTYPE.'))
    end
    # XXX need to update DOCTYPE tokens
    @tree.insertDoctype(name, publicId, systemId)

    if quirks_doctype?(name, publicId, systemId)
      # XXX quirks mode
    end

    @parser.phase = @parser.phases[:rootElement]
  end

  # Whitespace before the DOCTYPE is dropped.
  def processSpaceCharacters(data)
  end

  # Text before the DOCTYPE: report it and reprocess in the root element phase.
  def processCharacters(data)
    parse_error(_('Unexpected non-space characters. Expected DOCTYPE.'))
    @parser.phase = @parser.phases[:rootElement]
    @parser.phase.processCharacters(data)
  end

  # Start tag before the DOCTYPE: report it and reprocess in the root element phase.
  def processStartTag(name, attributes)
    parse_error(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
    @parser.phase = @parser.phases[:rootElement]
    @parser.phase.processStartTag(name, attributes)
  end

  # End tag before the DOCTYPE: report it and reprocess in the root element phase.
  def processEndTag(name)
    parse_error(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
    @parser.phase = @parser.phases[:rootElement]
    @parser.phase.processEndTag(name)
  end

  private

  # Returns true when the DOCTYPE described by the arguments matches one of
  # the quirks-mode conditions. Identifiers are compared case-insensitively
  # by lowercasing them, which is the casing the lookup tables use.
  def quirks_doctype?(name, publicId, systemId)
    return true if name.to_s.downcase != 'html'

    public_id = publicId.to_s.downcase
    return true if QUIRKS_PUBLIC_IDS.include?(public_id)
    return true if systemId.nil? && QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID.include?(public_id)

    systemId.to_s.downcase == QUIRKS_SYSTEM_ID
  end

end
134
+ end
@@ -0,0 +1,158 @@
1
+ module HTML5
2
+ # Base class for helper objects that implement each phase of processing.
3
+ #
4
+ # Handler methods should be in the following order (they can be omitted):
5
+ #
6
+ # * EOF
7
+ # * Comment
8
+ # * Doctype
9
+ # * SpaceCharacters
10
+ # * Characters
11
+ # * StartTag
12
+ # - startTag* methods
13
+ # * EndTag
14
+ # - endTag* methods
15
+ #
16
+ class Phase
17
+
18
+ extend Forwardable
19
+ def_delegators :@parser, :parse_error
20
+
21
+ # The following example call:
22
+ #
23
+ # tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
24
+ #
25
+ # ...would return a hash equal to this:
26
+ #
27
+ # { 'html' => 'startTagHtml',
28
+ # 'base' => 'startTagBaseLinkMeta',
29
+ # 'link' => 'startTagBaseLinkMeta',
30
+ # 'meta' => 'startTagBaseLinkMeta',
31
+ # 'li' => 'startTagListItem',
32
+ # 'dt' => 'startTagListItem',
33
+ # 'dd' => 'startTagListItem' }
34
+ #
35
+ def self.tag_handlers(prefix, *tags)
36
+ mapping = {}
37
+ if tags.last.is_a?(Hash)
38
+ tags.pop.each do |names, handler_method_suffix|
39
+ handler_method = prefix + handler_method_suffix
40
+ Array(names).each {|name| mapping[name] = handler_method }
41
+ end
42
+ end
43
+ tags.each do |names|
44
+ names = Array(names)
45
+ handler_method = prefix + names.map {|name| name.capitalize }.join
46
+ names.each {|name| mapping[name] = handler_method }
47
+ end
48
+ mapping
49
+ end
50
+
51
+ def self.start_tag_handlers
52
+ @start_tag_handlers ||= Hash.new('startTagOther')
53
+ end
54
+
55
+ # Declare what start tags this Phase handles. Can be called more than once.
56
+ #
57
+ # Example usage:
58
+ #
59
+ # handle_start 'html'
60
+ # # html start tags will be handled by a method named 'startTagHtml'
61
+ #
62
+ # handle_start %( base link meta )
63
+ # # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
64
+ #
65
+ # handle_start %( li dt dd ) => 'ListItem'
66
+ # # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
67
+ #
68
+ def self.handle_start(*tags)
69
+ start_tag_handlers.update tag_handlers('startTag', *tags)
70
+ end
71
+
72
+ def self.end_tag_handlers
73
+ @end_tag_handlers ||= Hash.new('endTagOther')
74
+ end
75
+
76
+ # Declare what end tags this Phase handles. Behaves like handle_start.
77
+ #
78
+ def self.handle_end(*tags)
79
+ end_tag_handlers.update tag_handlers('endTag', *tags)
80
+ end
81
+
82
+ def initialize(parser, tree)
83
+ @parser, @tree = parser, tree
84
+ end
85
+
86
+ def process_eof
87
+ @tree.generateImpliedEndTags
88
+
89
+ if @tree.open_elements.length > 2
90
+ parse_error(_('Unexpected end of file. Missing closing tags.'))
91
+ elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
92
+ # This happens for framesets or something?
93
+ parse_error(_("Unexpected end of file. Expected end tag (#{@tree.open_elements[1].name}) first."))
94
+ elsif @parser.inner_html and @tree.open_elements.length > 1
95
+ # XXX This is not what the specification says. Not sure what to do here.
96
+ parse_error(_('XXX inner_html EOF'))
97
+ end
98
+ # Betting ends.
99
+ end
100
+
101
+ def processComment(data)
102
+ # For most phases the following is correct. Where it's not it will be
103
+ # overridden.
104
+ @tree.insert_comment(data, @tree.open_elements.last)
105
+ end
106
+
107
+ def processDoctype(name, publicId, systemId, correct)
108
+ parse_error(_('Unexpected DOCTYPE. Ignored.'))
109
+ end
110
+
111
+ def processSpaceCharacters(data)
112
+ @tree.insertText(data)
113
+ end
114
+
115
+ def processStartTag(name, attributes)
116
+ send self.class.start_tag_handlers[name], name, attributes
117
+ end
118
+
119
+ def startTagHtml(name, attributes)
120
+ if @parser.first_start_tag == false and name == 'html'
121
+ parse_error(_('html needs to be the first start tag.'))
122
+ end
123
+ # XXX Need a check here to see if the first start tag token emitted is
124
+ # this token... If it's not, invoke parse_error.
125
+ attributes.each do |attr, value|
126
+ unless @tree.open_elements.first.attributes.has_key?(attr)
127
+ @tree.open_elements.first.attributes[attr] = value
128
+ end
129
+ end
130
+ @parser.first_start_tag = false
131
+ end
132
+
133
+ def processEndTag(name)
134
+ send self.class.end_tag_handlers[name], name
135
+ end
136
+
137
+ def _(string)
138
+ string
139
+ end
140
+
141
+ def assert(value)
142
+ throw AssertionError.new unless value
143
+ end
144
+
145
+ def in_scope?(*args)
146
+ @tree.elementInScope(*args)
147
+ end
148
+
149
+ def remove_open_elements_until(name=nil)
150
+ finished = false
151
+ until finished
152
+ element = @tree.open_elements.pop
153
+ finished = name.nil? ? yield(element) : element.name == name
154
+ end
155
+ return element
156
+ end
157
+ end
158
+ end
@@ -0,0 +1,42 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class RootElementPhase < Phase

  # Every token except comments and whitespace makes this phase create the
  # root <html> element and then reprocess the token in the phase that
  # insert_html_element switches the parser to.

  def process_eof
    continue_in_next_phase(:process_eof)
  end

  # Comments seen before the root element belong to the document itself.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace before the root element is dropped.
  def processSpaceCharacters(data)
  end

  def processCharacters(data)
    continue_in_next_phase(:processCharacters, data)
  end

  def processStartTag(name, attributes)
    # Remember that an explicit <html> tag opened the document.
    @parser.first_start_tag = true if name == 'html'
    continue_in_next_phase(:processStartTag, name, attributes)
  end

  def processEndTag(name)
    continue_in_next_phase(:processEndTag, name)
  end

  # Creates the <html> element, pushes it on the stack of open elements,
  # appends it to the document and switches the parser to the beforeHead phase.
  def insert_html_element
    html = @tree.createElement('html', {})
    @tree.open_elements.push(html)
    @tree.document.appendChild(html)
    @parser.phase = @parser.phases[:beforeHead]
  end

  private

  # Inserts the root element, then hands the token to the parser's new phase.
  def continue_in_next_phase(handler, *token_args)
    insert_html_element
    @parser.phase.send(handler, *token_args)
  end

end
42
+ end
@@ -0,0 +1,35 @@
1
+ require 'html5/html5parser/phase'
2
+
3
+ module HTML5
4
class TrailingEndPhase < Phase

  # Nothing is left to do at end of file in this phase.
  def process_eof
  end

  # Comments seen in this phase are attached directly to the document.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace is handled by the phase that was active before this one.
  def processSpaceCharacters(data)
    @parser.last_phase.processSpaceCharacters(data)
  end

  # Text after the end of the document: report it, switch back to the
  # previous phase and reprocess the characters there.
  def processCharacters(data)
    parse_error(_('Unexpected non-space characters. Expected end of file.'))
    @parser.phase = @parser.last_phase
    @parser.phase.processCharacters(data)
  end

  # Start tag after the end of the document: report it, switch back to the
  # previous phase and reprocess the tag there.
  def processStartTag(name, attributes)
    # Double quotes are required for #{name} to be interpolated.
    parse_error(_("Unexpected start tag (#{name}). Expected end of file."))
    @parser.phase = @parser.last_phase
    @parser.phase.processStartTag(name, attributes)
  end

  # End tag after the end of the document: report it, switch back to the
  # previous phase and reprocess the tag there.
  def processEndTag(name)
    # Double quotes are required for #{name} to be interpolated.
    parse_error(_("Unexpected end tag (#{name}). Expected end of file."))
    @parser.phase = @parser.last_phase
    @parser.phase.processEndTag(name)
  end
end
35
+ end
@@ -0,0 +1,248 @@
1
+ require 'html5/constants'
2
+ require 'html5/tokenizer'
3
+ require 'html5/treebuilders/rexml'
4
+
5
+ Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
6
+ require 'html5/html5parser/' + File.basename(path)
7
+ end
8
+
9
+ module HTML5
10
+
11
+ # Error in parsed document
12
# Raised for a parse error when the parser runs in strict mode. Inherits from
# StandardError so that a plain `rescue` can handle it.
class ParseError < StandardError; end

# Signals a failed internal assertion. Inherits from StandardError so that a
# plain `rescue` can handle it.
class AssertionError < StandardError; end
14
+
15
+ # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
16
+ #
17
class HTMLParser

  attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table

  attr_reader :phases, :tokenizer, :tree, :errors

  # Convenience wrapper: builds a parser from +options+ and parses +stream+
  # as a whole document. The :encoding option is passed on to #parse; all
  # other options go to #initialize. The caller's hash is not modified.
  def self.parse(stream, options = {})
    options = options.dup
    encoding = options.delete(:encoding)
    new(options).parse(stream, encoding)
  end

  # Convenience wrapper: builds a parser from +options+ and parses +stream+
  # as a fragment. The :container (default 'div') and :encoding options are
  # passed on to #parse_fragment; all other options go to #initialize. The
  # caller's hash is not modified.
  def self.parse_fragment(stream, options = {})
    options = options.dup
    container = options.delete(:container) || 'div'
    encoding = options.delete(:encoding)
    new(options).parse_fragment(stream, container, encoding)
  end

  @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
    inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

  # :strict - raise an exception when a parse error is encountered
  # :tree - a treebuilder class controlling the type of tree that will be
  #         returned. Built in treebuilders can be accessed through
  #         HTML5::TreeBuilders[treeType]
  #
  # Every option is stored in the instance variable of the same name.
  def initialize(options = {})
    @strict = false
    @errors = []

    @tokenizer = HTMLTokenizer
    @tree = TreeBuilders::REXML::TreeBuilder

    options.each { |name, value| instance_variable_set("@#{name}", value) }
    @lowercase_attr_name = nil unless instance_variable_defined?(:@lowercase_attr_name)
    @lowercase_element_name = nil unless instance_variable_defined?(:@lowercase_element_name)

    @tree = @tree.new

    # One phase object per name: 'inBody' => HTML5::InBodyPhase, and so on.
    @phases = @@phases.inject({}) do |phases, phase_name|
      phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
      phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
      phases
    end
  end

  # Shared driver for #parse and #parse_fragment: resets the state, builds a
  # tokenizer for +stream+ and feeds every token to the current phase.
  #
  # inner_html - true when parsing a fragment
  # container  - name of the fragment's context element (fragments only)
  def _parse(stream, inner_html, encoding, container = 'div')
    @tree.reset
    @first_start_tag = false
    @errors = []

    # After a previous run @tokenizer holds an instance; recover its class.
    @tokenizer = @tokenizer.class unless Class === @tokenizer
    @tokenizer = @tokenizer.new(stream, :encoding => encoding,
      :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)

    if inner_html
      case @inner_html = container.downcase
      when 'title', 'textarea'
        @tokenizer.content_model_flag = :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
        @tokenizer.content_model_flag = :CDATA
      when 'plaintext'
        @tokenizer.content_model_flag = :PLAINTEXT
      else
        # content_model_flag already is PCDATA
        #@tokenizer.content_model_flag = :PCDATA
      end

      @phase = @phases[:rootElement]
      @phase.insert_html_element
      reset_insertion_mode
    else
      @inner_html = false
      @phase = @phases[:initial]
    end

    # We only seem to have InBodyPhase testcases where the following is
    # relevant ... need others too
    @last_phase = nil

    # XXX This is temporary for the moment so there isn't any other
    # changes needed for the parser to work with the iterable tokenizer
    @tokenizer.each do |token|
      token = normalize_token(token)

      method = 'process%s' % token[:type]

      case token[:type]
      when :Characters, :SpaceCharacters, :Comment
        @phase.send method, token[:data]
      when :StartTag
        @phase.send method, token[:name], token[:data]
      when :EndTag
        @phase.send method, token[:name]
      when :Doctype
        @phase.send method, token[:name], token[:publicId],
          token[:systemId], token[:correct]
      else
        # Any other token type is reported as a parse error.
        parse_error(token[:data])
      end
    end

    # When the loop finishes it's EOF
    @phase.process_eof
  end

  # Parse a HTML document into a well-formed tree
  #
  # stream - a filelike object or string containing the HTML to be parsed
  #
  # The optional encoding parameter must be a string that indicates
  # the encoding. If specified, that encoding will be used,
  # regardless of any BOM or later declaration (such as in a meta
  # element)
  def parse(stream, encoding = nil)
    _parse(stream, false, encoding)
    @tree.get_document
  end

  # Parse a HTML fragment into a well-formed tree fragment
  #
  # container - name of the element we're setting the inner_html property
  #             if set to nil, default to 'div'
  #
  # stream - a filelike object or string containing the HTML to be parsed
  #
  # The optional encoding parameter must be a string that indicates
  # the encoding. If specified, that encoding will be used,
  # regardless of any BOM or later declaration (such as in a meta
  # element)
  def parse_fragment(stream, container = 'div', encoding = nil)
    # Honour the documented nil => 'div' default instead of failing later.
    container ||= 'div'
    _parse(stream, true, encoding, container)
    @tree.get_fragment
  end

  # Records a parse error together with the current stream position and, in
  # strict mode, raises ParseError carrying the same message.
  def parse_error(data = 'XXX ERROR MESSAGE NEEDED')
    # XXX The idea is to make data mandatory.
    @errors.push([@tokenizer.stream.position, data])
    raise ParseError, data if @strict
  end

  # HTML5 specific normalizations to the token stream
  def normalize_token(token)

    if token[:type] == :EmptyTag
      # A trailing solidus (/) is only tolerated on void elements; on any
      # other element it is reported as a parse error. Either way the token
      # is handled as an ordinary start tag from here on.

      unless VOID_ELEMENTS.include?(token[:name])
        parse_error(_('Solidus (/) incorrectly placed in tag.'))
      end

      token[:type] = :StartTag
    end

    if token[:type] == :StartTag
      token[:name] = token[:name].downcase

      # We need to remove the duplicate attributes and convert attributes
      # to a hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      # Reversing first makes the earliest occurrence win.

      unless token[:data].empty?
        data = token[:data].reverse.map { |attr, value| [attr.downcase, value] }
        token[:data] = Hash[*data.flatten]
      end

    elsif token[:type] == :EndTag
      parse_error(_('End tag contains unexpected attributes.')) unless token[:data].empty?
      token[:name] = token[:name].downcase
    end

    token
  end

  @@new_modes = {
    'select' => :inSelect,
    'td' => :inCell,
    'th' => :inCell,
    'tr' => :inRow,
    'tbody' => :inTableBody,
    'thead' => :inTableBody,
    'tfoot' => :inTableBody,
    'caption' => :inCaption,
    'colgroup' => :inColumnGroup,
    'table' => :inTable,
    'head' => :inBody,
    'body' => :inBody,
    'frameset' => :inFrameset
  }

  # Picks the phase matching the innermost open element that determines one,
  # walking the stack of open elements from the newest entry to the oldest.
  def reset_insertion_mode
    # The name of this method is mostly historical. (It's also used in the
    # specification.)
    last = false

    @tree.open_elements.reverse.each do |node|
      node_name = node.name

      if node == @tree.open_elements.first
        last = true
        unless ['td', 'th'].include?(node_name)
          # XXX
          # assert @inner_html
          # For the bottom of the stack the fragment's container name decides.
          node_name = @inner_html
        end
      end

      # Check for conditions that should only happen in the inner_html
      # case
      if ['select', 'colgroup', 'head', 'frameset'].include?(node_name)
        # XXX
        # assert @inner_html
      end

      if @@new_modes.has_key?(node_name)
        @phase = @phases[@@new_modes[node_name]]
      elsif node_name == 'html'
        @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
      elsif last
        @phase = @phases[:inBody]
      else
        next
      end

      break
    end
  end

  # Message hook; currently returns the string unchanged.
  def _(string); string; end
end
247
+
248
+ end