html5 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +58 -0
- data/README +9 -0
- data/Rakefile.rb +17 -0
- data/lib/html5/constants.rb +818 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
- data/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +607 -0
- data/lib/html5/html5parser/in_caption_phase.rb +68 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/html5/html5parser/in_row_phase.rb +87 -0
- data/lib/html5/html5parser/in_select_phase.rb +84 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
- data/lib/html5/html5parser/in_table_phase.rb +110 -0
- data/lib/html5/html5parser/initial_phase.rb +134 -0
- data/lib/html5/html5parser/phase.rb +158 -0
- data/lib/html5/html5parser/root_element_phase.rb +42 -0
- data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/html5/html5parser.rb +248 -0
- data/lib/html5/inputstream.rb +654 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +188 -0
- data/lib/html5/serializer/htmlserializer.rb +180 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/tokenizer.rb +968 -0
- data/lib/html5/treebuilders/base.rb +334 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +208 -0
- data/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treewalkers/base.rb +154 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5.rb +13 -0
- data/parse.rb +217 -0
- data/tests/preamble.rb +82 -0
- data/tests/test_encoding.rb +35 -0
- data/tests/test_lxp.rb +263 -0
- data/tests/test_parser.rb +68 -0
- data/tests/test_sanitizer.rb +142 -0
- data/tests/test_serializer.rb +68 -0
- data/tests/test_stream.rb +62 -0
- data/tests/test_tokenizer.rb +94 -0
- data/tests/test_treewalkers.rb +116 -0
- data/tests/tokenizer_test_parser.rb +63 -0
- metadata +120 -0
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # First phase of tree construction: waits for a DOCTYPE. Every handler other
  # than comments/whitespace hands control to the :rootElement phase.
  #
  # This phase deals with error handling as well which is currently not
  # covered in the specification. The error handling is typically known as
  # "quirks mode". It is expected that a future version of HTML5 will define this.
  class InitialPhase < Phase

    # Public identifiers (lowercase) checked by quirks_doctype? regardless of
    # the system identifier.
    QUIRKS_PUBLIC_IDS = [
      "+//silmaril//dtd html pro v0r11 19970101//en",
      "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
      "-//as//dtd html 3.0 aswedit + extensions//en",
      "-//ietf//dtd html 2.0 level 1//en",
      "-//ietf//dtd html 2.0 level 2//en",
      "-//ietf//dtd html 2.0 strict level 1//en",
      "-//ietf//dtd html 2.0 strict level 2//en",
      "-//ietf//dtd html 2.0 strict//en",
      "-//ietf//dtd html 2.0//en",
      "-//ietf//dtd html 2.1e//en",
      "-//ietf//dtd html 3.0//en",
      "-//ietf//dtd html 3.0//en//",
      "-//ietf//dtd html 3.2 final//en",
      "-//ietf//dtd html 3.2//en",
      "-//ietf//dtd html 3//en",
      "-//ietf//dtd html level 0//en",
      "-//ietf//dtd html level 0//en//2.0",
      "-//ietf//dtd html level 1//en",
      "-//ietf//dtd html level 1//en//2.0",
      "-//ietf//dtd html level 2//en",
      "-//ietf//dtd html level 2//en//2.0",
      "-//ietf//dtd html level 3//en",
      "-//ietf//dtd html level 3//en//3.0",
      "-//ietf//dtd html strict level 0//en",
      "-//ietf//dtd html strict level 0//en//2.0",
      "-//ietf//dtd html strict level 1//en",
      "-//ietf//dtd html strict level 1//en//2.0",
      "-//ietf//dtd html strict level 2//en",
      "-//ietf//dtd html strict level 2//en//2.0",
      "-//ietf//dtd html strict level 3//en",
      "-//ietf//dtd html strict level 3//en//3.0",
      "-//ietf//dtd html strict//en",
      "-//ietf//dtd html strict//en//2.0",
      "-//ietf//dtd html strict//en//3.0",
      "-//ietf//dtd html//en",
      "-//ietf//dtd html//en//2.0",
      "-//ietf//dtd html//en//3.0",
      "-//metrius//dtd metrius presentational//en",
      "-//microsoft//dtd internet explorer 2.0 html strict//en",
      "-//microsoft//dtd internet explorer 2.0 html//en",
      "-//microsoft//dtd internet explorer 2.0 tables//en",
      "-//microsoft//dtd internet explorer 3.0 html strict//en",
      "-//microsoft//dtd internet explorer 3.0 html//en",
      "-//microsoft//dtd internet explorer 3.0 tables//en",
      "-//netscape comm. corp.//dtd html//en",
      "-//netscape comm. corp.//dtd strict html//en",
      "-//o'reilly and associates//dtd html 2.0//en",
      "-//o'reilly and associates//dtd html extended 1.0//en",
      "-//spyglass//dtd html 2.0 extended//en",
      "-//sq//dtd html 2.0 hotmetal + extensions//en",
      "-//sun microsystems corp.//dtd hotjava html//en",
      "-//sun microsystems corp.//dtd hotjava strict html//en",
      "-//w3c//dtd html 3 1995-03-24//en",
      "-//w3c//dtd html 3.2 draft//en",
      "-//w3c//dtd html 3.2 final//en",
      "-//w3c//dtd html 3.2//en",
      "-//w3c//dtd html 3.2s draft//en",
      "-//w3c//dtd html 4.0 frameset//en",
      "-//w3c//dtd html 4.0 transitional//en",
      "-//w3c//dtd html experimental 19960712//en",
      "-//w3c//dtd html experimental 970421//en",
      "-//w3c//dtd w3 html//en",
      "-//w3o//dtd w3 html 3.0//en",
      "-//w3o//dtd w3 html 3.0//en//",
      "-//w3o//dtd w3 html strict 3.0//en//",
      "-//webtechs//dtd mozilla html 2.0//en",
      "-//webtechs//dtd mozilla html//en",
      "-/w3c/dtd html 4.0 transitional/en",
      "html"
    ].freeze

    # Public identifiers (lowercase) that only count when the DOCTYPE carries
    # no system identifier.
    QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID = [
      "-//w3c//dtd html 4.01 frameset//en",
      "-//w3c//dtd html 4.01 transitional//en"
    ].freeze

    # System identifier checked on its own (exact, case-sensitive comparison).
    QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    # EOF before any DOCTYPE: record the error, switch to the :rootElement
    # phase and let it process the EOF.
    def process_eof
      parse_error(_('Unexpected End of file. Expected DOCTYPE.'))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.process_eof
    end

    # Comments seen in this phase are attached to the document node itself.
    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Handles the DOCTYPE token: reports anything other than a bare
    # <!DOCTYPE html>, inserts the doctype into the tree and moves on to the
    # :rootElement phase.
    #
    # name     - doctype name as given by the tokenizer
    # publicId - public identifier, or nil
    # systemId - system identifier, or nil
    # correct  - unused here
    def processDoctype(name, publicId, systemId, correct)
      # to_s guards against a nil name so the comparison cannot raise.
      doctype_name = name.to_s.downcase

      if doctype_name != 'html' || publicId || systemId
        parse_error(_('Erroneous DOCTYPE.'))
      end
      # XXX need to update DOCTYPE tokens
      @tree.insertDoctype(name, publicId, systemId)

      if doctype_name != 'html' || quirks_doctype?(publicId, systemId)
        # XXX quirks mode
      end

      @parser.phase = @parser.phases[:rootElement]
    end

    # Whitespace before the DOCTYPE is dropped.
    def processSpaceCharacters(data)
    end

    # Text before a DOCTYPE: record the error and replay the characters in the
    # :rootElement phase.
    def processCharacters(data)
      parse_error(_('Unexpected non-space characters. Expected DOCTYPE.'))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.processCharacters(data)
    end

    # Start tag before a DOCTYPE: record the error and replay the tag in the
    # :rootElement phase.
    def processStartTag(name, attributes)
      parse_error(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.processStartTag(name, attributes)
    end

    # End tag before a DOCTYPE: record the error and replay the tag in the
    # :rootElement phase.
    def processEndTag(name)
      parse_error(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase.processEndTag(name)
    end

    private

    # True when the identifiers match one of the known doctype lists above.
    # The public identifier is lower-cased before lookup because every list
    # entry is stored in lowercase.
    def quirks_doctype?(publicId, systemId)
      public_id = publicId.to_s.downcase

      QUIRKS_PUBLIC_IDS.include?(public_id) ||
        (systemId.nil? && QUIRKS_PUBLIC_IDS_WITHOUT_SYSTEM_ID.include?(public_id)) ||
        systemId == QUIRKS_SYSTEM_ID
    end

  end
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'forwardable'

module HTML5
  # Base class for helper objects that implement each phase of processing.
  #
  # Handler methods should be in the following order (they can be omitted):
  #
  #   * EOF
  #   * Comment
  #   * Doctype
  #   * SpaceCharacters
  #   * Characters
  #   * StartTag
  #     - startTag* methods
  #   * EndTag
  #     - endTag* methods
  #
  class Phase

    extend Forwardable
    def_delegators :@parser, :parse_error

    # Builds a tag-name => handler-method-name mapping.
    #
    # The following example call:
    #
    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
    #
    # ...would return a hash equal to this:
    #
    #   { 'html' => 'startTagHtml',
    #     'base' => 'startTagBaseLinkMeta',
    #     'link' => 'startTagBaseLinkMeta',
    #     'meta' => 'startTagBaseLinkMeta',
    #     'li'   => 'startTagListItem',
    #     'dt'   => 'startTagListItem',
    #     'dd'   => 'startTagListItem' }
    #
    def self.tag_handlers(prefix, *tags)
      mapping = {}

      # A trailing hash supplies explicit handler suffixes for its keys.
      if tags.last.is_a?(Hash)
        tags.pop.each do |names, handler_method_suffix|
          handler_method = prefix + handler_method_suffix
          Array(names).each { |name| mapping[name] = handler_method }
        end
      end

      # Remaining entries derive the suffix from the capitalized tag names.
      tags.each do |names|
        names = Array(names)
        handler_method = prefix + names.map { |name| name.capitalize }.join
        names.each { |name| mapping[name] = handler_method }
      end

      mapping
    end

    # Per-class start tag dispatch table; unknown tags map to 'startTagOther'.
    def self.start_tag_handlers
      @start_tag_handlers ||= Hash.new('startTagOther')
    end

    # Declare what start tags this Phase handles. Can be called more than once.
    #
    # Example usage:
    #
    #   handle_start 'html'
    #   # html start tags will be handled by a method named 'startTagHtml'
    #
    #   handle_start %w( base link meta )
    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
    #
    #   handle_start %w( li dt dd ) => 'ListItem'
    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
    #
    def self.handle_start(*tags)
      start_tag_handlers.update tag_handlers('startTag', *tags)
    end

    # Per-class end tag dispatch table; unknown tags map to 'endTagOther'.
    def self.end_tag_handlers
      @end_tag_handlers ||= Hash.new('endTagOther')
    end

    # Declare what end tags this Phase handles. Behaves like handle_start.
    #
    def self.handle_end(*tags)
      end_tag_handlers.update tag_handlers('endTag', *tags)
    end

    # parser - the object errors are reported to and phase state is read from
    # tree   - the tree builder this phase manipulates
    def initialize(parser, tree)
      @parser, @tree = parser, tree
    end

    # Default EOF handling: generate implied end tags, then report whatever is
    # still unexpectedly open.
    def process_eof
      @tree.generateImpliedEndTags

      if @tree.open_elements.length > 2
        parse_error(_('Unexpected end of file. Missing closing tags.'))
      elsif @tree.open_elements.length == 2 && @tree.open_elements[1].name != 'body'
        # This happens for framesets or something?
        parse_error(_("Unexpected end of file. Expected end tag (#{@tree.open_elements[1].name}) first."))
      elsif @parser.inner_html && @tree.open_elements.length > 1
        # XXX This is not what the specification says. Not sure what to do here.
        parse_error(_('XXX inner_html EOF'))
      end
      # Betting ends.
    end

    def processComment(data)
      # For most phases the following is correct. Where it's not it will be
      # overridden.
      @tree.insert_comment(data, @tree.open_elements.last)
    end

    # A DOCTYPE is reported as an error and otherwise ignored by default.
    def processDoctype(name, publicId, systemId, correct)
      parse_error(_('Unexpected DOCTYPE. Ignored.'))
    end

    def processSpaceCharacters(data)
      @tree.insertText(data)
    end

    # Dispatches to the handler registered for +name+ via handle_start
    # ('startTagOther' when none is registered).
    def processStartTag(name, attributes)
      send self.class.start_tag_handlers[name], name, attributes
    end

    # Handles a repeated <html> start tag: attributes not already present on
    # the first open element are copied onto it.
    def startTagHtml(name, attributes)
      if @parser.first_start_tag == false && name == 'html'
        parse_error(_('html needs to be the first start tag.'))
      end
      # XXX Need a check here to see if the first start tag token emitted is
      # this token... If it's not, invoke parse_error.
      attributes.each do |attr, value|
        unless @tree.open_elements.first.attributes.has_key?(attr)
          @tree.open_elements.first.attributes[attr] = value
        end
      end
      @parser.first_start_tag = false
    end

    # Dispatches to the handler registered for +name+ via handle_end
    # ('endTagOther' when none is registered).
    def processEndTag(name)
      send self.class.end_tag_handlers[name], name
    end

    # Message pass-through; returns the string unchanged.
    def _(string)
      string
    end

    # Raises AssertionError when +value+ is false or nil.
    # (raise, not throw: throw is catch/throw control flow and would surface
    # as an uncaught-throw error rather than an AssertionError.)
    def assert(value)
      raise AssertionError unless value
    end

    def in_scope?(*args)
      @tree.elementInScope(*args)
    end

    # Pops open elements until one named +name+ has been popped, or, when no
    # name is given, until the block returns true for the popped element.
    # Returns the last element popped.
    def remove_open_elements_until(name=nil)
      finished = false
      until finished
        element = @tree.open_elements.pop
        finished = name.nil? ? yield(element) : element.name == name
      end
      return element
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # Phase that creates the root <html> element. Anything other than a comment
  # or whitespace causes the element to be inserted, after which the token is
  # handed to whichever phase the parser has switched to.
  class RootElementPhase < Phase

    def process_eof
      forward_after_insert(:process_eof)
    end

    # Comments are attached directly to the document node.
    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Whitespace is ignored in this phase.
    def processSpaceCharacters(data)
    end

    def processCharacters(data)
      forward_after_insert(:processCharacters, data)
    end

    def processStartTag(name, attributes)
      # Remember that <html> really was the first start tag.
      if name == 'html'
        @parser.first_start_tag = true
      end
      forward_after_insert(:processStartTag, name, attributes)
    end

    def processEndTag(name)
      forward_after_insert(:processEndTag, name)
    end

    # Creates an attribute-less html element, pushes it on the stack of open
    # elements, appends it to the document and switches the parser to the
    # :beforeHead phase.
    def insert_html_element
      html = @tree.createElement('html', {})
      @tree.open_elements.push(html)
      @tree.document.appendChild(html)
      @parser.phase = @parser.phases[:beforeHead]
    end

    private

    # Inserts the html element, then replays the event on the parser's
    # (now changed) current phase.
    def forward_after_insert(handler, *args)
      insert_html_element
      @parser.phase.send(handler, *args)
    end

  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # Phase entered once the document is expected to be finished. Anything other
  # than comments or whitespace is a parse error and sends the parser back to
  # its previous phase (@parser.last_phase).
  class TrailingEndPhase < Phase

    # Nothing left to do at EOF.
    def process_eof
    end

    # Comments are attached directly to the document node.
    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Whitespace is handled by the previous phase without switching to it.
    def processSpaceCharacters(data)
      @parser.last_phase.processSpaceCharacters(data)
    end

    def processCharacters(data)
      parse_error(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.last_phase
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      # Double quotes are required so that #{name} is interpolated.
      parse_error(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.last_phase
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      # Double quotes are required so that #{name} is interpolated.
      parse_error(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.last_phase
      @parser.phase.processEndTag(name)
    end
  end
end
|
@@ -0,0 +1,248 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
require 'html5/tokenizer'
|
3
|
+
require 'html5/treebuilders/rexml'
|
4
|
+
|
5
|
+
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
6
|
+
require 'html5/html5parser/' + File.basename(path)
|
7
|
+
end
|
8
|
+
|
9
|
+
module HTML5
|
10
|
+
|
11
|
+
# Error in parsed document
|
12
|
+
class ParseError < Exception; end
|
13
|
+
class AssertionError < Exception; end
|
14
|
+
|
15
|
+
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
|
16
|
+
#
|
17
|
+
class HTMLParser
|
18
|
+
|
19
|
+
attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table
|
20
|
+
|
21
|
+
attr_reader :phases, :tokenizer, :tree, :errors
|
22
|
+
|
23
|
+
def self.parse(stream, options = {})
|
24
|
+
encoding = options.delete(:encoding)
|
25
|
+
new(options).parse(stream,encoding)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.parse_fragment(stream, options = {})
|
29
|
+
container = options.delete(:container) || 'div'
|
30
|
+
encoding = options.delete(:encoding)
|
31
|
+
new(options).parse_fragment(stream, container, encoding)
|
32
|
+
end
|
33
|
+
|
34
|
+
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
|
35
|
+
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
|
36
|
+
|
37
|
+
# :strict - raise an exception when a parse error is encountered
|
38
|
+
# :tree - a treebuilder class controlling the type of tree that will be
|
39
|
+
# returned. Built in treebuilders can be accessed through
|
40
|
+
# HTML5::TreeBuilders[treeType]
|
41
|
+
def initialize(options = {})
|
42
|
+
@strict = false
|
43
|
+
@errors = []
|
44
|
+
|
45
|
+
@tokenizer = HTMLTokenizer
|
46
|
+
@tree = TreeBuilders::REXML::TreeBuilder
|
47
|
+
|
48
|
+
options.each {|name, value| instance_variable_set("@#{name}", value) }
|
49
|
+
@lowercase_attr_name = nil unless instance_variable_defined?(:@lowercase_attr_name)
|
50
|
+
@lowercase_element_name = nil unless instance_variable_defined?(:@lowercase_element_name)
|
51
|
+
|
52
|
+
@tree = @tree.new
|
53
|
+
|
54
|
+
@phases = @@phases.inject({}) do |phases, phase_name|
|
55
|
+
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
|
56
|
+
phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
|
57
|
+
phases
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def _parse(stream, inner_html, encoding, container = 'div')
|
62
|
+
@tree.reset
|
63
|
+
@first_start_tag = false
|
64
|
+
@errors = []
|
65
|
+
|
66
|
+
@tokenizer = @tokenizer.class unless Class === @tokenizer
|
67
|
+
@tokenizer = @tokenizer.new(stream, :encoding => encoding,
|
68
|
+
:parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)
|
69
|
+
|
70
|
+
if inner_html
|
71
|
+
case @inner_html = container.downcase
|
72
|
+
when 'title', 'textarea'
|
73
|
+
@tokenizer.content_model_flag = :RCDATA
|
74
|
+
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
|
75
|
+
@tokenizer.content_model_flag = :CDATA
|
76
|
+
when 'plaintext'
|
77
|
+
@tokenizer.content_model_flag = :PLAINTEXT
|
78
|
+
else
|
79
|
+
# content_model_flag already is PCDATA
|
80
|
+
#@tokenizer.content_model_flag = :PCDATA
|
81
|
+
end
|
82
|
+
|
83
|
+
@phase = @phases[:rootElement]
|
84
|
+
@phase.insert_html_element
|
85
|
+
reset_insertion_mode
|
86
|
+
else
|
87
|
+
@inner_html = false
|
88
|
+
@phase = @phases[:initial]
|
89
|
+
end
|
90
|
+
|
91
|
+
# We only seem to have InBodyPhase testcases where the following is
|
92
|
+
# relevant ... need others too
|
93
|
+
@last_phase = nil
|
94
|
+
|
95
|
+
# XXX This is temporary for the moment so there isn't any other
|
96
|
+
# changes needed for the parser to work with the iterable tokenizer
|
97
|
+
@tokenizer.each do |token|
|
98
|
+
token = normalize_token(token)
|
99
|
+
|
100
|
+
method = 'process%s' % token[:type]
|
101
|
+
|
102
|
+
case token[:type]
|
103
|
+
when :Characters, :SpaceCharacters, :Comment
|
104
|
+
@phase.send method, token[:data]
|
105
|
+
when :StartTag
|
106
|
+
@phase.send method, token[:name], token[:data]
|
107
|
+
when :EndTag
|
108
|
+
@phase.send method, token[:name]
|
109
|
+
when :Doctype
|
110
|
+
@phase.send method, token[:name], token[:publicId],
|
111
|
+
token[:systemId], token[:correct]
|
112
|
+
else
|
113
|
+
parse_error(token[:data])
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# When the loop finishes it's EOF
|
118
|
+
@phase.process_eof
|
119
|
+
end
|
120
|
+
|
121
|
+
# Parse a HTML document into a well-formed tree
|
122
|
+
#
|
123
|
+
# stream - a filelike object or string containing the HTML to be parsed
|
124
|
+
#
|
125
|
+
# The optional encoding parameter must be a string that indicates
|
126
|
+
# the encoding. If specified, that encoding will be used,
|
127
|
+
# regardless of any BOM or later declaration (such as in a meta
|
128
|
+
# element)
|
129
|
+
def parse(stream, encoding=nil)
|
130
|
+
_parse(stream, false, encoding)
|
131
|
+
@tree.get_document
|
132
|
+
end
|
133
|
+
|
134
|
+
# Parse a HTML fragment into a well-formed tree fragment
|
135
|
+
|
136
|
+
# container - name of the element we're setting the inner_html property
|
137
|
+
# if set to nil, default to 'div'
|
138
|
+
#
|
139
|
+
# stream - a filelike object or string containing the HTML to be parsed
|
140
|
+
#
|
141
|
+
# The optional encoding parameter must be a string that indicates
|
142
|
+
# the encoding. If specified, that encoding will be used,
|
143
|
+
# regardless of any BOM or later declaration (such as in a meta
|
144
|
+
# element)
|
145
|
+
def parse_fragment(stream, container='div', encoding=nil)
|
146
|
+
_parse(stream, true, encoding, container)
|
147
|
+
@tree.get_fragment
|
148
|
+
end
|
149
|
+
|
150
|
+
def parse_error(data = 'XXX ERROR MESSAGE NEEDED')
|
151
|
+
# XXX The idea is to make data mandatory.
|
152
|
+
@errors.push([@tokenizer.stream.position, data])
|
153
|
+
raise ParseError if @strict
|
154
|
+
end
|
155
|
+
|
156
|
+
# HTML5 specific normalizations to the token stream
|
157
|
+
def normalize_token(token)
|
158
|
+
|
159
|
+
if token[:type] == :EmptyTag
|
160
|
+
# When a solidus (/) is encountered within a tag name what happens
|
161
|
+
# depends on whether the current tag name matches that of a void
|
162
|
+
# element. If it matches a void element atheists did the wrong
|
163
|
+
# thing and if it doesn't it's wrong for everyone.
|
164
|
+
|
165
|
+
unless VOID_ELEMENTS.include?(token[:name])
|
166
|
+
parse_error(_('Solidus (/) incorrectly placed in tag.'))
|
167
|
+
end
|
168
|
+
|
169
|
+
token[:type] = :StartTag
|
170
|
+
end
|
171
|
+
|
172
|
+
if token[:type] == :StartTag
|
173
|
+
token[:name] = token[:name].downcase
|
174
|
+
|
175
|
+
# We need to remove the duplicate attributes and convert attributes
|
176
|
+
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
177
|
+
|
178
|
+
unless token[:data].empty?
|
179
|
+
data = token[:data].reverse.map {|attr, value| [attr.downcase, value] }
|
180
|
+
token[:data] = Hash[*data.flatten]
|
181
|
+
end
|
182
|
+
|
183
|
+
elsif token[:type] == :EndTag
|
184
|
+
parse_error(_('End tag contains unexpected attributes.')) unless token[:data].empty?
|
185
|
+
token[:name] = token[:name].downcase
|
186
|
+
end
|
187
|
+
|
188
|
+
token
|
189
|
+
end
|
190
|
+
|
191
|
+
@@new_modes = {
|
192
|
+
'select' => :inSelect,
|
193
|
+
'td' => :inCell,
|
194
|
+
'th' => :inCell,
|
195
|
+
'tr' => :inRow,
|
196
|
+
'tbody' => :inTableBody,
|
197
|
+
'thead' => :inTableBody,
|
198
|
+
'tfoot' => :inTableBody,
|
199
|
+
'caption' => :inCaption,
|
200
|
+
'colgroup' => :inColumnGroup,
|
201
|
+
'table' => :inTable,
|
202
|
+
'head' => :inBody,
|
203
|
+
'body' => :inBody,
|
204
|
+
'frameset' => :inFrameset
|
205
|
+
}
|
206
|
+
|
207
|
+
def reset_insertion_mode
|
208
|
+
# The name of this method is mostly historical. (It's also used in the
|
209
|
+
# specification.)
|
210
|
+
last = false
|
211
|
+
|
212
|
+
@tree.open_elements.reverse.each do |node|
|
213
|
+
node_name = node.name
|
214
|
+
|
215
|
+
if node == @tree.open_elements.first
|
216
|
+
last = true
|
217
|
+
unless ['td', 'th'].include?(node_name)
|
218
|
+
# XXX
|
219
|
+
# assert @inner_html
|
220
|
+
node_name = @inner_html
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
# Check for conditions that should only happen in the inner_html
|
225
|
+
# case
|
226
|
+
if ['select', 'colgroup', 'head', 'frameset'].include?(node_name)
|
227
|
+
# XXX
|
228
|
+
# assert @inner_html
|
229
|
+
end
|
230
|
+
|
231
|
+
if @@new_modes.has_key?(node_name)
|
232
|
+
@phase = @phases[@@new_modes[node_name]]
|
233
|
+
elsif node_name == 'html'
|
234
|
+
@phase = @phases[@tree.head_pointer.nil?? :beforeHead : :afterHead]
|
235
|
+
elsif last
|
236
|
+
@phase = @phases[:inBody]
|
237
|
+
else
|
238
|
+
next
|
239
|
+
end
|
240
|
+
|
241
|
+
break
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def _(string); string; end
|
246
|
+
end
|
247
|
+
|
248
|
+
end
|