html5 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +58 -0
- data/README +9 -0
- data/Rakefile.rb +17 -0
- data/lib/html5/constants.rb +818 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
- data/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +607 -0
- data/lib/html5/html5parser/in_caption_phase.rb +68 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/html5/html5parser/in_row_phase.rb +87 -0
- data/lib/html5/html5parser/in_select_phase.rb +84 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
- data/lib/html5/html5parser/in_table_phase.rb +110 -0
- data/lib/html5/html5parser/initial_phase.rb +134 -0
- data/lib/html5/html5parser/phase.rb +158 -0
- data/lib/html5/html5parser/root_element_phase.rb +42 -0
- data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/html5/html5parser.rb +248 -0
- data/lib/html5/inputstream.rb +654 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +188 -0
- data/lib/html5/serializer/htmlserializer.rb +180 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/tokenizer.rb +968 -0
- data/lib/html5/treebuilders/base.rb +334 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +208 -0
- data/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treewalkers/base.rb +154 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5.rb +13 -0
- data/parse.rb +217 -0
- data/tests/preamble.rb +82 -0
- data/tests/test_encoding.rb +35 -0
- data/tests/test_lxp.rb +263 -0
- data/tests/test_parser.rb +68 -0
- data/tests/test_sanitizer.rb +142 -0
- data/tests/test_serializer.rb +68 -0
- data/tests/test_stream.rb +62 -0
- data/tests/test_tokenizer.rb +94 -0
- data/tests/test_treewalkers.rb +116 -0
- data/tests/tokenizer_test_parser.rb +63 -0
- metadata +120 -0
@@ -0,0 +1,158 @@
|
|
1
|
+
# Warning: this module is experimental and subject to change and even removal
|
2
|
+
# at any time.
|
3
|
+
#
|
4
|
+
# For background/rationale, see:
|
5
|
+
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
6
|
+
# * http://tinyurl.com/ylfj8k (and follow-ups)
|
7
|
+
#
|
8
|
+
# References:
|
9
|
+
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
10
|
+
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
11
|
+
#
|
12
|
+
# @@TODO:
|
13
|
+
# * Selectively lowercase only XHTML, but not foreign markup
|
14
|
+
require 'html5/html5parser'
|
15
|
+
require 'html5/constants'
|
16
|
+
|
17
|
+
module HTML5
|
18
|
+
|
19
|
+
# liberal XML parser
|
20
|
+
class XMLParser < HTMLParser

  # Builds the parser, then swaps the :initial phase for an XmlRootPhase.
  #
  # options:: forwarded unchanged to HTMLParser#initialize
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes a tokenizer token in place and returns it.
  #
  # * :StartTag / :EmptyTag -- attribute pairs are folded into a Hash; when a
  #   name occurs more than once the first occurrence wins.  An :EmptyTag is
  #   processed as a start tag immediately and then turned into an :EndTag.
  # * :Characters -- entity-escaped text is un-escaped while the tokenizer is
  #   in the :CDATA content model.
  # * :EndTag -- attributes on an end tag are reported as a parse error.
  # * :Comment -- a comment of the form "[CDATA[...]]" becomes a :Characters
  #   token carrying the enclosed text.
  #
  # token:: Hash with at least :type, and :name / :data depending on the type
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # We need to remove the duplicate attributes and convert attributes
      # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      # Reversing first makes the earliest pair the last one written.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag
      if token[:type] == :EmptyTag
        # Processing the start tag must not leave the tokenizer in a
        # different content model, so the flag is saved and restored.
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # un-escape RCDATA_ELEMENTS (e.g. style, script)
      if @tokenizer.content_model_flag == :CDATA
        # '&amp;' goes last so that "&amp;lt;" yields "&lt;" rather than "<".
        token[:data] = token[:data].
          gsub('&lt;', '<').gsub('&gt;', '>').gsub('&amp;', '&')
      end

    when :EndTag
      # An empty Array/Hash is truthy in Ruby, so test for actual content.
      # NOTE(review): assumes the tokenizer supplies an empty collection (or
      # nil) for attribute-less end tags -- confirm against the tokenizer.
      attributes = token[:data]
      unless attributes.nil? or attributes.empty?
        parse_error(_("End tag contains unexpected attributes."))
      end

    when :Comment
      # Rescue CDATA from the comments
      if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    token
  end
end
|
67
|
+
|
68
|
+
# liberal XHTML parser
|
69
|
+
class XHTMLParser < XMLParser

  # Builds the parser with the regular InitialPhase as :initial phase and an
  # XhmlRootPhase as :rootElement phase.
  #
  # options:: forwarded unchanged to XMLParser#initialize
  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token and then adjusts end tags:
  #
  # * an end tag for a void element that is not the current open element is
  #   turned into an :EmptyTag (with an attribute Hash defaulted to {});
  # * an end tag closing the current, still content-less element gets an
  #   empty text node inserted first -- unless some open element declares a
  #   non-XHTML default namespace.
  #
  # Returns the (mutated) token.
  def normalize_token(token)
    super(token)

    # ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted
    if token[:type] == :EndTag
      current = @tree.open_elements[-1]
      if VOID_ELEMENTS.include? token[:name]
        # Tokens are keyed by symbols; string keys would always read nil.
        if current.name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      elsif token[:name] == current.name and not current.hasContent
        in_foreign_namespace = @tree.open_elements.any? { |e|
          e.attributes.keys.include? 'xmlns' and
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
        }
        @tree.insertText('') unless in_foreign_namespace
      end
    end

    token
  end
end
|
103
|
+
|
104
|
+
# Root-element phase that creates the <html> element with the XHTML
# namespace declared on it.
class XhmlRootPhase < RootElementPhase
  # Creates the namespaced <html> element, makes it the current open element,
  # attaches it to the document and moves the parser to the :beforeHead phase.
  def insert_html_element
    namespace = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html_root = @tree.createElement("html", namespace)

    @tree.open_elements.push(html_root)
    @tree.document.appendChild(html_root)

    @parser.phase = @parser.phases[:beforeHead]
  end
end
|
112
|
+
|
113
|
+
class XmlRootPhase < Phase
  # Prime the Xml parser: every tag name falls through to the *Other handlers.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Handles the first start tag: puts the document on the open-element stack,
  # appends the new element to the stack top, pushes the element as well and
  # hands the parser over to a fresh XmlElementPhase.
  def startTagOther(name, attributes)
    open_elements = @tree.open_elements
    open_elements.push(@tree.document)

    root = @tree.createElement(name, attributes)
    open_elements[-1].appendChild(root)
    open_elements.push(root)

    @parser.phase = XmlElementPhase.new(@parser,@tree)
  end

  # Delegates to the inherited handler, then pops the top open element.
  def endTagOther(name)
    super
    @tree.open_elements.pop
  end
end
|
129
|
+
|
130
|
+
class XmlElementPhase < Phase
  # Generic handling for all XML elements

  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the current open element and makes it
  # the new current open element.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(child)
    @tree.open_elements.push(child)
  end

  # Walks the open elements from the innermost outwards.  Each element whose
  # name differs from +name+ is reported as a parse error; at the first
  # element with a matching name the stack is popped up to and including it.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end

      # Pop at least once, and keep popping until the match itself came off.
      popped = @tree.open_elements.pop
      popped = @tree.open_elements.pop while popped != candidate
      break
    end
  end

  # Character data is inserted into the tree as text, unchanged.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
|
157
|
+
|
158
|
+
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'html5/tokenizer'
|
3
|
+
|
4
|
+
module HTML5
|
5
|
+
|
6
|
+
# This module provides sanitization of XHTML+MathML+SVG
|
7
|
+
# and of inline style attributes.
|
8
|
+
#
|
9
|
+
# It can be applied either at the Tokenizer stage:
|
10
|
+
#
|
11
|
+
# HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
|
12
|
+
#
|
13
|
+
# or, if you already have a parse tree (in this example, a REXML tree),
|
14
|
+
# at the Serializer stage:
|
15
|
+
#
|
16
|
+
# tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
|
17
|
+
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
18
|
+
# :sanitize => true})
|
19
|
+
|
20
|
+
module HTMLSanitizeModule

  ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
  button caption center cite code col colgroup dd del dfn dir div dl dt
  em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
  legend li map menu ol optgroup option p pre q s samp select small span
  strike strong sub sup table tbody td textarea tfoot th thead tr tt u
  ul var]

  MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
  mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
  msubsup msup mtable mtd mtext mtr munder munderover none]

  SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
  circle defs desc ellipse font-face font-face-name font-face-src g
  glyph hkern image linearGradient line marker metadata missing-glyph
  mpath path polygon polyline radialGradient rect set stop svg switch
  text title tspan use]

  ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
  align alt axis border cellpadding cellspacing char charoff charset
  checked cite class clear cols colspan color compact coords datetime
  dir disabled enctype for frame headers height href hreflang hspace id
  ismap label lang longdesc maxlength media method multiple name nohref
  noshade nowrap prompt readonly rel rev rows rowspan rules scope
  selected shape size span src start style summary tabindex target title
  type usemap valign value vspace width xml:lang]

  MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
  columnalign columnlines columnspacing columnspan depth display
  displaystyle equalcolumns equalrows fence fontstyle fontweight frame
  height linethickness lspace mathbackground mathcolor mathvariant
  mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
  rowspacing rowspan rspace scriptlevel selection separator stretchy
  width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

  SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
  arabic-form ascent attributeName attributeType baseProfile bbox begin
  by calcMode cap-height class color color-rendering content cx cy d dx
  dy descent display dur end fill fill-rule font-family font-size
  font-stretch font-style font-variant font-weight from fx fy g1 g2
  glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
  ideographic k keyPoints keySplines keyTimes lang marker-end
  marker-mid marker-start markerHeight markerUnits markerWidth
  mathematical max min name offset opacity orient origin
  overline-position overline-thickness panose-1 path pathLength points
  preserveAspectRatio r refX refY repeatCount repeatDur
  requiredExtensions requiredFeatures restart rotate rx ry slope stemh
  stemv stop-color stop-opacity strikethrough-position
  strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
  stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
  stroke-width systemLanguage target text-anchor to transform type u1
  u2 underline-position underline-thickness unicode unicode-range
  units-per-em values version viewBox visibility width widths x
  x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
  xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
  xmlns:xlink y y1 y2 zoomAndPan]

  ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]

  ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
  border-bottom-color border-collapse border-color border-left-color
  border-right-color border-top-color clear color cursor direction
  display elevation float font font-family font-size font-style
  font-variant font-weight height letter-spacing line-height overflow
  pause pause-after pause-before pitch pitch-range richness speak
  speak-header speak-numeral speak-punctuation speech-rate stress
  text-align text-decoration text-indent unicode-bidi vertical-align
  voice-family volume white-space width]

  ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
  brown center collapse dashed dotted fuchsia gray green !important
  italic left lime maroon medium none navy normal nowrap olive pointer
  purple red right solid silver teal top transparent underline white
  yellow]

  ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
  stroke-width stroke-linecap stroke-linejoin stroke-opacity]

  ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
  telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

  # subclasses may define their own versions of these constants
  ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

  # Sanitizes a single token in place and returns it.
  #
  # * Tag tokens for an allowed element keep only allowed attributes; URI
  #   attributes with a scheme outside ALLOWED_PROTOCOLS are dropped and the
  #   style attribute is passed through #sanitize_css.
  # * Tag tokens for any other element are converted into a :Characters
  #   token holding the tag's markup as text (attribute values HTML-escaped).
  # * :Comment tokens have their data emptied.
  # * All other tokens are returned untouched.
  #
  # token:: Hash with :type and, for tags, :name and optionally :data
  #         (attribute pairs)
  def sanitize_token(token)
    case token[:type]
    when :StartTag, :EndTag, :EmptyTag
      if ALLOWED_ELEMENTS.include?(token[:name])
        if token.has_key? :data
          attrs = Hash[*token[:data].flatten]
          attrs.delete_if { |attr, _value| !ALLOWED_ATTRIBUTES.include?(attr) }
          ATTR_VAL_IS_URI.each do |attr|
            # Strip backticks, ASCII control characters/whitespace and the
            # code points U+0080..U+00A0 before looking for a scheme, so that
            # e.g. "java\tscript:" cannot slip past the protocol check.
            # The \u range replaces the byte escape \302[\200-\240], which
            # Ruby >= 1.9 rejects as an invalid multibyte escape.
            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).
              gsub(/`|[\000-\040\177\s]+|[\u0080-\u00A0]/, '').downcase
            if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ and
               !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
              attrs.delete attr
            end
          end
          if attrs['style']
            attrs['style'] = sanitize_css(attrs['style'])
          end
          token[:data] = attrs.map { |k, v| [k, v] }
        end
        return token
      else
        # Disallowed element: emit the tag as inert text instead of markup.
        if token[:type] == :EndTag
          token[:data] = "</#{token[:name]}>"
        elsif token[:data]
          attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v)}\"" }.join('')
          token[:data] = "<#{token[:name]}#{attrs}>"
        else
          token[:data] = "<#{token[:name]}>"
        end
        # Put the solidus just before the closing '>'.
        token[:data].insert(-2, '/') if token[:type] == :EmptyTag
        token[:type] = :Characters
        token.delete(:name)
        return token
      end
    when :Comment
      token[:data] = ""
      return token
    else
      return token
    end
  end

  # Returns a cleaned copy of an inline style value, or '' when the value as
  # a whole does not pass the syntax checks.
  #
  # url(...) references are removed first.  A declaration is kept when its
  # property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES, or when
  # it is a background/border/margin/padding property whose every value word
  # is an allowed keyword, a hex colour, an rgb() triple or a short length.
  #
  # style:: anything responding to to_s
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet -- anchored with \A and \z so the WHOLE value must match;
    # with ^ and $ a single well-formed line was enough to let arbitrary
    # content on the other lines through.
    return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\z/

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop = prop.downcase
      if ALLOWED_CSS_PROPERTIES.include?(prop)
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        rejected = val.split.any? do |keyword|
          !ALLOWED_CSS_KEYWORDS.include?(keyword) and
            keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << "#{prop}: #{val};" unless rejected
      elsif ALLOWED_SVG_PROPERTIES.include?(prop)
        clean << "#{prop}: #{val};"
      end
    end

    clean.join(' ')
  end
end
|
178
|
+
|
179
|
+
# Tokenizer whose tokens are sanitized before they reach the caller.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Yields every token produced by HTMLTokenizer#each after running it
  # through #sanitize_token.
  def each
    super { |raw_token| yield(sanitize_token(raw_token)) }
  end
end
|
187
|
+
|
188
|
+
end
|
@@ -0,0 +1,180 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
|
3
|
+
module HTML5
|
4
|
+
|
5
|
+
# Turns a stream of tree-walker tokens into an HTML string.
class HTMLSerializer

  # Convenience wrapper: builds a serializer from +options+ and serializes
  # +stream+, using options[:encoding] as the output encoding.
  def self.serialize(stream, options = {})
    new(options).serialize(stream, options[:encoding])
  end

  # Escapes the characters that are significant in HTML text content.
  # '&' is handled first so the entities produced for '<' and '>' are not
  # escaped a second time.
  def escape(string)
    string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # Sets the default for every setting, then applies +options+.  Only keys
  # that name one of the instance variables initialised below are honoured;
  # anything else (e.g. :encoding) is ignored here.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true

    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @escape_lt_in_attrs = false
    @escape_rcdata = false

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # Must exist before the loop below, otherwise a :strict option would be
    # skipped and #serialize_error could never raise.
    @strict = false

    options.each do |name, value|
      next unless instance_variable_defined?("@#{name}")
      # An explicit quote character switches off automatic quote selection.
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes the tokens yielded by +treewalker+ and returns the markup as
  # one String.
  #
  # treewalker:: object whose #each yields token Hashes (:type plus :name /
  #              :data depending on the type)
  # encoding::   optional output encoding name; any value other than 'utf-8'
  #              makes the result get transcoded from UTF-8
  #
  # Problems are collected through #serialize_error, which raises
  # SerializeError when the :strict option is set.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []

    # Filters are loaded lazily, only when the matching option is enabled.
    if encoding and @inject_meta_charset
      require 'html5/filters/inject_meta_charset'
      treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
    end

    if @strip_whitespace
      require 'html5/filters/whitespace'
      treewalker = Filters::WhitespaceFilter.new(treewalker)
    end

    if @sanitize
      require 'html5/filters/sanitizer'
      treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
    end

    if @omit_optional_tags
      require 'html5/filters/optionaltags'
      treewalker = Filters::OptionalTagFilter.new(treewalker)
    end

    result = []
    treewalker.each do |token|
      type = token[:type]
      if type == :Doctype
        result << ("<!DOCTYPE %s>" % token[:name])

      elsif [:Characters, :SpaceCharacters].include? type
        # Whitespace and the content of RCDATA elements are written verbatim.
        if type == :SpaceCharacters or in_cdata
          if in_cdata and token[:data].include?("</")
            serialize_error(_("Unexpected </ in CDATA"))
          end
          result << token[:data]
        else
          result << escape(token[:data])
        end

      elsif [:StartTag, :EmptyTag].include? type
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
          in_cdata = true
        elsif in_cdata
          serialize_error(_("Unexpected child element of a CDATA element"))
        end

        attributes = []
        token[:data].to_a.sort.each do |k, v|
          attributes << ' '
          attributes << k

          # A boolean attribute is written as the bare name when minimizing.
          if not @minimize_boolean_attributes or \
            (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
            and !BOOLEAN_ATTRIBUTES[:global].include?(k))
            attributes << "="
            if @quote_attr_values or v.empty?
              quote_attr = true
            else
              quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
            end
            v = v.gsub("&", "&amp;")
            v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
            if quote_attr
              quote_char = @quote_char
              if @use_best_quote_char
                # Prefer the quote character that does not occur in the value.
                if v.index("'") and !v.index('"')
                  quote_char = '"'
                elsif v.index('"') and !v.index("'")
                  quote_char = "'"
                end
              end
              # Escape whichever quote character delimits the value.
              if quote_char == "'"
                v = v.gsub("'", "&#39;")
              else
                v = v.gsub('"', "&quot;")
              end
              attributes << quote_char << v << quote_char
            else
              attributes << v
            end
          end
        end

        if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
          if @space_before_trailing_solidus
            attributes << " /"
          else
            attributes << "/"
          end
        end
        result << ("<%s%s>" % [name, attributes.join('')])

      elsif type == :EndTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serialize_error(_("Unexpected child element of a CDATA element"))
        end
        result << "</#{name}>"

      elsif type == :Comment
        data = token[:data]
        serialize_error(_("Comment contains --")) if data.index("--")
        result << ("<!--%s-->" % data)

      else
        # Any other token type is recorded as an error carrying its data.
        serialize_error(token[:data])
      end
    end

    output = result.join('')
    return output unless encoding and encoding != 'utf-8'

    if output.respond_to?(:encode)
      # Ruby >= 1.9: built-in transcoding (Iconv is gone since Ruby 2.0).
      output.encode(encoding, 'utf-8')
    else
      require 'iconv'
      Iconv.iconv(encoding, 'utf-8', output).first
    end
  end

  alias :render :serialize

  # Records +data+ in @errors and, when the :strict option is set, raises
  # SerializeError with +data+ as its message.
  def serialize_error(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    raise SerializeError, data if @strict
  end

  # Translation hook for messages; returns the string unchanged.
  def _(string); string; end
end
|
176
|
+
|
177
|
+
# Error in serialized tree
|
178
|
+
# Error in serialized tree.
#
# Derives from StandardError (not Exception) so that an ordinary +rescue+
# clause catches it; rescuing SerializeError or Exception by name still works.
class SerializeError < StandardError
end
|
180
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'html5/serializer/htmlserializer'
|
2
|
+
|
3
|
+
module HTML5
|
4
|
+
|
5
|
+
# HTMLSerializer preconfigured with XHTML-style output settings.
class XHTMLSerializer < HTMLSerializer
  # Settings applied unless the caller overrides them.
  DEFAULTS = {
    :quote_attr_values           => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus        => true,
    :escape_lt_in_attrs          => true,
    :omit_optional_tags          => false,
    :escape_rcdata               => true
  }

  # Caller-supplied options take precedence over DEFAULTS; DEFAULTS itself is
  # left unmodified.
  def initialize(options={})
    effective = DEFAULTS.merge(options)
    super(effective)
  end
end
|
19
|
+
|
20
|
+
end
|