spk-html5 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/Manifest.txt +73 -0
- data/README +45 -0
- data/Rakefile.rb +33 -0
- data/bin/html5 +7 -0
- data/lib/html5.rb +13 -0
- data/lib/html5/cli.rb +248 -0
- data/lib/html5/constants.rb +1061 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/iso639codes.rb +755 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/rfc2046.rb +31 -0
- data/lib/html5/filters/rfc3987.rb +91 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/validator.rb +834 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser.rb +247 -0
- data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
- data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/html5/html5parser/after_head_phase.rb +55 -0
- data/lib/html5/html5parser/before_head_phase.rb +44 -0
- data/lib/html5/html5parser/before_html_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +636 -0
- data/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
- data/lib/html5/html5parser/in_head_phase.rb +143 -0
- data/lib/html5/html5parser/in_row_phase.rb +96 -0
- data/lib/html5/html5parser/in_select_phase.rb +90 -0
- data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
- data/lib/html5/html5parser/in_table_phase.rb +177 -0
- data/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/html5/html5parser/phase.rb +171 -0
- data/lib/html5/inputstream.rb +735 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +209 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/sniffer.rb +45 -0
- data/lib/html5/tokenizer.rb +1059 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treebuilders/base.rb +339 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +215 -0
- data/lib/html5/treebuilders/simpletree.rb +191 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5/treewalkers/base.rb +162 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/version.rb +3 -0
- data/test/preamble.rb +69 -0
- data/test/test_cli.rb +16 -0
- data/test/test_encoding.rb +35 -0
- data/test/test_input_stream.rb +26 -0
- data/test/test_lxp.rb +283 -0
- data/test/test_parser.rb +63 -0
- data/test/test_sanitizer.rb +173 -0
- data/test/test_serializer.rb +67 -0
- data/test/test_sniffer.rb +27 -0
- data/test/test_stream.rb +71 -0
- data/test/test_tokenizer.rb +95 -0
- data/test/test_treewalkers.rb +135 -0
- data/test/test_validator.rb +31 -0
- data/test/tokenizer_test_parser.rb +67 -0
- data/test19.rb +38 -0
- metadata +198 -0
@@ -0,0 +1,158 @@
|
|
1
|
+
# Warning: this module is experimental and subject to change and even removal
|
2
|
+
# at any time.
|
3
|
+
#
|
4
|
+
# For background/rationale, see:
|
5
|
+
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
6
|
+
# * http://tinyurl.com/ylfj8k (and follow-ups)
|
7
|
+
#
|
8
|
+
# References:
|
9
|
+
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
10
|
+
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
11
|
+
#
|
12
|
+
# @@TODO:
|
13
|
+
# * Selectively lowercase only XHTML, but not foreign markup
|
14
|
+
require 'html5/html5parser'
|
15
|
+
require 'html5/constants'
|
16
|
+
|
17
|
+
module HTML5
|
18
|
+
|
19
|
+
# liberal XML parser
|
20
|
+
class XMLParser < HTMLParser

  # Builds the parser via the superclass and then replaces the :initial
  # phase with an XmlRootPhase bound to this parser and its tree.
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Rewrites a tokenizer token in place before it is dispatched and
  # returns the same token.
  #
  # * :StartTag / :EmptyTag -- attribute pairs are collapsed into a Hash in
  #   which the first occurrence of a duplicated attribute wins.  An
  #   :EmptyTag is additionally processed as a start tag right away and
  #   then turned into the matching :EndTag.
  # * :Characters -- while the tokenizer's content model flag is :CDATA the
  #   three basic entities are un-escaped.
  # * :EndTag -- a parse error is reported when the tag carries attributes.
  # * :Comment -- a comment of the form "[CDATA[...]]" becomes :Characters.
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # We need to remove the duplicate attributes and convert attributes
      # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      # Reversing first makes the earliest pair the last one written.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag
      if token[:type] == :EmptyTag
        # The content model flag is saved and restored around the start-tag
        # processing so that handling the start tag does not leave it changed.
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # un-escape RCDATA_ELEMENTS (e.g. style, script)
      if @tokenizer.content_model_flag == :CDATA
        # '&amp;' is replaced last so that "&amp;lt;" yields "&lt;", not "<".
        token[:data] = token[:data].
          gsub('&lt;', '<').gsub('&gt;', '>').gsub('&amp;', '&')
      end

    when :EndTag
      # An empty Array or Hash is truthy in Ruby, so the data has to be
      # checked for emptiness explicitly; only real attributes are an error.
      data = token[:data]
      has_attributes = data.respond_to?(:empty?) ? !data.empty? : !!data
      parse_error("attributes-in-end-tag") if has_attributes

    when :Comment
      # Rescue CDATA from the comments
      if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    return token
  end
end
|
67
|
+
|
68
|
+
# liberal XHTML parser
|
69
|
+
class XHTMLParser < XMLParser

  # Builds the parser via XMLParser, then installs an InitialPhase as the
  # :initial phase and an XhmlRootPhase as the :beforeHtml phase.
  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:beforeHtml] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token and then adjusts :EndTag tokens;
  # returns the (mutated) token.
  #
  # Tokens are keyed by Symbols everywhere else in this file, so the name
  # and data lookups here use :name / :data as well.
  def normalize_token(token)
    super(token)

    # ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted
    if token[:type] == :EndTag
      # nil when no element is open; guarded below instead of calling
      # #name on nil.
      current = @tree.open_elements[-1]

      if VOID_ELEMENTS.include? token[:name]
        # End tag of a void element that is not the current open element:
        # hand it on as an :EmptyTag, making sure it has attribute data.
        if current.nil? || current.name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      else
        if current && token[:name] == current.name && !current.hasContent
          # Insert an empty text node unless some open element declares a
          # default namespace other than XHTML.
          @tree.insertText('') unless
            @tree.open_elements.any? {|e|
              e.attributes.keys.include? 'xmlns' and
              e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
            }
        end
      end
    end

    return token
  end
end
|
103
|
+
|
104
|
+
class XhmlRootPhase < BeforeHtmlPhase
  # Creates an "html" element carrying the XHTML namespace declaration,
  # pushes it on the open-element stack, appends it to the document and
  # switches the parser to its :beforeHead phase.
  def insert_html_element
    namespace_attrs = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html_root = @tree.createElement("html", namespace_attrs)

    @tree.open_elements.push(html_root)
    @tree.document.appendChild(html_root)

    @parser.phase = @parser.phases[:beforeHead]
  end
end
|
112
|
+
|
113
|
+
class XmlRootPhase < Phase
  # Prime the Xml parser: every start tag and every end tag falls through
  # to the *Other handlers defined below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Pushes the document on the open-element stack, creates the root
  # element, appends it to the element on top of the stack, pushes it too,
  # and switches the parser to a fresh XmlElementPhase.
  def startTagOther(name, attributes)
    @tree.open_elements.push(@tree.document)

    root = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(root)
    @tree.open_elements.push(root)

    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Runs the inherited end-tag handling, then pops one entry off the
  # open-element stack.
  def endTagOther(name)
    super
    @tree.open_elements.pop
  end
end
|
129
|
+
|
130
|
+
class XmlElementPhase < Phase
  # Generic handling for all XML elements: every tag name is routed to the
  # *Other handlers below.

  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the element on top of the
  # open-element stack and pushes it on that stack.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(child)
    @tree.open_elements.push(child)
  end

  # Walks a reversed snapshot of the open-element stack.  Each element
  # whose name differs from +name+ triggers a parse error; on the first
  # element that matches, the stack is popped up to and including it and
  # the walk stops.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end

      # Pop until the matching element itself has been removed.
      loop { break if @tree.open_elements.pop == candidate }
      break
    end
  end

  # Inserts character data into the tree as text.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
|
157
|
+
|
158
|
+
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'html5/tokenizer'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module HTML5
|
6
|
+
|
7
|
+
# This module provides sanitization of XHTML+MathML+SVG
|
8
|
+
# and of inline style attributes.
|
9
|
+
#
|
10
|
+
# It can be applied either at the Tokenizer stage:
|
11
|
+
#
|
12
|
+
# HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
|
13
|
+
#
|
14
|
+
# or, if you already have a parse tree (in this example, a REXML tree),
|
15
|
+
# at the Serializer stage:
|
16
|
+
#
|
17
|
+
# tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
|
18
|
+
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
19
|
+
# :sanitize => true})
|
20
|
+
|
21
|
+
module HTMLSanitizeModule

  ACCEPTABLE_ELEMENTS = Set.new %w[a abbr acronym address area audio b big blockquote br
  button caption center cite code col colgroup dd del dfn dir div dl dt
  em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
  legend li map menu ol optgroup option p pre q s samp select small span
  strike strong sub sup table tbody td textarea tfoot th thead tr tt u
  ul var video]

  MATHML_ELEMENTS = Set.new %w[annotation annotation-xml maction math merror mfrac
  mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot mrow
  mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
  munderover none semantics]

  SVG_ELEMENTS = Set.new %w[a animate animateColor animateMotion animateTransform
  circle clipPath defs desc ellipse feGaussianBlur filter font-face
  font-face-name font-face-src foreignObject
  g glyph hkern linearGradient line marker mask metadata missing-glyph
  mpath path polygon polyline radialGradient rect set stop svg switch
  text textPath title tspan use]

  ACCEPTABLE_ATTRIBUTES = Set.new %w[abbr accept accept-charset accesskey action
  align alt axis border cellpadding cellspacing char charoff charset
  checked cite class clear cols colspan color compact coords datetime
  dir disabled enctype for frame headers height href hreflang hspace id
  ismap label lang longdesc loop loopcount loopend loopstart
  maxlength media method multiple name nohref
  noshade nowrap poster prompt readonly rel rev rows rowspan rules scope
  selected shape size span src start style summary tabindex target title
  type usemap valign value vspace width xml:lang]

  MATHML_ATTRIBUTES = Set.new %w[actiontype align close
  columnalign columnlines columnspacing columnspan depth display
  displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
  frame height linethickness lspace mathbackground mathcolor mathvariant
  maxsize minsize open other rowalign rowlines
  rowspacing rowspan rspace scriptlevel selection separator separators
  stretchy width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

  SVG_ATTRIBUTES = Set.new %w[accent-height accumulate additive alphabetic
  arabic-form ascent attributeName attributeType baseProfile bbox begin
  by calcMode cap-height class clip-path clip-rule color
  color-interpolation-filters color-rendering content cx cy d dx
  dy descent display dur end fill fill-opacity fill-rule
  filterRes filterUnits font-family
  font-size font-stretch font-style font-variant font-weight from fx fy g1
  g2 glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
  ideographic k keyPoints keySplines keyTimes lang marker-end
  marker-mid marker-start markerHeight markerUnits markerWidth
  maskContentUnits maskUnits mathematical max method min name offset opacity orient origin
  overline-position overline-thickness panose-1 path pathLength
  patternContentUnits patternTransform patternUnits points
  preserveAspectRatio primitiveUnits r refX refY repeatCount repeatDur
  requiredExtensions requiredFeatures restart rotate rx ry slope spacing
  startOffset stdDeviation stemh
  stemv stop-color stop-opacity strikethrough-position
  strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
  stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
  stroke-width systemLanguage target text-anchor to transform type u1
  u2 underline-position underline-thickness unicode unicode-range
  units-per-em values version viewBox visibility width widths x
  x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
  xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
  xmlns:xlink y y1 y2 zoomAndPan]

  ATTR_VAL_IS_URI = Set.new %w[href src cite action longdesc xlink:href xml:base]

  SVG_ATTR_VAL_ALLOWS_REF = Set.new %w[clip-path color-profile cursor fill
  filter marker marker-start marker-mid marker-end mask stroke]

  SVG_ALLOW_LOCAL_HREF = Set.new %w[altGlyph animate animateColor animateMotion
  animateTransform cursor feImage filter linearGradient pattern
  radialGradient textpath tref set use]

  ACCEPTABLE_CSS_PROPERTIES = Set.new %w[azimuth background-color
  border-bottom-color border-collapse border-color border-left-color
  border-right-color border-top-color clear color cursor direction
  display elevation float font font-family font-size font-style
  font-variant font-weight height letter-spacing line-height overflow
  pause pause-after pause-before pitch pitch-range richness speak
  speak-header speak-numeral speak-punctuation speech-rate stress
  text-align text-decoration text-indent unicode-bidi vertical-align
  voice-family volume white-space width]

  ACCEPTABLE_CSS_KEYWORDS = Set.new %w[auto aqua black block blue bold both bottom
  brown center collapse dashed dotted fuchsia gray green !important
  italic left lime maroon medium none navy normal nowrap olive pointer
  purple red right solid silver teal top transparent underline white
  yellow]

  ACCEPTABLE_SVG_PROPERTIES = Set.new %w[fill fill-opacity fill-rule stroke
  stroke-width stroke-linecap stroke-linejoin stroke-opacity]

  ACCEPTABLE_PROTOCOLS = Set.new %w[ed2k ftp http https irc mailto news gopher nntp
  telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

  # subclasses may define their own versions of these constants
  ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

  # Sanitizes one token (a Hash with :type, and for tags :name and :data)
  # in place and returns it.
  #
  # * Tags whose name is in ALLOWED_ELEMENTS keep only ALLOWED_ATTRIBUTES;
  #   URI-valued attributes with a protocol outside ALLOWED_PROTOCOLS are
  #   removed, non-local url(...) references are blanked in SVG reference
  #   attributes, non-local xlink:href is removed on SVG_ALLOW_LOCAL_HREF
  #   elements, and the style attribute is passed through #sanitize_css.
  #   :data is written back as an Array of [name, value] pairs.
  # * Any other tag is converted to a :Characters token whose :data is the
  #   tag's markup as text (attribute values HTML-escaped).
  # * :Comment tokens get their :data emptied.
  # * All other tokens are returned untouched.
  #
  # The ALLOWED_* sets are looked up on the including class so that
  # subclasses can override them.
  def sanitize_token(token)
    case token[:type]
    when :StartTag, :EndTag, :EmptyTag
      if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
        if token.has_key? :data
          # Collapse the pairs into a Hash (last duplicate wins); a nil
          # :data is treated as "no attributes".
          attrs = {}
          token[:data].to_a.each { |attr_name, attr_value| attrs[attr_name] = attr_value }

          allowed_attributes = self.class.const_get("ALLOWED_ATTRIBUTES")
          attrs.delete_if { |attr, _| !allowed_attributes.include?(attr) }

          allowed_protocols = self.class.const_get("ALLOWED_PROTOCOLS")
          ATTR_VAL_IS_URI.each do |attr|
            # Strip backticks, C0 controls + space, DEL and U+0080..U+00A0
            # before looking for a scheme, so "java\tscript:" is recognised.
            # (The range is spelled with code points: raw byte escapes such
            # as \302[\200-\240] are rejected in a UTF-8 regexp literal.)
            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).
              gsub(/[`\u0000-\u0020\u007F-\u00A0]+/, '').downcase
            if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ and
               !allowed_protocols.include?(val_unescaped.split(':')[0])
              attrs.delete attr
            end
          end

          SVG_ATTR_VAL_ALLOWS_REF.each do |attr|
            # Blank out url(...) references that do not start with '#'.
            attrs[attr] = attrs[attr].to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attrs[attr]
          end

          if SVG_ALLOW_LOCAL_HREF.include?(token[:name]) && attrs['xlink:href'] && attrs['xlink:href'] =~ /^\s*[^#\s].*/m
            attrs.delete 'xlink:href'
          end

          attrs['style'] = sanitize_css(attrs['style']) if attrs['style']

          token[:data] = attrs.to_a
        end
        return token
      else
        # Disallowed element: emit its markup as literal text instead.
        if token[:type] == :EndTag
          token[:data] = "</#{token[:name]}>"
        elsif token[:data]
          attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
          token[:data] = "<#{token[:name]}#{attrs}>"
        else
          token[:data] = "<#{token[:name]}>"
        end
        # insert(-2, ...) places the slash just before the closing '>'.
        token[:data].insert(-2,'/') if token[:type] == :EmptyTag
        token[:type] = :Characters
        token.delete(:name)
        return token
      end
    when :Comment
      token[:data] = ""
      return token
    else
      return token
    end
  end

  # Returns a cleaned copy of an inline style string, or '' when the
  # string fails either structural check ("gauntlet").
  #
  # url(...) values are removed first.  Declarations are kept when the
  # property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES, or
  # when it is a background/border/margin/padding shorthand whose every
  # keyword is an allowed keyword, colour or simple length.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet -- anchored with \A and \z: in Ruby ^ and $ match at every
    # line break, which would let a multi-line value satisfy the check with
    # its first line only.
    return '' unless style =~ /\A([-:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|$))*\z/

    allowed_properties = self.class.const_get("ALLOWED_CSS_PROPERTIES")
    allowed_keywords = self.class.const_get("ALLOWED_CSS_KEYWORDS")
    allowed_svg = self.class.const_get("ALLOWED_SVG_PROPERTIES")

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop.downcase!
      if allowed_properties.include?(prop)
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        # Shorthand: reject the declaration if any keyword is neither
        # whitelisted nor a colour / rgb() / short numeric length.
        rejected = val.split.any? do |keyword|
          !allowed_keywords.include?(keyword) and
            keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << "#{prop}: #{val};" unless rejected
      elsif allowed_svg.include?(prop)
        clean << "#{prop}: #{val};"
      end
    end

    clean.join(' ')
  end
end
|
199
|
+
|
200
|
+
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Iterates the tokens produced by the superclass, passing each one
  # through sanitize_token before yielding it to the caller's block.
  def each
    super { |raw_token| yield(sanitize_token(raw_token)) }
  end
end
|
208
|
+
|
209
|
+
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
|
3
|
+
module HTML5
|
4
|
+
|
5
|
+
class HTMLSerializer

  # Convenience entry point: builds a serializer from +options+ and
  # serializes +stream+ using options[:encoding].
  def self.serialize(stream, options = {})
    new(options).serialize(stream, options[:encoding])
  end

  # Escapes the three characters that are significant in HTML text.
  # '&' is replaced first so the entities produced for '<' and '>' are not
  # escaped again.
  def escape(string)
    string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # Sets the defaults below and then applies every entry of +options+
  # whose name matches one of those instance variables; unknown option
  # names are ignored.  Passing :quote_char switches off
  # use_best_quote_char.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true

    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @escape_lt_in_attrs = false
    @escape_rcdata = false

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # Has to be defined before the loop below, otherwise a :strict option
    # would be skipped and serialize_error could never raise.
    @strict = false

    options.each do |name, value|
      next unless instance_variable_defined?("@#{name}")
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes the token stream +treewalker+ (anything whose #each yields
  # token Hashes) to a String.
  #
  # Depending on the options, the stream is first wrapped in the meta
  # charset, whitespace, sanitizer and optional-tag filters; each filter is
  # required lazily, only when it is enabled.  When +encoding+ is given and
  # is not 'utf-8', the result is transcoded from UTF-8 to that encoding.
  # Problems are collected in @errors (and raised in strict mode).
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []

    if encoding and @inject_meta_charset
      require 'html5/filters/inject_meta_charset'
      treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
    end

    if @strip_whitespace
      require 'html5/filters/whitespace'
      treewalker = Filters::WhitespaceFilter.new(treewalker)
    end

    if @sanitize
      require 'html5/filters/sanitizer'
      treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
    end

    if @omit_optional_tags
      require 'html5/filters/optionaltags'
      treewalker = Filters::OptionalTagFilter.new(treewalker)
    end

    result = []
    treewalker.each do |token|
      case token[:type]
      when :Doctype
        result << "<!DOCTYPE %s>" % token[:name]

      when :Characters, :SpaceCharacters
        if token[:type] == :SpaceCharacters or in_cdata
          # Whitespace and RCDATA content are emitted verbatim.
          if in_cdata and token[:data].include?("</")
            serialize_error("Unexpected </ in CDATA")
          end
          result << token[:data]
        else
          result << escape(token[:data])
        end

      when :StartTag, :EmptyTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
          in_cdata = true
        elsif in_cdata
          serialize_error("Unexpected child element of a CDATA element")
        end

        markup = serialize_attributes(name, token[:data])
        if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
          markup << (@space_before_trailing_solidus ? " /" : "/")
        end
        result << "<%s%s>" % [name, markup]

      when :EndTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serialize_error("Unexpected child element of a CDATA element")
        end
        result << "</#{name}>"

      when :Comment
        serialize_error("Comment contains --") if token[:data].index("--")
        result << "<!--%s-->" % token[:data]

      else
        serialize_error(token[:data])
      end
    end

    output = result.join('')
    if encoding and encoding != 'utf-8'
      # String#encode replaces Iconv, which is no longer part of the
      # standard library; the joined output is interpreted as UTF-8.
      output.encode(encoding, 'utf-8')
    else
      output
    end
  end

  alias :render :serialize

  # Records +data+ in @errors and raises SerializeError in strict mode.
  def serialize_error(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    if @strict
      raise SerializeError
    end
  end

  private

  # Renders the attributes of element +name+ as one String (each attribute
  # preceded by a space), sorted by attribute name.
  def serialize_attributes(name, attrs)
    pieces = []
    attrs.to_a.sort.each do |k, v|
      pieces << ' ' << k
      # A minimized boolean attribute is written as the bare name.
      next if @minimize_boolean_attributes and
              ((BOOLEAN_ATTRIBUTES[name] || []).include?(k) or
               BOOLEAN_ATTRIBUTES[:global].include?(k))
      pieces << "=" << serialize_attribute_value(v)
    end
    pieces.join('')
  end

  # Escapes one attribute value and quotes it when required.
  def serialize_attribute_value(v)
    # Quoting is decided on the raw value, before any escaping.
    quote_attr = @quote_attr_values || v.empty? ||
      (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}

    v = v.gsub("&", "&amp;")
    v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
    return v unless quote_attr

    quote_char = @quote_char
    if @use_best_quote_char
      # Prefer the quote character that does not occur in the value.
      if v.index("'") and !v.index('"')
        quote_char = '"'
      elsif v.index('"') and !v.index("'")
        quote_char = "'"
      end
    end

    v = quote_char == "'" ? v.gsub("'", "&#39;") : v.gsub('"', "&quot;")
    quote_char + v + quote_char
  end

end
|
175
|
+
|
176
|
+
# Error in serialized tree
|
177
|
+
# Raised by the serializer in strict mode.  Derives from StandardError
# (not Exception) so that an ordinary `rescue` clause catches it.
class SerializeError < StandardError
end
|
179
|
+
end
|