html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,158 @@
1
+ # Warning: this module is experimental and subject to change and even removal
2
+ # at any time.
3
+ #
4
+ # For background/rationale, see:
5
+ # * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
6
+ # * http://tinyurl.com/ylfj8k (and follow-ups)
7
+ #
8
+ # References:
9
+ # * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
10
+ # * http://wiki.whatwg.org/wiki/HtmlVsXhtml
11
+ #
12
+ # @@TODO:
13
+ # * Selectively lowercase only XHTML, but not foreign markup
14
+ require 'html5/html5parser'
15
+ require 'html5/constants'
16
+
17
+ module HTML5
18
+
19
# Liberal XML parser.
#
# An HTMLParser whose :initial phase is replaced by XmlRootPhase and whose
# tokens are normalized so that XML-flavoured markup (empty-element tags,
# escaped character data, CDATA sections) is accepted.
class XMLParser < HTMLParser

  # options - Hash forwarded unchanged to the superclass constructor.
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes a tokenizer token in place before it is handed to a phase.
  #
  # token - Hash with a :type key and, depending on the type, :name/:data.
  #
  # Returns the same (mutated) token.
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # Convert the attribute pairs to a Hash. Reversing first lets the
      # earliest occurrence of a duplicated attribute win, so that
      # [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag: the start tag is
      # dispatched right here and the token is then turned into its end tag.
      if token[:type] == :EmptyTag
        # Processing the start tag must not leave the tokenizer in a
        # different content model, so the flag is saved and restored.
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # un-escape RCDATA_ELEMENTS (e.g. style, script)
      # '&amp;' is replaced last so already-unescaped text is not decoded twice.
      if @tokenizer.content_model_flag == :CDATA
        token[:data] = token[:data].
          gsub('&lt;', '<').gsub('&gt;', '>').gsub('&amp;', '&')
      end

    when :EndTag
      # An empty attribute collection is still truthy in Ruby, so check for
      # actual content instead of mere presence; otherwise every end tag
      # would be reported as an error.
      attributes = token[:data]
      if attributes && !attributes.empty?
        parse_error(_("End tag contains unexpected attributes."))
      end

    when :Comment
      # Rescue CDATA from the comments: a comment of the form
      # "[CDATA[ ... ]]" is turned back into character data.
      if token[:data][0..6] == "[CDATA[" && token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7...-2]
      end
    end

    token
  end
end
67
+
68
# Liberal XHTML parser.
#
# Builds on XMLParser but restores the regular InitialPhase and installs
# XhmlRootPhase as the :rootElement phase.
class XHTMLParser < XMLParser

  # options - Hash forwarded unchanged to the superclass constructor.
  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token and then adjusts end tags.
  #
  # token - Symbol-keyed token Hash (:type, :name, :data).
  #
  # Returns the same (mutated) token.
  def normalize_token(token)
    super(token)

    # ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted
    if token[:type] == :EndTag
      current = @tree.open_elements[-1]
      if VOID_ELEMENTS.include? token[:name]
        # Tokens are keyed by Symbol; looking the name up under a String key
        # always yields nil and would make this comparison always differ.
        if current.name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      elsif token[:name] == current.name && !current.hasContent
        # Only insert the empty text node when no open element declares a
        # default namespace other than XHTML.
        in_foreign_markup = @tree.open_elements.any? { |e|
          e.attributes.keys.include?('xmlns') &&
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
        }
        @tree.insertText('') unless in_foreign_markup
      end
    end

    token
  end
end
103
+
104
# Root element phase used by XHTMLParser: the implied <html> element is
# created with the XHTML namespace declared on it.
class XhmlRootPhase < RootElementPhase
  # Creates the namespaced html element, registers it as an open element,
  # attaches it to the document and switches the parser to :beforeHead.
  def insert_html_element
    namespace_attrs = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html = @tree.createElement("html", namespace_attrs)
    @tree.open_elements.push(html)
    @tree.document.appendChild(html)
    @parser.phase = @parser.phases[:beforeHead]
  end
end
112
+
113
# Initial phase of XMLParser; primes the parser when the first tag is seen.
class XmlRootPhase < Phase
  # Every start/end tag name falls through to the generic handlers below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Pushes the document onto the open-element stack, appends the new element
  # to the top of that stack, pushes the element itself and hands control to
  # a fresh XmlElementPhase.
  def startTagOther(name, attributes)
    @tree.open_elements.push(@tree.document)
    root = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(root)
    @tree.open_elements.push(root)
    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Delegates to the inherited handler, then pops the open-element stack.
  def endTagOther(name)
    super
    @tree.open_elements.pop
  end
end
129
+
130
# Generic handling for all XML elements.
class XmlElementPhase < Phase
  # Every start/end tag name is routed to the generic handlers below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the current (last) open element and
  # makes it the new current element.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(child)
    @tree.open_elements.push(child)
  end

  # Walks the open elements from innermost to outermost. Each element whose
  # name differs from +name+ is reported via parse_error; at the first match
  # the stack is popped up to and including that element.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end
      loop { break unless @tree.open_elements.pop != candidate }
      break
    end
  end

  # Character data is inserted into the tree as-is.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
157
+
158
+ end
@@ -0,0 +1,188 @@
1
+ require 'cgi'
2
+ require 'html5/tokenizer'
3
+
4
+ module HTML5
5
+
6
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
#
# It can be applied either at the Tokenizer stage:
#
#   HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
#
# or, if you already have a parse tree (in this example, a REXML tree),
# at the Serializer stage:
#
#   tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
#   HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
#                                     :sanitize => true})
module HTMLSanitizeModule

  ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
    button caption center cite code col colgroup dd del dfn dir div dl dt
    em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
    legend li map menu ol optgroup option p pre q s samp select small span
    strike strong sub sup table tbody td textarea tfoot th thead tr tt u
    ul var]

  MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
    mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
    msubsup msup mtable mtd mtext mtr munder munderover none]

  SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
    circle defs desc ellipse font-face font-face-name font-face-src g
    glyph hkern image linearGradient line marker metadata missing-glyph
    mpath path polygon polyline radialGradient rect set stop svg switch
    text title tspan use]

  ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
    align alt axis border cellpadding cellspacing char charoff charset
    checked cite class clear cols colspan color compact coords datetime
    dir disabled enctype for frame headers height href hreflang hspace id
    ismap label lang longdesc maxlength media method multiple name nohref
    noshade nowrap prompt readonly rel rev rows rowspan rules scope
    selected shape size span src start style summary tabindex target title
    type usemap valign value vspace width xml:lang]

  MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
    columnalign columnlines columnspacing columnspan depth display
    displaystyle equalcolumns equalrows fence fontstyle fontweight frame
    height linethickness lspace mathbackground mathcolor mathvariant
    mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
    rowspacing rowspan rspace scriptlevel selection separator stretchy
    width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

  SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
    arabic-form ascent attributeName attributeType baseProfile bbox begin
    by calcMode cap-height class color color-rendering content cx cy d dx
    dy descent display dur end fill fill-rule font-family font-size
    font-stretch font-style font-variant font-weight from fx fy g1 g2
    glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
    ideographic k keyPoints keySplines keyTimes lang marker-end
    marker-mid marker-start markerHeight markerUnits markerWidth
    mathematical max min name offset opacity orient origin
    overline-position overline-thickness panose-1 path pathLength points
    preserveAspectRatio r refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures restart rotate rx ry slope stemh
    stemv stop-color stop-opacity strikethrough-position
    strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
    stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
    stroke-width systemLanguage target text-anchor to transform type u1
    u2 underline-position underline-thickness unicode unicode-range
    units-per-em values version viewBox visibility width widths x
    x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
    xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
    xmlns:xlink y y1 y2 zoomAndPan]

  ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]

  ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
    border-bottom-color border-collapse border-color border-left-color
    border-right-color border-top-color clear color cursor direction
    display elevation float font font-family font-size font-style
    font-variant font-weight height letter-spacing line-height overflow
    pause pause-after pause-before pitch pitch-range richness speak
    speak-header speak-numeral speak-punctuation speech-rate stress
    text-align text-decoration text-indent unicode-bidi vertical-align
    voice-family volume white-space width]

  ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
    brown center collapse dashed dotted fuchsia gray green !important
    italic left lime maroon medium none navy normal nowrap olive pointer
    purple red right solid silver teal top transparent underline white
    yellow]

  ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
    stroke-width stroke-linecap stroke-linejoin stroke-opacity]

  ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
    telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

  # subclasses may define their own versions of these constants
  ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

  # Sanitizes a single token in place.
  #
  # * Tags whose name is in ALLOWED_ELEMENTS keep only attributes listed in
  #   ALLOWED_ATTRIBUTES; URI-valued attributes with a scheme outside
  #   ALLOWED_PROTOCOLS are dropped and a style attribute is passed through
  #   sanitize_css.
  # * Any other tag is converted into a :Characters token holding the tag's
  #   textual form, so it is later emitted as text rather than markup.
  # * Comments are emptied.
  # * All other token types are returned untouched.
  #
  # token - Hash with :type and, depending on type, :name and :data
  #         (attribute data as [name, value] pairs).
  #
  # Returns the same (mutated) token.
  def sanitize_token(token)
    case token[:type]
    when :StartTag, :EndTag, :EmptyTag
      if ALLOWED_ELEMENTS.include?(token[:name])
        if token.has_key? :data
          attrs = Hash[*token[:data].flatten]
          attrs.delete_if { |attr, _value| !ALLOWED_ATTRIBUTES.include?(attr) }
          ATTR_VAL_IS_URI.each do |attr|
            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s)
            # The cleaning pattern below is expressed in raw bytes (\302
            # followed by \200-\240 is the UTF-8 form of U+0080..U+00A0).
            # Where strings carry an encoding, match on a binary-tagged copy
            # so the byte pattern is legal and applies as written.
            if val_unescaped.respond_to?(:force_encoding)
              val_unescaped = val_unescaped.dup.force_encoding('BINARY')
            end
            val_unescaped = val_unescaped.
              gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/n, '').downcase
            if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ &&
               !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
              attrs.delete attr
            end
          end
          attrs['style'] = sanitize_css(attrs['style']) if attrs['style']
          token[:data] = attrs.map { |k, v| [k, v] }
        end
        return token
      else
        # Disallowed element: rebuild the tag as text.
        if token[:type] == :EndTag
          token[:data] = "</#{token[:name]}>"
        elsif token[:data]
          attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v)}\"" }.join('')
          token[:data] = "<#{token[:name]}#{attrs}>"
        else
          token[:data] = "<#{token[:name]}>"
        end
        # Put the solidus just before the closing '>' for empty tags.
        token[:data].insert(-2, '/') if token[:type] == :EmptyTag
        token[:type] = :Characters
        token.delete(:name)
        return token
      end
    when :Comment
      token[:data] = ""
      return token
    else
      return token
    end
  end

  # Sanitizes the value of an inline style attribute.
  #
  # style - the raw style value (converted with to_s).
  #
  # Returns a String of "property: value;" declarations joined by single
  # spaces, or '' when the input does not pass the syntax checks.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet
    # The checks are anchored with \A and \z so they cover the whole string;
    # ^ and $ match per line and would let a multi-line value through as
    # soon as any single line looked harmless.
    return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z))*\z/

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop = prop.downcase
      if ALLOWED_CSS_PROPERTIES.include?(prop)
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        # Shorthand families are accepted only when every whitespace-separated
        # part is an allowed keyword, a hex colour, an rgb() value or a
        # short length.
        acceptable = val.split.all? { |keyword|
          ALLOWED_CSS_KEYWORDS.include?(keyword) ||
            keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        }
        clean << "#{prop}: #{val};" if acceptable
      elsif ALLOWED_SVG_PROPERTIES.include?(prop)
        clean << "#{prop}: #{val};"
      end
    end

    clean.join(' ')
  end
end
178
+
179
# Tokenizer that passes every token it produces through
# HTMLSanitizeModule#sanitize_token before yielding it.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Yields each token of the underlying tokenizer after sanitizing it.
  def each
    super { |token| yield sanitize_token(token) }
  end
end
187
+
188
+ end
@@ -0,0 +1,180 @@
1
+ require 'html5/constants'
2
+
3
+ module HTML5
4
+
5
# Serializes a stream of tokens (as produced by a tree walker) to an HTML
# string.
class HTMLSerializer

  # Convenience entry point: builds a serializer from +options+ and
  # serializes +stream+, using options[:encoding] as the output encoding.
  def self.serialize(stream, options = {})
    new(options).serialize(stream, options[:encoding])
  end

  # Escapes &, < and > for use in character data.
  def escape(string)
    string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # options - Hash whose keys name the settings initialised below. Keys that
  #           do not correspond to one of these instance variables are
  #           ignored. Passing :quote_char turns off :use_best_quote_char.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true

    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @escape_lt_in_attrs = false
    @escape_rcdata = false

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # Must be initialised here: the loop below only accepts options whose
    # instance variable already exists, so without this line a :strict
    # option would be dropped and serialize_error could never raise.
    @strict = false

    options.each do |name, value|
      next unless instance_variable_defined?("@#{name}")
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes the tokens yielded by +treewalker+.
  #
  # treewalker - object responding to #each and yielding token Hashes.
  # encoding   - optional output encoding name; when given and not 'utf-8'
  #              the result is transcoded from UTF-8 to it.
  #
  # Returns the serialized String. Problems found on the way are recorded in
  # @errors, and raised as SerializeError when the :strict option is set.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []

    # Filters are loaded lazily so that unused ones are never required.
    if encoding && @inject_meta_charset
      require 'html5/filters/inject_meta_charset'
      treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
    end

    if @strip_whitespace
      require 'html5/filters/whitespace'
      treewalker = Filters::WhitespaceFilter.new(treewalker)
    end

    if @sanitize
      require 'html5/filters/sanitizer'
      treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
    end

    if @omit_optional_tags
      require 'html5/filters/optionaltags'
      treewalker = Filters::OptionalTagFilter.new(treewalker)
    end

    result = []
    treewalker.each do |token|
      type = token[:type]
      case type
      when :Doctype
        result << "<!DOCTYPE %s>" % token[:name]

      when :Characters, :SpaceCharacters
        if type == :SpaceCharacters || in_cdata
          # Whitespace and the content of RCDATA elements are emitted raw.
          if in_cdata && token[:data].include?("</")
            serialize_error(_("Unexpected </ in CDATA"))
          end
          result << token[:data]
        else
          result << escape(token[:data])
        end

      when :StartTag, :EmptyTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name) && !@escape_rcdata
          in_cdata = true
        elsif in_cdata
          serialize_error(_("Unexpected child element of a CDATA element"))
        end
        attributes = serialize_attributes(name, token[:data])
        if VOID_ELEMENTS.include?(name) && @use_trailing_solidus
          attributes << (@space_before_trailing_solidus ? " /" : "/")
        end
        result << "<%s%s>" % [name, attributes]

      when :EndTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serialize_error(_("Unexpected child element of a CDATA element"))
        end
        result << "</#{name}>"

      when :Comment
        serialize_error(_("Comment contains --")) if token[:data].index("--")
        result << "<!--%s-->" % token[:data]

      else
        serialize_error(token[:data])
      end
    end

    output = result.join('')
    return output unless encoding && encoding != 'utf-8'

    if output.respond_to?(:encode)
      # Built-in transcoding; Iconv is no longer part of the standard
      # library on current Rubies.
      output.encode(encoding, 'utf-8')
    else
      require 'iconv'
      Iconv.iconv(encoding, 'utf-8', output).first
    end
  end

  alias :render :serialize

  # Records a serialization problem and, in strict mode, raises.
  #
  # data - the error message.
  #
  # Raises SerializeError if the :strict option was set.
  def serialize_error(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    raise SerializeError, data if @strict
  end

  # Message hook; currently returns the string unchanged.
  def _(string); string; end

  private

  # Renders the attributes of one start tag, sorted by attribute name.
  #
  # name - element name, used to look up element-specific boolean attributes.
  # data - attribute collection; anything whose to_a yields [name, value]
  #        pairs.
  #
  # Returns a new String such as ' class="a b" id=x' ('' without attributes).
  def serialize_attributes(name, data)
    quote_triggers = SPACE_CHARACTERS + %w(< > " ')
    parts = []
    data.to_a.sort.each do |k, v|
      parts << ' ' << k
      # Boolean attributes are written as a bare name when minimizing.
      minimized = @minimize_boolean_attributes &&
        ((BOOLEAN_ATTRIBUTES[name] || []).include?(k) ||
         BOOLEAN_ATTRIBUTES[:global].include?(k))
      next if minimized

      parts << "="
      # Decide on quoting from the raw value, before any escaping is applied.
      quote_attr = @quote_attr_values || v.empty? ||
        quote_triggers.any? { |c| v.include?(c) }
      v = v.gsub("&", "&amp;")
      v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
      unless quote_attr
        parts << v
        next
      end

      quote_char = @quote_char
      if @use_best_quote_char
        # Prefer the quote character that does not occur in the value.
        has_single = v.index("'")
        has_double = v.index('"')
        if has_single && !has_double
          quote_char = '"'
        elsif has_double && !has_single
          quote_char = "'"
        end
      end
      v = quote_char == "'" ? v.gsub("'", "&#39;") : v.gsub('"', "&quot;")
      parts << quote_char << v << quote_char
    end
    parts.join('')
  end
end
176
+
177
# Error in serialized tree.
#
# Derives from StandardError (not Exception) so that a plain `rescue`
# clause catches it like any other application-level error.
class SerializeError < StandardError
end
180
+ end
@@ -0,0 +1,20 @@
1
+ require 'html5/serializer/htmlserializer'
2
+
3
+ module HTML5
4
+
5
# HTMLSerializer preconfigured with XHTML-style output defaults.
class XHTMLSerializer < HTMLSerializer
  # Settings applied unless overridden by the caller's options.
  DEFAULTS = {
    :quote_attr_values => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus => true,
    :escape_lt_in_attrs => true,
    :omit_optional_tags => false,
    :escape_rcdata => true
  }

  # options - Hash of serializer options; entries here take precedence over
  #           DEFAULTS. DEFAULTS itself is left unmodified.
  def initialize(options={})
    super(DEFAULTS.merge(options))
  end
end
19
+
20
+ end
@@ -0,0 +1,2 @@
1
+ require 'html5/serializer/htmlserializer'
2
+ require 'html5/serializer/xhtmlserializer'