spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,158 @@
1
+ # Warning: this module is experimental and subject to change and even removal
2
+ # at any time.
3
+ #
4
+ # For background/rationale, see:
5
+ # * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
6
+ # * http://tinyurl.com/ylfj8k (and follow-ups)
7
+ #
8
+ # References:
9
+ # * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
10
+ # * http://wiki.whatwg.org/wiki/HtmlVsXhtml
11
+ #
12
+ # @@TODO:
13
+ # * Selectively lowercase only XHTML, but not foreign markup
14
+ require 'html5/html5parser'
15
+ require 'html5/constants'
16
+
17
+ module HTML5
18
+
19
# Liberal XML parser: an HTMLParser whose :initial phase is replaced by
# XmlRootPhase and whose tokens are normalized for XML-style input.
class XMLParser < HTMLParser

  # Builds the parser via HTMLParser#initialize, then swaps in XmlRootPhase
  # as the :initial phase.
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes +token+ in place, depending on its :type, and returns it.
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # Remove duplicate attributes and convert the attribute pairs to a
      # Hash, so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      # Reversing first makes the earliest occurrence win.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process a start tag now and turn the token into the
      # matching end tag for the caller to process.
      if token[:type] == :EmptyTag
        # Preserve the tokenizer's content model flag across the start tag.
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # While the tokenizer is in the :CDATA content model, un-escape the
      # three basic entities in the character data.
      if @tokenizer.content_model_flag == :CDATA
        token[:data] = token[:data].
          gsub('&lt;', '<').gsub('&gt;', '>').gsub('&amp;', '&')
      end

    when :EndTag
      # Empty collections are truthy in Ruby, so check for actual content
      # before reporting attributes on an end tag.
      if token[:data] && !token[:data].empty?
        parse_error("attributes-in-end-tag")
      end

    when :Comment
      # Rescue CDATA sections that were tokenized as comments.
      if token[:data][0..6] == "[CDATA[" && token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7...-2]
      end
    end

    token
  end
end
67
+
68
# Liberal XHTML parser: an XMLParser that restores InitialPhase as the
# :initial phase and uses XhmlRootPhase for :beforeHtml.
class XHTMLParser < XMLParser

  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:beforeHtml] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token, then adjusts end tags; returns +token+.
  def normalize_token(token)
    super(token)

    # Ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted.
    if token[:type] == :EndTag
      if VOID_ELEMENTS.include? token[:name]
        # Tokens are keyed by Symbols throughout; String keys would always
        # read nil here and make the comparison below meaningless.
        if @tree.open_elements[-1].name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      else
        if token[:name] == @tree.open_elements[-1].name &&
            !@tree.open_elements[-1].hasContent
          # Skip the empty text node when any open element declares a
          # non-XHTML default namespace.
          @tree.insertText('') unless
            @tree.open_elements.any? { |e|
              e.attributes.keys.include?('xmlns') &&
                e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
            }
        end
      end
    end

    token
  end
end
103
+
104
# BeforeHtmlPhase variant that creates the <html> element with the XHTML
# namespace declared on it.
class XhmlRootPhase < BeforeHtmlPhase
  # Creates the namespaced <html> element, pushes it on the open-element
  # stack, appends it to the document and switches to the :beforeHead phase.
  def insert_html_element
    namespace_attrs = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html_root = @tree.createElement("html", namespace_attrs)
    @tree.open_elements.push(html_root)
    @tree.document.appendChild(html_root)
    @parser.phase = @parser.phases[:beforeHead]
  end
end
112
+
113
# Primes the XML parser: handles the first start tag of the document.
class XmlRootPhase < Phase
  # Every start/end tag name falls through to the generic handlers below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Pushes the document onto the open-element stack, creates the root
  # element beneath it, and hands parsing over to a new XmlElementPhase.
  def startTagOther(name, attributes)
    @tree.open_elements.push(@tree.document)
    root_element = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(root_element)
    @tree.open_elements.push(root_element)
    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Delegates to the inherited handler, then pops one open element.
  def endTagOther(name)
    super
    @tree.open_elements.pop
  end
end
129
+
130
# Generic handling for all XML elements.
class XmlElementPhase < Phase

  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the current node and makes it the
  # new current node.
  def startTagOther(name, attributes)
    new_element = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(new_element)
    @tree.open_elements.push(new_element)
  end

  # Walks a snapshot of the open-element stack from the top. Each node that
  # does not match +name+ is reported as a parse error; on the first match
  # the stack is popped down to and including that node.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end

      popped = @tree.open_elements.pop
      popped = @tree.open_elements.pop while popped != candidate
      break
    end
  end

  # Inserts character data as text at the current position in the tree.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
157
+
158
+ end
@@ -0,0 +1,209 @@
1
+ require 'cgi'
2
+ require 'html5/tokenizer'
3
+ require 'set'
4
+
5
+ module HTML5
6
+
7
+ # This module provides sanitization of XHTML+MathML+SVG
8
+ # and of inline style attributes.
9
+ #
10
+ # It can be applied either at the Tokenizer stage:
11
+ #
12
+ # HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
13
+ #
14
+ # or, if you already have a parse tree (in this example, a REXML tree),
15
+ # at the Serializer stage:
16
+ #
17
+ # tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
18
+ # HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
19
+ # :sanitize => true})
20
+
21
+ module HTMLSanitizeModule
22
+
23
+ ACCEPTABLE_ELEMENTS = Set.new %w[a abbr acronym address area audio b big blockquote br
24
+ button caption center cite code col colgroup dd del dfn dir div dl dt
25
+ em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
26
+ legend li map menu ol optgroup option p pre q s samp select small span
27
+ strike strong sub sup table tbody td textarea tfoot th thead tr tt u
28
+ ul var video]
29
+
30
+ MATHML_ELEMENTS = Set.new %w[annotation annotation-xml maction math merror mfrac
31
+ mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot mrow
32
+ mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
33
+ munderover none semantics]
34
+
35
+ SVG_ELEMENTS = Set.new %w[a animate animateColor animateMotion animateTransform
36
+ circle clipPath defs desc ellipse feGaussianBlur filter font-face
37
+ font-face-name font-face-src foreignObject
38
+ g glyph hkern linearGradient line marker mask metadata missing-glyph
39
+ mpath path polygon polyline radialGradient rect set stop svg switch
40
+ text textPath title tspan use]
41
+
42
+ ACCEPTABLE_ATTRIBUTES = Set.new %w[abbr accept accept-charset accesskey action
43
+ align alt axis border cellpadding cellspacing char charoff charset
44
+ checked cite class clear cols colspan color compact coords datetime
45
+ dir disabled enctype for frame headers height href hreflang hspace id
46
+ ismap label lang longdesc loop loopcount loopend loopstart
47
+ maxlength media method multiple name nohref
48
+ noshade nowrap poster prompt readonly rel rev rows rowspan rules scope
49
+ selected shape size span src start style summary tabindex target title
50
+ type usemap valign value vspace width xml:lang]
51
+
52
+ MATHML_ATTRIBUTES = Set.new %w[actiontype align close
53
+ columnalign columnlines columnspacing columnspan depth display
54
+ displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
55
+ frame height linethickness lspace mathbackground mathcolor mathvariant
56
+ maxsize minsize open other rowalign rowlines
57
+ rowspacing rowspan rspace scriptlevel selection separator separators
58
+ stretchy width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
59
+
60
+ SVG_ATTRIBUTES = Set.new %w[accent-height accumulate additive alphabetic
61
+ arabic-form ascent attributeName attributeType baseProfile bbox begin
62
+ by calcMode cap-height class clip-path clip-rule color
63
+ color-interpolation-filters color-rendering content cx cy d dx
64
+ dy descent display dur end fill fill-opacity fill-rule
65
+ filterRes filterUnits font-family
66
+ font-size font-stretch font-style font-variant font-weight from fx fy g1
67
+ g2 glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
68
+ ideographic k keyPoints keySplines keyTimes lang marker-end
69
+ marker-mid marker-start markerHeight markerUnits markerWidth
70
+ maskContentUnits maskUnits mathematical max method min name offset opacity orient origin
71
+ overline-position overline-thickness panose-1 path pathLength
72
+ patternContentUnits patternTransform patternUnits points
73
+ preserveAspectRatio primitiveUnits r refX refY repeatCount repeatDur
74
+ requiredExtensions requiredFeatures restart rotate rx ry slope spacing
75
+ startOffset stdDeviation stemh
76
+ stemv stop-color stop-opacity strikethrough-position
77
+ strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
78
+ stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
79
+ stroke-width systemLanguage target text-anchor to transform type u1
80
+ u2 underline-position underline-thickness unicode unicode-range
81
+ units-per-em values version viewBox visibility width widths x
82
+ x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
83
+ xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
84
+ xmlns:xlink y y1 y2 zoomAndPan]
85
+
86
+ ATTR_VAL_IS_URI = Set.new %w[href src cite action longdesc xlink:href xml:base]
87
+
88
+ SVG_ATTR_VAL_ALLOWS_REF = Set.new %w[clip-path color-profile cursor fill
89
+ filter marker marker-start marker-mid marker-end mask stroke]
90
+
91
+ SVG_ALLOW_LOCAL_HREF = Set.new %w[altGlyph animate animateColor animateMotion
92
+ animateTransform cursor feImage filter linearGradient pattern
93
+ radialGradient textpath tref set use]
94
+
95
+ ACCEPTABLE_CSS_PROPERTIES = Set.new %w[azimuth background-color
96
+ border-bottom-color border-collapse border-color border-left-color
97
+ border-right-color border-top-color clear color cursor direction
98
+ display elevation float font font-family font-size font-style
99
+ font-variant font-weight height letter-spacing line-height overflow
100
+ pause pause-after pause-before pitch pitch-range richness speak
101
+ speak-header speak-numeral speak-punctuation speech-rate stress
102
+ text-align text-decoration text-indent unicode-bidi vertical-align
103
+ voice-family volume white-space width]
104
+
105
+ ACCEPTABLE_CSS_KEYWORDS = Set.new %w[auto aqua black block blue bold both bottom
106
+ brown center collapse dashed dotted fuchsia gray green !important
107
+ italic left lime maroon medium none navy normal nowrap olive pointer
108
+ purple red right solid silver teal top transparent underline white
109
+ yellow]
110
+
111
+ ACCEPTABLE_SVG_PROPERTIES = Set.new %w[fill fill-opacity fill-rule stroke
112
+ stroke-width stroke-linecap stroke-linejoin stroke-opacity]
113
+
114
+ ACCEPTABLE_PROTOCOLS = Set.new %w[ed2k ftp http https irc mailto news gopher nntp
115
+ telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
116
+
117
+ # subclasses may define their own versions of these constants
118
+ ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
119
+ ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
120
+ ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
121
+ ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
122
+ ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
123
+ ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
124
+
125
# Sanitizes a single token in place and returns it.
#
# * Tags whose name is in ALLOWED_ELEMENTS keep only attributes listed in
#   ALLOWED_ATTRIBUTES; URI-valued attributes using a scheme outside
#   ALLOWED_PROTOCOLS are dropped, non-local url(...) references and
#   xlink:href values are stripped, and any style attribute is passed
#   through sanitize_css.
# * Tags with any other name are converted into a :Characters token that
#   carries the tag's markup as text.
# * Comments have their data blanked.
# * Every other token type is returned unchanged.
#
# The allow-lists are looked up through self.class.const_get so that
# including classes can provide their own versions.
def sanitize_token(token)
  case token[:type]
  when :StartTag, :EndTag, :EmptyTag
    if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
      if token.has_key? :data
        allowed_attributes = self.class.const_get("ALLOWED_ATTRIBUTES")
        allowed_protocols = self.class.const_get("ALLOWED_PROTOCOLS")

        attrs = Hash[*token[:data].flatten]
        attrs.delete_if { |attr, _value| !allowed_attributes.include?(attr) }

        ATTR_VAL_IS_URI.each do |attr|
          # Strip backticks, whitespace, control characters and U+007F-U+00A0
          # before looking for a scheme, so "java script:" style obfuscation
          # is still recognised. The class is written with code points
          # because lone high-byte escapes (\302[\200-\240]) are rejected as
          # invalid multibyte escapes in UTF-8 source files.
          # NOTE(review): assumes attribute values are UTF-8 or plain ASCII.
          val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).
            gsub(/[`\u0000-\u0020\u007f-\u00a0\s]+/, '').downcase
          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
              !allowed_protocols.include?(val_unescaped.split(':')[0])
            attrs.delete attr
          end
        end

        # Blank out url(...) references that do not start with '#'.
        SVG_ATTR_VAL_ALLOWS_REF.each do |attr|
          attrs[attr] = attrs[attr].to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attrs[attr]
        end

        # These SVG elements may only carry local ('#...') xlink:href values.
        if SVG_ALLOW_LOCAL_HREF.include?(token[:name]) && attrs['xlink:href'] && attrs['xlink:href'] =~ /^\s*[^#\s].*/m
          attrs.delete 'xlink:href'
        end

        if attrs['style']
          attrs['style'] = sanitize_css(attrs['style'])
        end

        token[:data] = attrs.map { |k, v| [k, v] }
      end
      return token
    else
      # Disallowed element: re-emit the tag as literal character data.
      if token[:type] == :EndTag
        token[:data] = "</#{token[:name]}>"
      elsif token[:data]
        attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v)}\"" }.join('')
        token[:data] = "<#{token[:name]}#{attrs}>"
      else
        token[:data] = "<#{token[:name]}>"
      end
      # Insert the trailing solidus just before the closing '>'.
      token[:data].insert(-2, '/') if token[:type] == :EmptyTag
      token[:type] = :Characters
      token.delete(:name)
      return token
    end
  when :Comment
    token[:data] = ""
    return token
  else
    return token
  end
end
171
+
172
# Sanitizes the value of an inline style attribute.
#
# url(...) references are removed first. The remaining text must pass two
# whole-string checks (allowed characters, then "prop: value;" shape) or an
# empty string is returned. Surviving declarations are kept when the
# property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES, or when
# it is a background/border/margin/padding property whose every value is an
# allowed keyword, colour or length.
#
# Returns the cleaned declarations joined by single spaces.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet -- anchored with \A and \z so the WHOLE string is validated.
  # With ^ and $ a single harmless line was enough to let any other line
  # of a multi-line value through unchecked.
  return '' unless style =~ /\A([-:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

  allowed_properties = self.class.const_get("ALLOWED_CSS_PROPERTIES")
  allowed_keywords = self.class.const_get("ALLOWED_CSS_KEYWORDS")
  allowed_svg = self.class.const_get("ALLOWED_SVG_PROPERTIES")

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop.downcase!
    if allowed_properties.include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand families: reject the declaration if any single value is
      # neither an allowed keyword nor a colour/length literal.
      rejected = val.split.any? do |keyword|
        !allowed_keywords.include?(keyword) &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless rejected
    elsif allowed_svg.include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
198
+ end
199
+
200
# Tokenizer that runs every token through HTMLSanitizeModule#sanitize_token
# before handing it to the caller.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Yields each token produced by HTMLTokenizer#each, sanitized.
  def each
    super { |raw_token| yield(sanitize_token(raw_token)) }
  end
end
208
+
209
+ end
@@ -0,0 +1,2 @@
1
+ require 'html5/serializer/htmlserializer'
2
+ require 'html5/serializer/xhtmlserializer'
@@ -0,0 +1,179 @@
1
+ require 'html5/constants'
2
+
3
+ module HTML5
4
+
5
# Serializes a stream of tree-walker tokens (Hashes with a :type key) into
# an HTML string.
class HTMLSerializer

  # Convenience wrapper: builds a serializer from +options+ and serializes
  # +stream+, using options[:encoding] as the output encoding.
  def self.serialize(stream, options = {})
    new(options).serialize(stream, options[:encoding])
  end

  # Escapes &, < and > for use in character data.
  def escape(string)
    string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # Sets the defaults below, then overrides them from +options+. Only keys
  # that name one of these settings are honoured; others are ignored.
  # Explicitly passing :quote_char disables :use_best_quote_char.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true

    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @escape_lt_in_attrs = false
    @escape_rcdata = false

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # Must be initialised here: the loop below only accepts options whose
    # instance variable already exists, so without this line :strict could
    # never be switched on even though serialize_error consults it.
    @strict = false

    options.each do |name, value|
      next unless instance_variable_defined?("@#{name}")
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes the tokens yielded by +treewalker+ and returns the result as
  # a String. When +encoding+ is given and is not 'utf-8', the result is
  # converted from UTF-8 to that encoding.
  #
  # Problems found along the way are collected in @errors; in strict mode
  # the first one raises SerializeError.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []

    # Optional filters are loaded lazily and wrap the token stream in order.
    if encoding and @inject_meta_charset
      require 'html5/filters/inject_meta_charset'
      treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
    end

    if @strip_whitespace
      require 'html5/filters/whitespace'
      treewalker = Filters::WhitespaceFilter.new(treewalker)
    end

    if @sanitize
      require 'html5/filters/sanitizer'
      treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
    end

    if @omit_optional_tags
      require 'html5/filters/optionaltags'
      treewalker = Filters::OptionalTagFilter.new(treewalker)
    end

    result = []
    treewalker.each do |token|
      type = token[:type]
      if type == :Doctype
        result << "<!DOCTYPE %s>" % token[:name]

      elsif [:Characters, :SpaceCharacters].include? type
        # Whitespace and the content of RCDATA elements are emitted verbatim.
        if type == :SpaceCharacters or in_cdata
          if in_cdata and token[:data].include?("</")
            serialize_error("Unexpected </ in CDATA")
          end
          result << token[:data]
        else
          result << escape(token[:data])
        end

      elsif [:StartTag, :EmptyTag].include? type
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
          in_cdata = true
        elsif in_cdata
          serialize_error("Unexpected child element of a CDATA element")
        end

        attributes = []
        # Attributes are emitted in sorted order.
        token[:data].to_a.sort.each do |k, v|
          attributes << ' '
          attributes << k

          # Boolean attributes are written without a value when minimizing.
          next if @minimize_boolean_attributes and
            ((BOOLEAN_ATTRIBUTES[name] || []).include?(k) or
             BOOLEAN_ATTRIBUTES[:global].include?(k))

          attributes << "="
          if @quote_attr_values or v.empty?
            quote_attr = true
          else
            quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? { |c| v.include?(c) }
          end
          v = v.gsub("&", "&amp;")
          v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
          if quote_attr
            quote_char = @quote_char
            if @use_best_quote_char
              # Prefer the quote character that does not occur in the value.
              if v.index("'") and !v.index('"')
                quote_char = '"'
              elsif v.index('"') and !v.index("'")
                quote_char = "'"
              end
            end
            if quote_char == "'"
              v = v.gsub("'", "&#39;")
            else
              v = v.gsub('"', "&quot;")
            end
            attributes << quote_char << v << quote_char
          else
            attributes << v
          end
        end

        if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
          if @space_before_trailing_solidus
            attributes << " /"
          else
            attributes << "/"
          end
        end
        result << "<%s%s>" % [name, attributes.join('')]

      elsif type == :EndTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serialize_error("Unexpected child element of a CDATA element")
        end
        result << "</#{name}>"

      elsif type == :Comment
        data = token[:data]
        serialize_error("Comment contains --") if data.index("--")
        result << "<!--%s-->" % data

      else
        # Any other token type is reported as an error carrying its data.
        serialize_error(token[:data])
      end
    end

    output = result.join('')
    if encoding and encoding != 'utf-8'
      begin
        require 'iconv'
        Iconv.iconv(encoding, 'utf-8', output).first
      rescue LoadError
        # Iconv is not available on this Ruby; use String#encode instead.
        output.encode(encoding, 'utf-8')
      end
    else
      output
    end
  end

  alias :render :serialize

  # Records +data+ in @errors and, in strict mode, raises SerializeError.
  def serialize_error(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    if @strict
      raise SerializeError
    end
  end

end
175
+
176
# Error in serialized tree.
#
# Derives from StandardError (not Exception) so that an ordinary
# `rescue => e` catches it.
class SerializeError < StandardError
end
179
+ end