rexml 3.1.7.3

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of rexml might be problematic. Click here for more details.

Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.travis.yml +10 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +60 -0
  7. data/Rakefile +10 -0
  8. data/bin/console +14 -0
  9. data/bin/setup +8 -0
  10. data/lib/rexml/attlistdecl.rb +63 -0
  11. data/lib/rexml/attribute.rb +192 -0
  12. data/lib/rexml/cdata.rb +68 -0
  13. data/lib/rexml/child.rb +97 -0
  14. data/lib/rexml/comment.rb +80 -0
  15. data/lib/rexml/doctype.rb +270 -0
  16. data/lib/rexml/document.rb +291 -0
  17. data/lib/rexml/dtd/attlistdecl.rb +11 -0
  18. data/lib/rexml/dtd/dtd.rb +47 -0
  19. data/lib/rexml/dtd/elementdecl.rb +18 -0
  20. data/lib/rexml/dtd/entitydecl.rb +57 -0
  21. data/lib/rexml/dtd/notationdecl.rb +40 -0
  22. data/lib/rexml/element.rb +1267 -0
  23. data/lib/rexml/encoding.rb +51 -0
  24. data/lib/rexml/entity.rb +171 -0
  25. data/lib/rexml/formatters/default.rb +112 -0
  26. data/lib/rexml/formatters/pretty.rb +142 -0
  27. data/lib/rexml/formatters/transitive.rb +58 -0
  28. data/lib/rexml/functions.rb +447 -0
  29. data/lib/rexml/instruction.rb +71 -0
  30. data/lib/rexml/light/node.rb +196 -0
  31. data/lib/rexml/namespace.rb +48 -0
  32. data/lib/rexml/node.rb +76 -0
  33. data/lib/rexml/output.rb +30 -0
  34. data/lib/rexml/parent.rb +166 -0
  35. data/lib/rexml/parseexception.rb +52 -0
  36. data/lib/rexml/parsers/baseparser.rb +586 -0
  37. data/lib/rexml/parsers/lightparser.rb +59 -0
  38. data/lib/rexml/parsers/pullparser.rb +197 -0
  39. data/lib/rexml/parsers/sax2parser.rb +273 -0
  40. data/lib/rexml/parsers/streamparser.rb +61 -0
  41. data/lib/rexml/parsers/treeparser.rb +101 -0
  42. data/lib/rexml/parsers/ultralightparser.rb +57 -0
  43. data/lib/rexml/parsers/xpathparser.rb +675 -0
  44. data/lib/rexml/quickpath.rb +266 -0
  45. data/lib/rexml/rexml.rb +32 -0
  46. data/lib/rexml/sax2listener.rb +98 -0
  47. data/lib/rexml/security.rb +28 -0
  48. data/lib/rexml/source.rb +298 -0
  49. data/lib/rexml/streamlistener.rb +93 -0
  50. data/lib/rexml/syncenumerator.rb +33 -0
  51. data/lib/rexml/text.rb +424 -0
  52. data/lib/rexml/undefinednamespaceexception.rb +9 -0
  53. data/lib/rexml/validation/relaxng.rb +539 -0
  54. data/lib/rexml/validation/validation.rb +144 -0
  55. data/lib/rexml/validation/validationexception.rb +10 -0
  56. data/lib/rexml/xmldecl.rb +116 -0
  57. data/lib/rexml/xmltokens.rb +85 -0
  58. data/lib/rexml/xpath.rb +81 -0
  59. data/lib/rexml/xpath_parser.rb +934 -0
  60. data/rexml.gemspec +42 -0
  61. metadata +131 -0
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: false
2
module REXML
  # Template for stream parser listeners. Every callback below is a no-op, so
  # an including class only has to override the events it is interested in.
  #
  # The declaration callbacks (attlistdecl, elementdecl, etc) are handled
  # trivially; REXML doesn't yet process doctype entity declarations, so a
  # listener has to parse them out itself.
  module StreamListener
    # Invoked when a start tag is encountered.
    # +name+::  the tag name
    # +attrs+:: an array of [attribute, value] pairs, suitable for use with
    #           assoc or rassoc. For example <tag attr1="value1" attr2="value2">
    #           results in
    #             tag_start( "tag", [["attr1","value1"],["attr2","value2"]] )
    def tag_start(name, attrs)
    end

    # Invoked when the end tag is reached. For an empty element such as
    # <tag/>, tag_end is called immediately after tag_start.
    # +name+:: the name of the tag
    def tag_end(name)
    end

    # Invoked when text is encountered in the document.
    # +text+:: the text content
    def text(text)
    end

    # Invoked when a processing instruction is encountered,
    # e.g. <?xsl sheet='foo'?>
    # +name+::        the instruction name; "xsl" in the example
    # +instruction+:: the rest of the instruction; "sheet='foo'" in the example
    def instruction(name, instruction)
    end

    # Invoked when a comment is encountered.
    # +comment+:: the content of the comment
    def comment(comment)
    end

    # Handles a doctype declaration. Any part of the doctype that is not
    # supplied is nil. Example: <!DOCTYPE me PUBLIC "foo" "bar">
    # +name+::      the name of the doctype; "me" in the example
    # +pub_sys+::   "PUBLIC", "SYSTEM", or nil; "PUBLIC" in the example
    # +long_name+:: the supplied long name, or nil; "foo" in the example
    # +uri+::       the uri of the doctype, or nil; "bar" in the example
    def doctype(name, pub_sys, long_name, uri)
    end

    # Invoked when the doctype is done.
    def doctype_end
    end

    # Invoked for an ATTLIST declaration inside a doctype. The declaration
    # content is passed unparsed: <!ATTLIST el attr CDATA #REQUIRED> arrives as
    # "el attr CDATA #REQUIRED". The same holds for all of the .*decl methods.
    # NOTE(review): presumably +raw_content+ carries that unparsed text while
    # +element_name+ and +attributes+ carry parsed pieces -- confirm against
    # the parser that drives this listener.
    def attlistdecl(element_name, attributes, raw_content)
    end

    # <!ELEMENT ...>
    def elementdecl(content)
    end

    # <!ENTITY ...>
    # The argument is an array describing the entity declaration. It comes in
    # several shapes; in general (declaration, then resulting argument):
    #   <!ENTITY % YN '"Yes"'>
    #   ["YN", "\"Yes\"", "%"]
    #   <!ENTITY % YN 'Yes'>
    #   ["YN", "Yes", "%"]
    #   <!ENTITY WhatHeSaid "He said %YN;">
    #   ["WhatHeSaid", "He said %YN;"]
    #   <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
    #   ["open-hatch", "SYSTEM", "http://www.textuality.com/boilerplate/OpenHatch.xml"]
    #   <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">
    #   ["open-hatch", "PUBLIC", "-//Textuality//TEXT Standard open-hatch boilerplate//EN", "http://www.textuality.com/boilerplate/OpenHatch.xml"]
    #   <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif>
    #   ["hatch-pic", "SYSTEM", "../grafix/OpenHatch.gif", "gif"]
    def entitydecl(content)
    end

    # <!NOTATION ...>
    def notationdecl(content)
    end

    # Invoked when %foo; is encountered in a doctype declaration.
    # +content+:: "foo"
    def entity(content)
    end

    # Invoked when <![CDATA[ ... ]]> is encountered in a document.
    # +content+:: "..."
    def cdata(content)
    end

    # Invoked when an XML declaration is encountered in the document,
    # e.g. <?xml version="1.0" encoding="utf"?>
    # +version+::    the version attribute value; "1.0" in the example
    # +encoding+::   the encoding attribute value, or nil; "utf" in the example
    # +standalone+:: the standalone attribute value, or nil; nil in the example
    def xmldecl(version, encoding, standalone)
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: false
2
module REXML
  # Walks several indexable collections in lock-step, yielding one "row"
  # (the i-th element of every collection) per iteration.
  class SyncEnumerator
    include Enumerable

    # Creates a new SyncEnumerator which enumerates rows of the given
    # Enumerable objects. Each object must respond to +size+ and +[]+.
    # With no arguments the enumerator is simply empty.
    def initialize(*enums)
      @gens = enums
      # +max+ of an empty list is nil; fall back to 0 so #each is a no-op
      # instead of failing on nil.times.
      @length = @gens.collect {|x| x.size }.max || 0
    end

    # Returns the number of enumerated Enumerable objects, i.e. the size
    # of each row (not the number of rows).
    def size
      @gens.size
    end

    # Same value as #size: the number of enumerated Enumerable objects.
    def length
      @gens.length
    end

    # Enumerates rows of the Enumerable objects. The number of rows equals the
    # size of the longest object; shorter objects contribute whatever their
    # +[]+ returns for the missing positions (nil for Arrays).
    #
    # Returns +self+ when a block is given, otherwise an Enumerator over the
    # rows.
    def each
      return to_enum(:each) { @length } unless block_given?

      @length.times {|i|
        yield @gens.collect {|x| x[i]}
      }
      self
    end
  end
end
@@ -0,0 +1,424 @@
1
+ # frozen_string_literal: false
2
+ require_relative 'security'
3
+ require_relative 'entity'
4
+ require_relative 'doctype'
5
+ require_relative 'child'
6
+ require_relative 'doctype'
7
+ require_relative 'parseexception'
8
+
9
+ module REXML
10
+ # Represents text nodes in an XML document
11
+ class Text < Child
12
+ include Comparable
13
# Patterns escaped when writing, listed in the order the substitutions are
# applied. The first one matches only an ampersand that does not already
# start an entity or character reference.
SPECIALS = [
  /&(?!#?[\w-]+;)/u,
  /</u,
  />/u,
  /"/u,
  /'/u,
  /\r/u
]
# Replacement for the pattern at the same index in SPECIALS.
SUBSTITUTES = %w[&amp; &lt; &gt; &quot; &apos; &#13;]
# Characters which are substituted in written strings. The two names below
# are SPECIALS and SUBSTITUTES spelled backwards: they describe the reverse
# (reading) direction, and are paired by index.
SLAICEPS = %w[< > " ' &]
SETUTITSBUS = [
  /&lt;/u,
  /&gt;/u,
  /&quot;/u,
  /&apos;/u,
  /&amp;/u
]
19
+
20
+ # If +raw+ is true, then REXML leaves the value alone
21
+ attr_accessor :raw
22
+
23
+ NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
24
+ NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
25
# Code points that may appear in XML character data: tab, line feed,
# carriage return, and the three ranges below (surrogates, U+FFFE and U+FFFF
# are excluded).
VALID_CHAR = [
  0x9, 0xA, 0xD,
  (0x20..0xD7FF),
  (0xE000..0xFFFD),
  (0x10000..0x10FFFF)
]

# Matches a string made up entirely of VALID_CHAR code points.
#
# The pattern is anchored with \A and \z. With ^ and $ (line anchors in Ruby)
# a string such as "ok\n\u0000" would match on its first line alone, so an
# illegal character placed after a newline would go undetected.
if String.method_defined? :encode
  VALID_XML_CHARS = Regexp.new('\A['+
    VALID_CHAR.map { |item|
      case item
      when Integer
        [item].pack('U').force_encoding('utf-8')
      when Range
        [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
      end
    }.join +
  ']*\z')
else
  # Byte-oriented fallback for Rubies whose String has no #encode.
  VALID_XML_CHARS = /\A(
       [\x09\x0A\x0D\x20-\x7E]            # ASCII
     | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
     |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
     | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
     |  \xEF[\x80-\xBE]{2}                #
     |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
     |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
     |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
     | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
     |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
    )*\z/nx;
end
57
+
58
+ # Constructor
59
+ # +arg+ if a String, the content is set to the String. If a Text,
60
+ # the object is shallowly cloned.
61
+ #
62
+ # +respect_whitespace+ (boolean, false) if true, whitespace is
63
+ # respected
64
+ #
65
+ # +parent+ (nil) if this is a Parent object, the parent
66
+ # will be set to this.
67
+ #
68
+ # +raw+ (nil) This argument can be given three values.
69
+ # If true, then the value used to construct this object is expected to
70
+ # contain no unescaped XML markup, and REXML will not change the text. If
71
+ # this value is false, the string may contain any characters, and REXML will
72
+ # escape any and all defined entities whose values are contained in the
73
+ # text. If this value is nil (the default), then the raw value of the
74
+ # parent will be used as the raw value for this node. If there is no raw
75
+ # value for the parent, and no value is supplied, the default is false.
76
+ # Use this field if you have entities defined for some text, and you don't
77
+ # want REXML to escape that text in output.
78
+ # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
79
+ # Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
80
+ # Text.new( "<&", false, nil, true ) #-> Parse exception
81
+ # Text.new( "&lt;&amp;", false, nil, true ) #-> "&lt;&amp;"
82
+ # # Assume that the entity "s" is defined to be "sean"
83
+ # # and that the entity "r" is defined to be "russell"
84
+ # Text.new( "sean russell" ) #-> "&s; &r;"
85
+ # Text.new( "sean russell", false, nil, true ) #-> "sean russell"
86
+ #
87
+ # +entity_filter+ (nil) This can be an array of entities to match in the
88
+ # supplied text. This argument is only useful if +raw+ is set to false.
89
+ # Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
90
+ # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
91
+ # In the last example, the +entity_filter+ argument is ignored.
92
+ #
93
+ # +illegal+ INTERNAL USE ONLY
94
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
               entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

  # Defaults; overridden below by the parent, by a copied Text, or by the
  # explicit +raw+ / +entity_filter+ arguments (in that order).
  @raw = false
  @parent = nil
  @entity_filter = nil

  if parent
    super( parent )
    @raw = parent.raw
  end

  if arg.kind_of? String
    @string = arg.dup
  elsif arg.kind_of? Text
    # Shallow copy: take over the other node's content and settings.
    @string = arg.instance_variable_get(:@string).dup
    @raw = arg.raw
    @entity_filter = arg.instance_variable_get(:@entity_filter)
  else
    # Report the offending class. (Object#type no longer exists, so the
    # message must be built from #class.)
    raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
  end

  # Collapse runs of the same whitespace character unless asked not to, and
  # normalize line endings to "\n".
  @string.squeeze!(" \n\t") unless respect_whitespace
  @string.gsub!(/\r\n?/, "\n")
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter if entity_filter
  clear_cache

  # Raw content is passed through untouched on output, so it has to be
  # validated up front.
  Text.check(@string, illegal, doctype) if @raw
end
124
+
125
# Attaches this node to +parent+ (via the superclass writer) and, when the
# node is raw and now has a parent, re-validates the raw content.
def parent=( parent )
  super( parent )
  return unless @raw and @parent

  Text.check( @string, NEEDS_A_SECOND_CHECK, doctype )
end
129
+
130
# Validates a raw string, raising a RuntimeError on the first problem found.
#
# +string+::  the raw text to validate
# +pattern+:: regexp locating markup-significant tokens; group 1 must hold
#             the whole token and group 5 (if present) a numeric character
#             reference body such as "#60" or "#x3c"
# +doctype+:: currently unused (see the FIXME below)
#
# Returns the result of String#scan with a block, i.e. +string+.
def Text.check string, pattern, doctype

  # illegal anywhere
  unless string =~ VALID_XML_CHARS
    if String.method_defined? :encode
      string.chars.each do |char|
        next if VALID_CHAR.any? { |valid| valid === char.ord }
        raise "Illegal character #{char.inspect} in raw string \"#{string}\""
      end
    else
      string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |char|
        next if VALID_CHAR.any? { |valid| valid === char.unpack('U') }
        raise "Illegal character #{char.inspect} in raw string \"#{string}\""
      end
    end
  end

  # context sensitive
  string.scan(pattern) do
    token = Regexp.last_match(1)
    char_ref = Regexp.last_match(5)

    # A token that does not end in ';' is a bare '<' or '&'.
    if token[-1] != ?;
      raise "Illegal character '#{token}' in raw string \"#{string}\""
    end
    next unless token[0] == ?&
    next unless char_ref and char_ref[0] == ?#

    # Numeric character reference: the referenced code point must be legal.
    code_point = if char_ref[1] == ?x
                   char_ref[2..-1].to_i(16)
                 else
                   char_ref[1..-1].to_i
                 end
    next if VALID_CHAR.any? { |valid| valid === code_point }
    raise "Illegal character '#{token}' in raw string \"#{string}\""
    # FIXME: below can't work but this needs API change.
    # elsif @parent and $3 and !SUBSTITUTES.include?($1)
    #   if !doctype or !doctype.entities.has_key?($3)
    #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
    #   end
  end
end
174
+
175
# Identifies this node as a text node.
def node_type()
  :text
end
178
+
179
# True when the stored string has no characters.
def empty?
  @string.size.zero?
end
182
+
183
+
184
# Returns a copy of this node built through the Text constructor, with
# whitespace respected so the content is not squeezed again.
def clone
  Text.new( self, true )
end
187
+
188
+
189
# Appends +to_append+ to this text node after normalizing its line endings
# ("\r\n" and "\r" become "\n"). The text is appended in the +raw+ mode of
# this text node, and the cached to_s/value results are discarded.
#
# +returns+ the text node itself, so calls can be chained:
# 'text << "XXX" << "YYY"'.
def <<( to_append )
  addition = to_append.gsub( /\r\n?/, "\n" )
  @string.concat( addition )
  clear_cache
  self
end
199
+
200
+
201
# Orders text nodes by their escaped string form.
# +other+ a String or a Text
# +returns+ the result of (to_s <=> other.to_s)
def <=>( other )
  mine = to_s
  theirs = other.to_s
  mine <=> theirs
end
206
+
207
# Looks up the doctype of the document this node belongs to. Returns nil
# when the node has no parent or the parent reports no document.
def doctype
  return nil unless @parent

  owner = @parent.document
  owner ? owner.doctype : nil
end
213
+
214
+ REFERENCE = /#{Entity::REFERENCE}/
215
# Returns the string value of this text node in its escaped form: a valid
# XML text node string in which every entity that can be escaped has been
# inserted. A raw node returns its stored string unchanged. The entity filter
# given to the constructor is passed on to Text::normalize, and the
# normalized result is cached until the content changes.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.to_s   #-> "sean russell"
def to_s
  if @raw
    @string
  else
    @normalized ||= Text::normalize( @string, doctype, @entity_filter )
  end
end
232
+
233
# Debug representation: the inspected form of the stored string.
def inspect()
  @string.inspect
end
236
+
237
# Returns the string value of this text with entities expanded, as it might
# be used programmatically or printed to the console. The 'raw' setting and
# any entity_filter are ignored. The result is cached until the content
# changes.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.value   #-> "< & sean russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.value   #-> "< & sean russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.value   #-> "sean russell"
def value
  return @unnormalized if @unnormalized

  @unnormalized = Text::unnormalize( @string, doctype )
end
253
+
254
# Sets the contents of this text node. The text is expected to be
# unnormalized; line endings are converted to "\n", cached results are
# dropped, and the node stops being raw. As with any assignment, the
# expression evaluates to the assigned value.
#
#   e = Element.new( "a" )
#   e.add_text( "foo" )   # <a>foo</a>
#   e[0].value = "bar"    # <a>bar</a>
#   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
def value=( val )
  unified = val.gsub( /\r\n?/, "\n" )
  @string = unified
  clear_cache
  @raw = false
end
266
+
267
# Wraps +string+ so that lines are at most +width+ characters long where
# possible, breaking only at single spaces (the space is replaced by "\n").
#
# +string+::     the text to wrap
# +width+::      the maximum line length
# +addnewline+:: if true and at least one break was inserted, the result is
#                prefixed with "\n"
#
# A word longer than +width+ cannot be broken; it is left intact and the
# break is made at the next space after it, if any. Text that needs no
# breaking is returned unchanged.
def wrap(string, width, addnewline=false)
  lines = []
  remainder = string
  while remainder.length > width
    # Last space at or before the cutoff; failing that, the first one after.
    place = remainder.rindex(' ', width) || remainder.index(' ', width)
    break unless place

    lines << remainder[0, place]
    remainder = remainder[place + 1..-1]
  end
  return string if lines.empty?

  lines << remainder
  wrapped = lines.join("\n")
  addnewline ? "\n" + wrapped : wrapped
end
277
+
278
# Prefixes every line of +string+ with +style+ repeated +level+ times.
#
# +string+::          the text to indent
# +level+::           how many times +style+ is repeated; a negative level
#                     returns +string+ untouched
# +style+::           the indentation unit
# +indentfirstline+:: if false, leading and trailing whitespace is stripped
#                     from the final result
#
# Trailing whitespace of each line -- including its line terminator -- is
# removed before the lines are concatenated, so the result contains no
# newlines.
# NOTE(review): dropping the line terminators looks unintended; confirm with
# callers before changing it.
def indent_text(string, level=1, style="\t", indentfirstline=true)
  return string if level < 0

  indented = string.each_line.each_with_object(+'') do |line, buffer|
    prefix = style * level
    buffer << (prefix + line).sub(/[\s]+$/,'')
  end
  indented.strip! unless indentfirstline
  indented
end
289
+
290
# == DEPRECATED
# See REXML::Formatters
#
# Emits a deprecation warning, then writes this node to +writer+ through
# REXML::Formatters::Pretty when +indent+ is greater than -1 and through
# REXML::Formatters::Default otherwise. +transitive+ and +ie_hack+ are
# accepted but not used.
def write( writer, indent=-1, transitive=false, ie_hack=false )
  Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1)
  formatter = indent > -1 ? REXML::Formatters::Pretty.new( indent ) : REXML::Formatters::Default.new
  formatter.write( self, writer )
end
302
+
303
# FIXME
# This probably won't work properly
#
# Builds an XPath for this node by appending "/text()" to the parent's
# xpath. Requires the node to have a parent.
def xpath
  @parent.xpath + "/text()"
end
310
+
311
# Writes out text, substituting special characters beforehand.
# +out+ A String, IO, or any other object supporting <<( String )
# +input+ the text to substitute and then write out; it is never modified,
# and may be frozen
#
# Returns whatever <tt>out << text</tt> returns.
#
# (Example kept from the original comment: turning UTF-8 into ASCII with
# numeric character references for code points of 0x100 and above.)
#
#   z=utf8.unpack("U*")
#   ascOut=""
#   z.each{|r|
#     if r < 0x100
#       ascOut.concat(r.chr)
#     else
#       ascOut.concat(sprintf("&#x%x;", r))
#     end
#   }
#   puts ascOut
def write_with_substitution out, input
  # The first, non-destructive gsub yields a fresh unfrozen String (unlike
  # #clone, which would carry over a frozen flag); the rest edit it in place.
  # Doing it like this rather than in a loop improves the speed
  copy = input.gsub( SPECIALS[0], SUBSTITUTES[0] )
  copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
  copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
  copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
  copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
  copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
  out << copy
end
336
+
337
+ private
338
# Drops the memoized to_s / value results; call after the content changes.
def clear_cache
  @normalized = @unnormalized = nil
end
342
+
343
# Reads text, substituting entities.
#
# +input+::   the escaped text; it is never modified, and may be frozen
# +illegal+:: optional Regexp; if it matches +input+ a ParseException is
#             raised naming the offending match
#
# Line endings are normalized to "\n". The five predefined entities and
# numeric character references (decimal, or hexadecimal in either letter
# case) are decoded in a single pass, so already-decoded output is never
# decoded a second time: "&amp;#60;" becomes "&#60;", not "<".
#
# Returns a new String.
def Text::read_with_substitution( input, illegal=nil )
  if illegal and input =~ illegal
    raise ParseException.new( "malformed text: Illegal character #{$&} in \"#{input}\"" )
  end

  # Non-destructive gsub: yields a fresh String even for frozen input.
  copy = input.gsub( /\r\n?/, "\n" )
  return copy unless copy.include? ?&

  copy.gsub( /&(?:(lt|gt|quot|apos|amp)|#0*((?:\d+)|(?:x[a-fA-F0-9]+)));/ ) {
    name = $1
    number = $2
    case name
    when 'lt'   then '<'
    when 'gt'   then '>'
    when 'quot' then '"'
    when 'apos' then "'"
    when 'amp'  then '&'
    else
      # Leading zeros were consumed by the pattern; "x3c" needs a "0" in
      # front for Integer() to read it as hexadecimal.
      number = "0#{number}" if number[0] == ?x
      [Integer(number)].pack('U*')
    end
  }
end
367
+
368
+ EREFERENCE = /&(?!#{Entity::NAME};)/
369
# Escapes all possible entities.
#
# +input+::         the text to escape (converted with to_s)
# +doctype+::       if given, its entities are used; otherwise
#                   DocType::DEFAULT_ENTITIES
# +entity_filter+:: optional collection of entity names; with a doctype,
#                   entities whose name it includes are skipped
#
# Every "&" is first turned into "&amp;"; then each occurrence of an entity's
# value is replaced by the corresponding "&name;" reference.
# NOTE(review): the constructor documentation describes +entity_filter+ as
# the entities to match, while this code skips them -- confirm which is
# intended.
def Text::normalize( input, doctype=nil, entity_filter=nil )
  # Doing it like this rather than in a loop improves the speed
  #copy = copy.gsub( EREFERENCE, '&amp;' )
  escaped = input.to_s.gsub( "&", "&amp;" )
  entities = doctype ? doctype.entities : DocType::DEFAULT_ENTITIES
  entities.each_value do |entity|
    if doctype
      # Entities without a value, or named in the filter, are left alone.
      next unless entity.value
      next if entity_filter and entity_filter.include?(entity.name)
    end
    escaped = escaped.gsub( entity.value, "&#{entity.name};" )
  end
  escaped
end
390
+
391
# Unescapes all possible entities.
#
# +string+::  the escaped text
# +doctype+:: passed on to Text.expand for entity lookup
# +filter+::  passed on to Text.expand; references it names stay unexpanded
# +illegal+:: accepted but not used
#
# Line endings are normalized to "\n" first. The combined byte size of all
# expansions is tracked, and a RuntimeError is raised once it exceeds
# Security.entity_expansion_text_limit.
def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
  expanded_bytes = 0
  unified = string.gsub( /\r\n?/, "\n" )
  unified.gsub( REFERENCE ) do |reference|
    replacement = Text.expand(reference, doctype, filter)
    expanded_bytes += replacement.bytesize
    if expanded_bytes > Security.entity_expansion_text_limit
      raise "entity expansion has grown too large"
    end
    replacement
  end
end
404
+
405
# Expands a single reference such as "&lt;", "&#60;" or "&#x3c;".
#
# +ref+::     the complete reference, including the leading "&" and the
#             trailing ";"
# +doctype+:: if given, entity names are looked up through doctype.entity;
#             otherwise DocType::DEFAULT_ENTITIES is consulted
# +filter+::  optional collection of entity names to leave unexpanded
#
# Numeric references are converted to the character they denote and "&amp;"
# always becomes "&". A reference that cannot be resolved is returned as is.
def Text.expand(ref, doctype, filter)
  if ref[1] == ?#
    digits, base = ref[2] == ?x ? [ref[3...-1], 16] : [ref[2...-1], 10]
    return [digits.to_i(base)].pack('U*')
  end
  return '&' if ref == '&amp;'

  name = ref[1...-1]
  return ref if filter and filter.include?( name )
  return (doctype.entity( name ) or ref) if doctype

  entity_value = DocType::DEFAULT_ENTITIES[ name ]
  entity_value ? entity_value.value : ref
end
423
+ end
424
+ end