moxml 0.1.22 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +680 -110
- data/Rakefile +12 -9
- data/lib/compat/opal/rexml/namespace.rb +8 -5
- data/lib/compat/opal/rexml/parsers/baseparser.rb +276 -212
- data/lib/compat/opal/rexml/source.rb +28 -27
- data/lib/compat/opal/rexml/text.rb +112 -104
- data/lib/compat/opal/rexml/xmltokens.rb +8 -8
- data/lib/compat/opal/rexml_compat.rb +12 -11
- data/lib/moxml/adapter/customized_oga/xml_declaration.rb +8 -1
- data/lib/moxml/adapter/customized_rexml/formatter.rb +4 -4
- data/lib/moxml/adapter/libxml/entity_ref_registry.rb +4 -2
- data/lib/moxml/adapter/libxml/entity_restorer.rb +3 -1
- data/lib/moxml/adapter/libxml.rb +17 -4
- data/lib/moxml/adapter/nokogiri.rb +17 -15
- data/lib/moxml/adapter/oga.rb +43 -62
- data/lib/moxml/adapter/ox.rb +35 -18
- data/lib/moxml/adapter.rb +1 -1
- data/lib/moxml/config.rb +15 -2
- data/lib/moxml/document.rb +2 -8
- data/lib/moxml/entity_registry.rb +8 -4
- data/lib/moxml/entity_registry_opal_data.rb +3 -2
- data/lib/moxml/node.rb +8 -0
- data/lib/moxml/version.rb +1 -1
- data/lib/moxml/xml_utils.rb +1 -0
- data/lib/moxml.rb +7 -0
- data/spec/integration/all_adapters_spec.rb +1 -0
- data/spec/integration/shared_examples/line_ending_behavior.rb +56 -0
- data/spec/moxml/adapter/libxml_internals_spec.rb +4 -2
- data/spec/moxml/adapter/platform_spec.rb +2 -1
- data/spec/moxml/config_spec.rb +33 -0
- metadata +3 -2
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
require "stringio"
|
|
4
4
|
require "strscan"
|
|
5
5
|
|
|
6
|
-
require
|
|
6
|
+
require "rexml/encoding"
|
|
7
7
|
|
|
8
8
|
module REXML
|
|
9
9
|
if defined?(StringScanner::Version) && StringScanner::Version < "1.0.0"
|
|
@@ -11,22 +11,22 @@ module REXML
|
|
|
11
11
|
refine StringScanner do
|
|
12
12
|
def check(pattern)
|
|
13
13
|
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
|
|
14
|
-
super
|
|
14
|
+
super
|
|
15
15
|
end
|
|
16
16
|
|
|
17
17
|
def scan(pattern)
|
|
18
18
|
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
|
|
19
|
-
super
|
|
19
|
+
super
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
def match?(pattern)
|
|
23
23
|
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
|
|
24
|
-
super
|
|
24
|
+
super
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
def skip(pattern)
|
|
28
28
|
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
|
|
29
|
-
super
|
|
29
|
+
super
|
|
30
30
|
end
|
|
31
31
|
end
|
|
32
32
|
end
|
|
@@ -34,11 +34,11 @@ module REXML
|
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
class SourceFactory
|
|
37
|
-
def
|
|
38
|
-
if arg.respond_to?
|
|
39
|
-
arg.respond_to?
|
|
40
|
-
arg.respond_to?
|
|
41
|
-
arg.respond_to?
|
|
37
|
+
def self.create_from(arg)
|
|
38
|
+
if arg.respond_to?(:read) &&
|
|
39
|
+
arg.respond_to?(:readline) &&
|
|
40
|
+
arg.respond_to?(:nil?) &&
|
|
41
|
+
arg.respond_to?(:eof?)
|
|
42
42
|
if RUBY_ENGINE == "opal"
|
|
43
43
|
# Opal's StringScanner lacks <<, so use Source (full-string) instead
|
|
44
44
|
# of IOSource (streaming). Read everything upfront.
|
|
@@ -52,19 +52,18 @@ module REXML
|
|
|
52
52
|
else
|
|
53
53
|
IOSource.new(StringIO.new(arg))
|
|
54
54
|
end
|
|
55
|
-
elsif arg.
|
|
55
|
+
elsif arg.is_a? Source
|
|
56
56
|
arg
|
|
57
57
|
else
|
|
58
|
-
raise "#{arg.class} is not a valid input stream. It must walk \
|
|
59
|
-
"like either a String, an IO, or a Source."
|
|
58
|
+
raise "#{arg.class} is not a valid input stream. It must walk \nlike either a String, an IO, or a Source."
|
|
60
59
|
end
|
|
61
60
|
end
|
|
62
61
|
end
|
|
63
62
|
|
|
64
63
|
class Source
|
|
65
64
|
include Encoding
|
|
66
|
-
|
|
67
|
-
attr_reader :encoding
|
|
65
|
+
|
|
66
|
+
attr_reader :line, :encoding
|
|
68
67
|
|
|
69
68
|
module Private
|
|
70
69
|
SPACES_PATTERN = /\s+/
|
|
@@ -75,10 +74,11 @@ module REXML
|
|
|
75
74
|
pre_defined_terms.each do |term|
|
|
76
75
|
PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
|
|
77
76
|
end
|
|
77
|
+
PRE_DEFINED_TERM_PATTERNS.freeze
|
|
78
78
|
end
|
|
79
79
|
private_constant :Private
|
|
80
80
|
|
|
81
|
-
def initialize(arg, encoding=nil)
|
|
81
|
+
def initialize(arg, encoding = nil)
|
|
82
82
|
@orig = arg
|
|
83
83
|
@scanner = StringScanner.new(@orig)
|
|
84
84
|
if encoding
|
|
@@ -106,11 +106,11 @@ module REXML
|
|
|
106
106
|
|
|
107
107
|
def encoding=(enc)
|
|
108
108
|
return unless super
|
|
109
|
+
|
|
109
110
|
encoding_updated
|
|
110
111
|
end
|
|
111
112
|
|
|
112
|
-
def read(term = nil)
|
|
113
|
-
end
|
|
113
|
+
def read(term = nil); end
|
|
114
114
|
|
|
115
115
|
def read_until(term)
|
|
116
116
|
pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
|
|
@@ -122,10 +122,9 @@ module REXML
|
|
|
122
122
|
data
|
|
123
123
|
end
|
|
124
124
|
|
|
125
|
-
def ensure_buffer
|
|
126
|
-
end
|
|
125
|
+
def ensure_buffer; end
|
|
127
126
|
|
|
128
|
-
def match(pattern, cons=false)
|
|
127
|
+
def match(pattern, cons = false)
|
|
129
128
|
pattern = Regexp.new(Regexp.escape(pattern)) if pattern.is_a?(String)
|
|
130
129
|
if cons
|
|
131
130
|
@scanner.scan(pattern).nil? ? nil : @scanner
|
|
@@ -134,12 +133,14 @@ module REXML
|
|
|
134
133
|
end
|
|
135
134
|
end
|
|
136
135
|
|
|
137
|
-
def match?(pattern, cons=false)
|
|
136
|
+
def match?(pattern, cons = false)
|
|
138
137
|
pattern = Regexp.new(Regexp.escape(pattern)) if pattern.is_a?(String)
|
|
139
138
|
window = @scanner.peek(4096)
|
|
140
139
|
return false if window.empty?
|
|
140
|
+
|
|
141
141
|
m = pattern.match(window)
|
|
142
142
|
return false unless m && m.begin(0) == 0
|
|
143
|
+
|
|
143
144
|
@scanner.pos += m[0].length if cons
|
|
144
145
|
true
|
|
145
146
|
end
|
|
@@ -171,8 +172,8 @@ module REXML
|
|
|
171
172
|
def current_line
|
|
172
173
|
lines = @orig.split
|
|
173
174
|
res = lines.grep @scanner.rest[0..30]
|
|
174
|
-
res = res[-1] if res.
|
|
175
|
-
lines.index(
|
|
175
|
+
res = res[-1] if res.is_a? Array
|
|
176
|
+
lines.index(res) if res
|
|
176
177
|
end
|
|
177
178
|
|
|
178
179
|
private
|
|
@@ -202,11 +203,11 @@ module REXML
|
|
|
202
203
|
end
|
|
203
204
|
|
|
204
205
|
def encoding_updated
|
|
205
|
-
if @encoding
|
|
206
|
+
if @encoding == "UTF-8"
|
|
207
|
+
@to_utf = false
|
|
208
|
+
else
|
|
206
209
|
@scanner = StringScanner.new(decode(@scanner.rest))
|
|
207
210
|
@to_utf = true
|
|
208
|
-
else
|
|
209
|
-
@to_utf = false
|
|
210
211
|
end
|
|
211
212
|
end
|
|
212
213
|
end
|
|
@@ -1,20 +1,22 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require
|
|
4
|
-
require
|
|
5
|
-
require
|
|
6
|
-
require
|
|
2
|
+
|
|
3
|
+
require "rexml/security"
|
|
4
|
+
require "rexml/entity"
|
|
5
|
+
require "rexml/doctype"
|
|
6
|
+
require "rexml/child"
|
|
7
|
+
require "rexml/parseexception"
|
|
7
8
|
|
|
8
9
|
module REXML
|
|
9
10
|
# Represents text nodes in an XML document
|
|
10
11
|
class Text < Child
|
|
11
12
|
include Comparable
|
|
13
|
+
|
|
12
14
|
# The order in which the substitutions occur
|
|
13
|
-
SPECIALS = [
|
|
14
|
-
SUBSTITUTES = [
|
|
15
|
+
SPECIALS = [/&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u].freeze
|
|
16
|
+
SUBSTITUTES = ["&amp;", "&lt;", "&gt;", "&quot;", "&apos;", "&#13;"].freeze
|
|
15
17
|
# Characters which are substituted in written strings
|
|
16
|
-
SLAICEPS = [
|
|
17
|
-
SETUTITSBUS = [
|
|
18
|
+
SLAICEPS = ["<", ">", '"', "'", "&"].freeze
|
|
19
|
+
SETUTITSBUS = [/&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u].freeze
|
|
18
20
|
|
|
19
21
|
# If +raw+ is true, then REXML leaves the value alone
|
|
20
22
|
attr_accessor :raw
|
|
@@ -25,19 +27,19 @@ module REXML
|
|
|
25
27
|
VALID_CHAR = [
|
|
26
28
|
0x9, 0xA, 0xD,
|
|
27
29
|
(0x20..0xD7FF),
|
|
28
|
-
(0xE000..0xFFFD)
|
|
29
|
-
]
|
|
30
|
+
(0xE000..0xFFFD)
|
|
31
|
+
].freeze
|
|
30
32
|
|
|
31
|
-
VALID_XML_CHARS = Regexp.new(
|
|
33
|
+
VALID_XML_CHARS = Regexp.new("^[" +
|
|
32
34
|
VALID_CHAR.map { |item|
|
|
33
35
|
case item
|
|
34
36
|
when Integer
|
|
35
|
-
[item].pack(
|
|
37
|
+
[item].pack("U")
|
|
36
38
|
when Range
|
|
37
|
-
[item.first,
|
|
39
|
+
[item.first, "-".ord, item.last].pack("UUU")
|
|
38
40
|
end
|
|
39
41
|
}.join +
|
|
40
|
-
|
|
42
|
+
"]*$")
|
|
41
43
|
|
|
42
44
|
# Constructor
|
|
43
45
|
# +arg+ if a String, the content is set to the String. If a Text,
|
|
@@ -75,21 +77,20 @@ module REXML
|
|
|
75
77
|
# In the last example, the +entity_filter+ argument is ignored.
|
|
76
78
|
#
|
|
77
79
|
# +illegal+ INTERNAL USE ONLY
|
|
78
|
-
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
|
|
79
|
-
entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK
|
|
80
|
-
|
|
80
|
+
def initialize(arg, respect_whitespace = false, parent = nil, raw = nil,
|
|
81
|
+
entity_filter = nil, illegal = NEEDS_A_SECOND_CHECK)
|
|
81
82
|
@raw = false
|
|
82
83
|
@parent = nil
|
|
83
84
|
@entity_filter = nil
|
|
84
85
|
|
|
85
86
|
if parent
|
|
86
|
-
super(
|
|
87
|
+
super(parent)
|
|
87
88
|
@raw = parent.raw
|
|
88
89
|
end
|
|
89
90
|
|
|
90
|
-
if arg.
|
|
91
|
+
if arg.is_a? String
|
|
91
92
|
@string = arg.dup
|
|
92
|
-
elsif arg.
|
|
93
|
+
elsif arg.is_a? Text
|
|
93
94
|
@string = arg.instance_variable_get(:@string).dup
|
|
94
95
|
@raw = arg.raw
|
|
95
96
|
@entity_filter = arg.instance_variable_get(:@entity_filter)
|
|
@@ -106,20 +107,19 @@ module REXML
|
|
|
106
107
|
Text.check(@string, illegal) if @raw
|
|
107
108
|
end
|
|
108
109
|
|
|
109
|
-
def parent=
|
|
110
|
-
super
|
|
111
|
-
Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw
|
|
110
|
+
def parent=(parent)
|
|
111
|
+
super
|
|
112
|
+
Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw && @parent
|
|
112
113
|
end
|
|
113
114
|
|
|
114
115
|
# check for illegal characters
|
|
115
|
-
def
|
|
116
|
-
|
|
116
|
+
def self.check(string, _pattern, _doctype = nil)
|
|
117
117
|
# illegal anywhere — avoid VALID_XML_CHARS regex on uncontrolled data
|
|
118
118
|
string.each_char do |c|
|
|
119
119
|
code = c.ord
|
|
120
|
-
unless
|
|
121
|
-
|
|
122
|
-
|
|
120
|
+
unless code == 0x9 || code == 0xA || code == 0xD ||
|
|
121
|
+
code.between?(0x20, 0xD7FF) ||
|
|
122
|
+
code.between?(0xE000, 0xFFFD)
|
|
123
123
|
raise "Illegal character #{c.inspect} in raw string #{string.inspect}"
|
|
124
124
|
end
|
|
125
125
|
end
|
|
@@ -140,9 +140,9 @@ module REXML
|
|
|
140
140
|
end
|
|
141
141
|
|
|
142
142
|
if value[0] == "#"
|
|
143
|
-
character_reference = value[1
|
|
143
|
+
character_reference = value[1..]
|
|
144
144
|
|
|
145
|
-
unless
|
|
145
|
+
unless /^(\d+|x[0-9a-fA-F]+)$/.match?(character_reference)
|
|
146
146
|
if character_reference[0] == "x" || character_reference[-1] == "x"
|
|
147
147
|
raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
|
|
148
148
|
else
|
|
@@ -150,12 +150,12 @@ module REXML
|
|
|
150
150
|
end
|
|
151
151
|
end
|
|
152
152
|
|
|
153
|
-
case (character_reference[0] == "x" ? character_reference[1
|
|
153
|
+
case (character_reference[0] == "x" ? character_reference[1..].to_i(16) : character_reference.to_i)
|
|
154
154
|
when *VALID_CHAR
|
|
155
155
|
else
|
|
156
156
|
raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
|
|
157
157
|
end
|
|
158
|
-
elsif
|
|
158
|
+
elsif !/^#{Entity::NAME}$/umo.match?(value)
|
|
159
159
|
raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
|
|
160
160
|
end
|
|
161
161
|
|
|
@@ -170,31 +170,28 @@ module REXML
|
|
|
170
170
|
end
|
|
171
171
|
|
|
172
172
|
def empty?
|
|
173
|
-
@string.
|
|
173
|
+
@string.empty?
|
|
174
174
|
end
|
|
175
175
|
|
|
176
|
-
|
|
177
176
|
def clone
|
|
178
177
|
Text.new(self, true)
|
|
179
178
|
end
|
|
180
179
|
|
|
181
|
-
|
|
182
180
|
# Appends text to this text node. The text is appended in the +raw+ mode
|
|
183
181
|
# of this text node.
|
|
184
182
|
#
|
|
185
183
|
# +returns+ the text itself to enable method chain like
|
|
186
184
|
# 'text << "XXX" << "YYY"'.
|
|
187
|
-
def <<(
|
|
188
|
-
@string << to_append.gsub(
|
|
185
|
+
def <<(to_append)
|
|
186
|
+
@string << to_append.gsub(/\r\n?/, "\n")
|
|
189
187
|
clear_cache
|
|
190
188
|
self
|
|
191
189
|
end
|
|
192
190
|
|
|
193
|
-
|
|
194
191
|
# +other+ a String or a Text
|
|
195
192
|
# +returns+ the result of (to_s <=> arg.to_s)
|
|
196
|
-
def <=>(
|
|
197
|
-
to_s
|
|
193
|
+
def <=>(other)
|
|
194
|
+
to_s <=> other.to_s
|
|
198
195
|
end
|
|
199
196
|
|
|
200
197
|
def doctype
|
|
@@ -217,7 +214,8 @@ module REXML
|
|
|
217
214
|
# u.to_s #-> "sean russell"
|
|
218
215
|
def to_s
|
|
219
216
|
return @string if @raw
|
|
220
|
-
|
|
217
|
+
|
|
218
|
+
@to_s ||= Text::normalize(@string, doctype, @entity_filter)
|
|
221
219
|
end
|
|
222
220
|
|
|
223
221
|
def inspect
|
|
@@ -238,8 +236,8 @@ module REXML
|
|
|
238
236
|
# u = Text.new( "sean russell", false, nil, true )
|
|
239
237
|
# u.value #-> "sean russell"
|
|
240
238
|
def value
|
|
241
|
-
@
|
|
242
|
-
|
|
239
|
+
@value ||= Text::unnormalize(@string, doctype,
|
|
240
|
+
entity_expansion_text_limit: document&.entity_expansion_text_limit)
|
|
243
241
|
end
|
|
244
242
|
|
|
245
243
|
# Sets the contents of this text node. This expects the text to be
|
|
@@ -249,33 +247,36 @@ module REXML
|
|
|
249
247
|
# e.add_text( "foo" ) # <a>foo</a>
|
|
250
248
|
# e[0].value = "bar" # <a>bar</a>
|
|
251
249
|
# e[0].value = "<a>" # <a>&lt;a&gt;</a>
|
|
252
|
-
def value=(
|
|
253
|
-
@string = val.gsub(
|
|
250
|
+
def value=(val)
|
|
251
|
+
@string = val.gsub(/\r\n?/, "\n")
|
|
254
252
|
clear_cache
|
|
255
253
|
@raw = false
|
|
256
254
|
end
|
|
257
255
|
|
|
258
|
-
def wrap(string, width, addnewline=false)
|
|
256
|
+
def wrap(string, width, addnewline = false)
|
|
259
257
|
# Recursively wrap string at width.
|
|
260
258
|
return string if string.length <= width
|
|
261
|
-
|
|
259
|
+
|
|
260
|
+
place = string.rindex(" ", width) # Position in string with last ' ' before cutoff
|
|
262
261
|
if addnewline
|
|
263
|
-
"\n
|
|
262
|
+
"\n#{string[0, place]}\n#{wrap(string[(place + 1)..], width)}"
|
|
264
263
|
else
|
|
265
|
-
string[0,place]
|
|
264
|
+
"#{string[0, place]}\n#{wrap(string[(place + 1)..], width)}"
|
|
266
265
|
end
|
|
267
266
|
end
|
|
268
267
|
|
|
269
|
-
def indent_text(string, level=1, style="\t", indentfirstline=true)
|
|
270
|
-
Kernel.warn(
|
|
271
|
-
|
|
268
|
+
def indent_text(string, level = 1, style = "\t", indentfirstline = true)
|
|
269
|
+
Kernel.warn(
|
|
270
|
+
"#{self.class.name}#indent_text is deprecated. See REXML::Formatters", uplevel: 1
|
|
271
|
+
)
|
|
272
|
+
return string if level.negative?
|
|
272
273
|
|
|
273
|
-
new_string = +
|
|
274
|
-
string.each_line
|
|
274
|
+
new_string = +""
|
|
275
|
+
string.each_line do |line|
|
|
275
276
|
indent_string = style * level
|
|
276
277
|
new_line = (indent_string + line).rstrip
|
|
277
278
|
new_string << new_line
|
|
278
|
-
|
|
279
|
+
end
|
|
279
280
|
new_string.strip! unless indentfirstline
|
|
280
281
|
new_string
|
|
281
282
|
end
|
|
@@ -283,20 +284,22 @@ module REXML
|
|
|
283
284
|
# == DEPRECATED
|
|
284
285
|
# See REXML::Formatters
|
|
285
286
|
#
|
|
286
|
-
def write(
|
|
287
|
-
Kernel.warn(
|
|
287
|
+
def write(writer, indent = -1, _transitive = false, _ie_hack = false)
|
|
288
|
+
Kernel.warn(
|
|
289
|
+
"#{self.class.name}#write is deprecated. See REXML::Formatters", uplevel: 1
|
|
290
|
+
)
|
|
288
291
|
formatter = if indent > -1
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
formatter.write(
|
|
292
|
+
REXML::Formatters::Pretty.new(indent)
|
|
293
|
+
else
|
|
294
|
+
REXML::Formatters::Default.new
|
|
295
|
+
end
|
|
296
|
+
formatter.write(self, writer)
|
|
294
297
|
end
|
|
295
298
|
|
|
296
299
|
# FIXME
|
|
297
300
|
# This probably won't work properly
|
|
298
301
|
def xpath
|
|
299
|
-
@parent.xpath
|
|
302
|
+
"#{@parent.xpath}/text()"
|
|
300
303
|
end
|
|
301
304
|
|
|
302
305
|
# Writes out text, substituting special characters beforehand.
|
|
@@ -313,68 +316,71 @@ module REXML
|
|
|
313
316
|
# end
|
|
314
317
|
# }
|
|
315
318
|
# puts ascOut
|
|
316
|
-
def write_with_substitution
|
|
319
|
+
def write_with_substitution(out, input)
|
|
317
320
|
copy = input.clone
|
|
318
321
|
# Doing it like this rather than in a loop improves the speed
|
|
319
|
-
copy.gsub!(
|
|
320
|
-
copy.gsub!(
|
|
321
|
-
copy.gsub!(
|
|
322
|
-
copy.gsub!(
|
|
323
|
-
copy.gsub!(
|
|
324
|
-
copy.gsub!(
|
|
322
|
+
copy.gsub!(SPECIALS[0], SUBSTITUTES[0])
|
|
323
|
+
copy.gsub!(SPECIALS[1], SUBSTITUTES[1])
|
|
324
|
+
copy.gsub!(SPECIALS[2], SUBSTITUTES[2])
|
|
325
|
+
copy.gsub!(SPECIALS[3], SUBSTITUTES[3])
|
|
326
|
+
copy.gsub!(SPECIALS[4], SUBSTITUTES[4])
|
|
327
|
+
copy.gsub!(SPECIALS[5], SUBSTITUTES[5])
|
|
325
328
|
out << copy
|
|
326
329
|
end
|
|
327
330
|
|
|
328
331
|
private
|
|
332
|
+
|
|
329
333
|
def clear_cache
|
|
330
334
|
@normalized = nil
|
|
331
335
|
@unnormalized = nil
|
|
332
336
|
end
|
|
333
337
|
|
|
334
338
|
# Reads text, substituting entities
|
|
335
|
-
def
|
|
339
|
+
def self.read_with_substitution(input, illegal = nil)
|
|
336
340
|
copy = input.clone
|
|
337
341
|
|
|
338
|
-
if copy =~ illegal
|
|
339
|
-
raise ParseException.new(
|
|
340
|
-
end
|
|
342
|
+
if illegal && illegal && (copy =~ illegal)
|
|
343
|
+
raise ParseException.new("malformed text: Illegal character #$& in \"#{copy}\"")
|
|
344
|
+
end
|
|
341
345
|
|
|
342
|
-
copy.gsub!(
|
|
346
|
+
copy.gsub!(/\r\n?/, "\n")
|
|
343
347
|
if copy.include? ?&
|
|
344
|
-
copy.gsub!(
|
|
345
|
-
copy.gsub!(
|
|
346
|
-
copy.gsub!(
|
|
347
|
-
copy.gsub!(
|
|
348
|
-
copy.gsub!(
|
|
349
|
-
copy.gsub!(
|
|
350
|
-
m
|
|
351
|
-
#m='0' if m==''
|
|
348
|
+
copy.gsub!(SETUTITSBUS[0], SLAICEPS[0])
|
|
349
|
+
copy.gsub!(SETUTITSBUS[1], SLAICEPS[1])
|
|
350
|
+
copy.gsub!(SETUTITSBUS[2], SLAICEPS[2])
|
|
351
|
+
copy.gsub!(SETUTITSBUS[3], SLAICEPS[3])
|
|
352
|
+
copy.gsub!(SETUTITSBUS[4], SLAICEPS[4])
|
|
353
|
+
copy.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/) do
|
|
354
|
+
m = $1
|
|
355
|
+
# m='0' if m==''
|
|
352
356
|
m = "0#{m}" if m[0] == ?x
|
|
353
|
-
[Integer(m)].pack(
|
|
354
|
-
|
|
357
|
+
[Integer(m)].pack("U*")
|
|
358
|
+
end
|
|
355
359
|
end
|
|
356
360
|
copy
|
|
357
361
|
end
|
|
358
362
|
|
|
359
363
|
EREFERENCE = /&(?!#{Entity::NAME};)/
|
|
360
364
|
# Escapes all possible entities
|
|
361
|
-
def
|
|
365
|
+
def self.normalize(input, doctype = nil, entity_filter = nil)
|
|
362
366
|
copy = input.to_s
|
|
363
367
|
# Doing it like this rather than in a loop improves the speed
|
|
364
|
-
#copy = copy.gsub( EREFERENCE, '&amp;' )
|
|
365
|
-
copy = copy.gsub(
|
|
368
|
+
# copy = copy.gsub( EREFERENCE, '&amp;' )
|
|
369
|
+
copy = copy.gsub("&", "&amp;") if copy.include?("&")
|
|
366
370
|
if doctype
|
|
367
371
|
# Replace all ampersands that aren't part of an entity
|
|
368
372
|
doctype.entities.each_value do |entity|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
373
|
+
if entity.value &&
|
|
374
|
+
not(entity_filter && entity_filter.include?(entity.name))
|
|
375
|
+
copy = copy.gsub(entity.value,
|
|
376
|
+
"&#{entity.name};")
|
|
377
|
+
end
|
|
372
378
|
end
|
|
373
379
|
else
|
|
374
380
|
# Replace all ampersands that aren't part of an entity
|
|
375
381
|
DocType::DEFAULT_ENTITIES.each_value do |entity|
|
|
376
382
|
if copy.include?(entity.value)
|
|
377
|
-
copy = copy.gsub(entity.value, "&#{entity.name};"
|
|
383
|
+
copy = copy.gsub(entity.value, "&#{entity.name};")
|
|
378
384
|
end
|
|
379
385
|
end
|
|
380
386
|
end
|
|
@@ -382,35 +388,37 @@ module REXML
|
|
|
382
388
|
end
|
|
383
389
|
|
|
384
390
|
# Unescapes all possible entities
|
|
385
|
-
def
|
|
391
|
+
def self.unnormalize(string, doctype = nil, filter = nil, _illegal = nil,
|
|
392
|
+
entity_expansion_text_limit: nil)
|
|
386
393
|
entity_expansion_text_limit ||= Security.entity_expansion_text_limit
|
|
387
394
|
sum = 0
|
|
388
|
-
string.gsub(
|
|
395
|
+
string.gsub(/\r\n?/, "\n").gsub(REFERENCE) do
|
|
389
396
|
s = Text.expand($&, doctype, filter)
|
|
390
397
|
if sum + s.bytesize > entity_expansion_text_limit
|
|
391
398
|
raise "entity expansion has grown too large"
|
|
392
399
|
else
|
|
393
400
|
sum += s.bytesize
|
|
394
401
|
end
|
|
402
|
+
|
|
395
403
|
s
|
|
396
|
-
|
|
404
|
+
end
|
|
397
405
|
end
|
|
398
406
|
|
|
399
|
-
def
|
|
407
|
+
def self.expand(ref, doctype, filter)
|
|
400
408
|
if ref[1] == ?#
|
|
401
409
|
if ref[2] == ?x
|
|
402
|
-
[ref[3...-1].to_i(16)].pack(
|
|
410
|
+
[ref[3...-1].to_i(16)].pack("U*")
|
|
403
411
|
else
|
|
404
|
-
[ref[2...-1].to_i].pack(
|
|
412
|
+
[ref[2...-1].to_i].pack("U*")
|
|
405
413
|
end
|
|
406
|
-
elsif ref ==
|
|
407
|
-
|
|
408
|
-
elsif filter
|
|
414
|
+
elsif ref == "&amp;"
|
|
415
|
+
"&"
|
|
416
|
+
elsif filter&.include?(ref[1...-1])
|
|
409
417
|
ref
|
|
410
418
|
elsif doctype
|
|
411
|
-
doctype.entity(
|
|
419
|
+
doctype.entity(ref[1...-1]) or ref
|
|
412
420
|
else
|
|
413
|
-
entity_value = DocType::DEFAULT_ENTITIES[
|
|
421
|
+
entity_value = DocType::DEFAULT_ENTITIES[ref[1...-1]]
|
|
414
422
|
entity_value ? entity_value.value : ref
|
|
415
423
|
end
|
|
416
424
|
end
|
|
@@ -28,18 +28,18 @@ module REXML
|
|
|
28
28
|
"\\u0300-\\u036F",
|
|
29
29
|
"\\u203F-\\u2040",
|
|
30
30
|
]
|
|
31
|
-
NAME_START_CHAR = "[#{name_start_chars.join
|
|
32
|
-
NAME_CHAR = "[#{name_chars.join
|
|
31
|
+
NAME_START_CHAR = "[#{name_start_chars.join}]".freeze
|
|
32
|
+
NAME_CHAR = "[#{name_chars.join}]".freeze
|
|
33
33
|
NAMECHAR = NAME_CHAR
|
|
34
34
|
|
|
35
35
|
ncname_start_chars = name_start_chars - [":"]
|
|
36
36
|
ncname_chars = name_chars - [":"]
|
|
37
|
-
NCNAME_STR = "[#{ncname_start_chars.join
|
|
38
|
-
NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
|
|
37
|
+
NCNAME_STR = "[#{ncname_start_chars.join}][#{ncname_chars.join}]*".freeze
|
|
38
|
+
NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}".freeze
|
|
39
39
|
|
|
40
|
-
NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)"
|
|
41
|
-
NMTOKEN = "(?:#{NAME_CHAR})+"
|
|
42
|
-
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
|
|
43
|
-
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
|
|
40
|
+
NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)".freeze
|
|
41
|
+
NMTOKEN = "(?:#{NAME_CHAR})+".freeze
|
|
42
|
+
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*".freeze
|
|
43
|
+
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)".freeze
|
|
44
44
|
end
|
|
45
45
|
end
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
|
+
|
|
2
3
|
# backtick_javascript: true
|
|
3
4
|
|
|
4
|
-
require
|
|
5
|
+
require "corelib/array/pack"
|
|
5
6
|
|
|
6
7
|
unless defined?(StringScanner::Version)
|
|
7
8
|
class StringScanner
|
|
@@ -17,7 +18,7 @@ unless String.method_defined?(:force_encoding)
|
|
|
17
18
|
end
|
|
18
19
|
end
|
|
19
20
|
|
|
20
|
-
unless defined?(
|
|
21
|
+
unless defined?(Encoding)
|
|
21
22
|
module ::Encoding
|
|
22
23
|
UTF_8 = "UTF-8"
|
|
23
24
|
ASCII_8BIT = "ASCII-8BIT"
|
|
@@ -36,35 +37,35 @@ end
|
|
|
36
37
|
# Override with functional equivalents that return new strings.
|
|
37
38
|
class String
|
|
38
39
|
def <<(str)
|
|
39
|
-
|
|
40
|
+
`return self + #{str}.to_s`
|
|
40
41
|
end
|
|
41
42
|
|
|
42
43
|
def chomp!(sep = nil)
|
|
43
|
-
|
|
44
|
+
`
|
|
44
45
|
var r = #{chomp(sep)};
|
|
45
46
|
return r === self ? nil : r;
|
|
46
|
-
|
|
47
|
+
`
|
|
47
48
|
end
|
|
48
49
|
|
|
49
50
|
def gsub!(pattern, replacement, &block)
|
|
50
|
-
|
|
51
|
+
`
|
|
51
52
|
var r = #{gsub(pattern, replacement, &block)};
|
|
52
53
|
return r === self ? nil : r;
|
|
53
|
-
|
|
54
|
+
`
|
|
54
55
|
end
|
|
55
56
|
|
|
56
57
|
def squeeze!(*sets)
|
|
57
|
-
|
|
58
|
+
`
|
|
58
59
|
var r = #{squeeze(*sets)};
|
|
59
60
|
return r === self ? nil : r;
|
|
60
|
-
|
|
61
|
+
`
|
|
61
62
|
end
|
|
62
63
|
|
|
63
64
|
def strip!
|
|
64
|
-
|
|
65
|
+
`
|
|
65
66
|
var r = #{strip};
|
|
66
67
|
return r === self ? nil : r;
|
|
67
|
-
|
|
68
|
+
`
|
|
68
69
|
end
|
|
69
70
|
end
|
|
70
71
|
|