@rgrove/parse-xml 4.1.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -81,54 +81,60 @@ export class StringScanner {
81
81
  *
82
82
  * If no characters could be consumed, an empty string will be returned.
83
83
  */
84
- consume(count = 1): string {
85
- let chars = this.peek(count);
86
- this.advance(count);
84
+ consume(charCount = 1): string {
85
+ let chars = this.peek(charCount);
86
+ this.advance(charCount);
87
87
  return chars;
88
88
  }
89
89
 
90
90
  /**
91
- * Consumes a match for the given sticky regex, advances the scanner, updates
92
- * the `lastIndex` property of the regex, and returns the matching string.
91
+ * Consumes and returns the given number of bytes if possible, advancing the
92
+ * scanner and stopping if the end of the string is reached.
93
93
  *
94
- * The regex must have a sticky flag ("y") so that its `lastIndex` prop can be
95
- * used to anchor the match at the current scanner position.
94
+ * It's up to the caller to ensure that the given byte count doesn't split a
95
+ * multibyte character.
96
96
  *
97
- * Returns the consumed string, or an empty string if nothing was consumed.
97
+ * If no bytes could be consumed, an empty string will be returned.
98
98
  */
99
- consumeMatch(regex: RegExp): string {
100
- if (!regex.sticky) {
101
- throw new Error('`regex` must have a sticky flag ("y")');
102
- }
103
-
104
- regex.lastIndex = this.charIndexToByteIndex();
105
-
106
- let result = regex.exec(this.string);
107
-
108
- if (result === null || result.length === 0) {
109
- return emptyString;
110
- }
111
-
112
- let match = result[0] as string;
113
- this.advance(this.charLength(match));
114
- return match;
99
+ consumeBytes(byteCount: number): string {
100
+ let byteIndex = this.charIndexToByteIndex();
101
+ let result = this.string.slice(byteIndex, byteIndex + byteCount);
102
+ this.advance(this.charLength(result));
103
+ return result;
115
104
  }
116
105
 
117
106
  /**
118
- * Consumes and returns all characters for which the given function returns a
119
- * truthy value, stopping on the first falsy return value or if the end of the
120
- * input is reached.
107
+ * Consumes and returns all characters for which the given function returns
108
+ * `true`, stopping when `false` is returned or the end of the input is
109
+ * reached.
121
110
  */
122
111
  consumeMatchFn(fn: (char: string) => boolean): string {
123
- let char;
124
- let match = emptyString;
112
+ let { length, multiByteMode, string } = this;
113
+ let startByteIndex = this.charIndexToByteIndex();
114
+ let endByteIndex = startByteIndex;
115
+
116
+ if (multiByteMode) {
117
+ while (endByteIndex < length) {
118
+ let char = string[endByteIndex] as string;
119
+ let isSurrogatePair = char >= '\uD800' && char <= '\uDBFF';
125
120
 
126
- while ((char = this.peek()) && fn(char)) {
127
- match += char;
128
- this.advance();
121
+ if (isSurrogatePair) {
122
+ char += string[endByteIndex + 1];
123
+ }
124
+
125
+ if (!fn(char)) {
126
+ break;
127
+ }
128
+
129
+ endByteIndex += isSurrogatePair ? 2 : 1;
130
+ }
131
+ } else {
132
+ while (endByteIndex < length && fn(string[endByteIndex] as string)) {
133
+ ++endByteIndex;
134
+ }
129
135
  }
130
136
 
131
- return match;
137
+ return this.consumeBytes(endByteIndex - startByteIndex);
132
138
  }
133
139
 
134
140
  /**
@@ -139,35 +145,11 @@ export class StringScanner {
139
145
  * string will be returned and the scanner will not be advanced.
140
146
  */
141
147
  consumeString(stringToConsume: string): string {
142
- if (this.consumeStringFast(stringToConsume)) {
143
- return stringToConsume;
144
- }
145
-
146
- if (this.multiByteMode) {
147
- let { length } = stringToConsume;
148
- let charLengthToMatch = this.charLength(stringToConsume);
149
-
150
- if (charLengthToMatch !== length
151
- && stringToConsume === this.peek(charLengthToMatch)) {
152
-
153
- this.advance(charLengthToMatch);
154
- return stringToConsume;
155
- }
156
- }
157
-
158
- return emptyString;
159
- }
160
-
161
- /**
162
- * Does the same thing as `consumeString()`, but doesn't support consuming
163
- * multibyte characters. This can be faster if you only need to match single
164
- * byte characters.
165
- */
166
- consumeStringFast(stringToConsume: string): string {
167
148
  let { length } = stringToConsume;
149
+ let byteIndex = this.charIndexToByteIndex();
168
150
 
169
- if (this.peek(length) === stringToConsume) {
170
- this.advance(length);
151
+ if (stringToConsume === this.string.slice(byteIndex, byteIndex + length)) {
152
+ this.advance(length === 1 ? 1 : this.charLength(stringToConsume));
171
153
  return stringToConsume;
172
154
  }
173
155
 
@@ -182,16 +164,13 @@ export class StringScanner {
182
164
  * Returns the consumed string, or an empty string if nothing was consumed.
183
165
  */
184
166
  consumeUntilMatch(regex: RegExp): string {
185
- let restOfString = this.string.slice(this.charIndexToByteIndex());
186
- let matchByteIndex = restOfString.search(regex);
167
+ let matchByteIndex = this.string
168
+ .slice(this.charIndexToByteIndex())
169
+ .search(regex);
187
170
 
188
- if (matchByteIndex <= 0) {
189
- return emptyString;
190
- }
191
-
192
- let result = restOfString.slice(0, matchByteIndex);
193
- this.advance(this.charLength(result));
194
- return result;
171
+ return matchByteIndex > 0
172
+ ? this.consumeBytes(matchByteIndex)
173
+ : emptyString;
195
174
  }
196
175
 
197
176
  /**
@@ -202,17 +181,12 @@ export class StringScanner {
202
181
  * Returns the consumed string, or an empty string if nothing was consumed.
203
182
  */
204
183
  consumeUntilString(searchString: string): string {
205
- let { string } = this;
206
184
  let byteIndex = this.charIndexToByteIndex();
207
- let matchByteIndex = string.indexOf(searchString, byteIndex);
208
-
209
- if (matchByteIndex <= 0) {
210
- return emptyString;
211
- }
185
+ let matchByteIndex = this.string.indexOf(searchString, byteIndex);
212
186
 
213
- let result = string.slice(byteIndex, matchByteIndex);
214
- this.advance(this.charLength(result));
215
- return result;
187
+ return matchByteIndex > 0
188
+ ? this.consumeBytes(matchByteIndex - byteIndex)
189
+ : emptyString;
216
190
  }
217
191
 
218
192
  /**
@@ -221,22 +195,11 @@ export class StringScanner {
221
195
  * input string.
222
196
  */
223
197
  peek(count = 1): string {
224
- let { charIndex, multiByteMode, string } = this;
198
+ let { charIndex, string } = this;
225
199
 
226
- if (multiByteMode) {
227
- // Inlining this comparison instead of checking `this.isEnd` improves perf
228
- // slightly since `peek()` is called so frequently.
229
- if (charIndex >= this.charCount) {
230
- return emptyString;
231
- }
232
-
233
- return string.slice(
234
- this.charIndexToByteIndex(charIndex),
235
- this.charIndexToByteIndex(charIndex + count),
236
- );
237
- }
238
-
239
- return string.slice(charIndex, charIndex + count);
200
+ return this.multiByteMode
201
+ ? string.slice(this.charIndexToByteIndex(charIndex), this.charIndexToByteIndex(charIndex + count))
202
+ : string.slice(charIndex, charIndex + count);
240
203
  }
241
204
 
242
205
  /**
package/src/lib/syntax.ts CHANGED
@@ -4,7 +4,7 @@
4
4
  *
5
5
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-AttValue
6
6
  */
7
- export const attValueCharDoubleQuote = /[^"&<]+/y;
7
+ export const attValueCharDoubleQuote = /["&<]/;
8
8
 
9
9
  /**
10
10
  * Regular expression that matches one or more `AttValue` characters in a
@@ -12,7 +12,7 @@ export const attValueCharDoubleQuote = /[^"&<]+/y;
12
12
  *
13
13
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-AttValue
14
14
  */
15
- export const attValueCharSingleQuote = /[^'&<]+/y;
15
+ export const attValueCharSingleQuote = /['&<]/;
16
16
 
17
17
  /**
18
18
  * Regular expression that matches a whitespace character that should be
@@ -49,7 +49,7 @@ export const predefinedEntities: Readonly<{[name: string]: string;}> = Object.fr
49
49
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-NameChar
50
50
  */
51
51
  export function isNameChar(char: string): boolean {
52
- let cp = getCodePoint(char);
52
+ let cp = char.codePointAt(0) as number;
53
53
 
54
54
  // Including the most common NameStartChars here improves performance
55
55
  // slightly.
@@ -60,7 +60,8 @@ export function isNameChar(char: string): boolean {
60
60
  || cp === 0x2E // .
61
61
  || cp === 0xB7
62
62
  || (cp >= 0x300 && cp <= 0x36F)
63
- || (cp >= 0x203F && cp <= 0x2040)
63
+ || cp === 0x203F
64
+ || cp === 0x2040
64
65
  || isNameStartChar(char, cp);
65
66
  }
66
67
 
@@ -69,7 +70,7 @@ export function isNameChar(char: string): boolean {
69
70
  *
70
71
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-NameStartChar
71
72
  */
72
- export function isNameStartChar(char: string, cp = getCodePoint(char)): boolean {
73
+ export function isNameStartChar(char: string, cp = char.codePointAt(0) as number): boolean {
73
74
  return (cp >= 0x61 && cp <= 0x7A) // a-z
74
75
  || (cp >= 0x41 && cp <= 0x5A) // A-Z
75
76
  || cp === 0x3A // :
@@ -79,7 +80,8 @@ export function isNameStartChar(char: string, cp = getCodePoint(char)): boolean
79
80
  || (cp >= 0xF8 && cp <= 0x2FF)
80
81
  || (cp >= 0x370 && cp <= 0x37D)
81
82
  || (cp >= 0x37F && cp <= 0x1FFF)
82
- || (cp >= 0x200C && cp <= 0x200D)
83
+ || cp === 0x200C
84
+ || cp === 0x200D
83
85
  || (cp >= 0x2070 && cp <= 0x218F)
84
86
  || (cp >= 0x2C00 && cp <= 0x2FEF)
85
87
  || (cp >= 0x3001 && cp <= 0xD7FF)
@@ -104,7 +106,7 @@ export function isReferenceChar(char: string): boolean {
104
106
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#white
105
107
  */
106
108
  export function isWhitespace(char: string): boolean {
107
- let cp = getCodePoint(char);
109
+ let cp = char.codePointAt(0);
108
110
 
109
111
  return cp === 0x20
110
112
  || cp === 0x9
@@ -119,18 +121,10 @@ export function isWhitespace(char: string): boolean {
119
121
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
120
122
  */
121
123
  export function isXmlCodePoint(cp: number): boolean {
122
- return cp === 0x9
124
+ return (cp >= 0x20 && cp <= 0xD7FF)
123
125
  || cp === 0xA
126
+ || cp === 0x9
124
127
  || cp === 0xD
125
- || (cp >= 0x20 && cp <= 0xD7FF)
126
128
  || (cp >= 0xE000 && cp <= 0xFFFD)
127
129
  || (cp >= 0x10000 && cp <= 0x10FFFF);
128
130
  }
129
-
130
- /**
131
- * Returns the Unicode code point value of the given character, or `-1` if
132
- * _char_ is empty.
133
- */
134
- function getCodePoint(char: string): number {
135
- return char.codePointAt(0) || -1;
136
- }