npm - @rgrove/parse-xml - Versions diffs - 4.0.1 → 4.2.0 - Mend

@rgrove/parse-xml 4.0.1 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/README.md +46 -31
package/dist/browser.js +692 -300
package/dist/browser.js.map +4 -4
package/dist/global.min.js +9 -8
package/dist/global.min.js.map +4 -4
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +8 -2
package/dist/index.js.map +1 -1
package/dist/lib/Parser.d.ts +53 -6
package/dist/lib/Parser.d.ts.map +1 -1
package/dist/lib/Parser.js +166 -126
package/dist/lib/Parser.js.map +1 -1
package/dist/lib/StringScanner.d.ts +15 -21
package/dist/lib/StringScanner.d.ts.map +1 -1
package/dist/lib/StringScanner.js +63 -86
package/dist/lib/StringScanner.js.map +1 -1
package/dist/lib/XmlDeclaration.d.ts +30 -0
package/dist/lib/XmlDeclaration.d.ts.map +1 -0
package/dist/lib/XmlDeclaration.js +36 -0
package/dist/lib/XmlDeclaration.js.map +1 -0
package/dist/lib/XmlDocument.d.ts +4 -2
package/dist/lib/XmlDocument.d.ts.map +1 -1
package/dist/lib/XmlDocument.js.map +1 -1
package/dist/lib/XmlDocumentType.d.ts +37 -0
package/dist/lib/XmlDocumentType.d.ts.map +1 -0
package/dist/lib/XmlDocumentType.js +39 -0
package/dist/lib/XmlDocumentType.js.map +1 -0
package/dist/lib/XmlElement.js.map +1 -1
package/dist/lib/XmlError.d.ts +24 -0
package/dist/lib/XmlError.d.ts.map +1 -0
package/dist/lib/XmlError.js +52 -0
package/dist/lib/XmlError.js.map +1 -0
package/dist/lib/XmlNode.d.ts +20 -1
package/dist/lib/XmlNode.d.ts.map +1 -1
package/dist/lib/XmlNode.js +28 -3
package/dist/lib/XmlNode.js.map +1 -1
package/dist/lib/syntax.d.ts.map +1 -1
package/dist/lib/syntax.js +18 -23
package/dist/lib/syntax.js.map +1 -1
package/dist/lib/types.d.ts +2 -2
package/dist/lib/types.d.ts.map +1 -1
package/package.json +20 -23
package/src/index.ts +3 -0
package/src/lib/Parser.ts +228 -141
package/src/lib/StringScanner.ts +66 -103
package/src/lib/XmlDeclaration.ts +58 -0
package/src/lib/XmlDocument.ts +4 -2
package/src/lib/XmlDocumentType.ts +67 -0
package/src/lib/XmlError.ts +80 -0
package/src/lib/XmlNode.ts +33 -3
package/src/lib/syntax.ts +12 -18

package/src/lib/Parser.ts CHANGED

@@ -2,14 +2,15 @@ import { StringScanner } from './StringScanner.js';
 import * as syntax from './syntax.js';
 import { XmlCdata } from './XmlCdata.js';
 import { XmlComment } from './XmlComment.js';
+import { XmlDeclaration } from './XmlDeclaration.js';
 import { XmlDocument } from './XmlDocument.js';
+import { XmlDocumentType } from './XmlDocumentType.js';
 import { XmlElement } from './XmlElement.js';
+import { XmlError } from './XmlError.js';
+import { XmlNode } from './XmlNode.js';
 import { XmlProcessingInstruction } from './XmlProcessingInstruction.js';
 import { XmlText } from './XmlText.js';
-import type { XmlNode } from './XmlNode.js';
 const emptyString = '';
 /**
@@ -29,56 +30,67 @@ export class Parser {
    * @param options Parser options.
    */
   constructor(xml: string, options: ParserOptions = {}) {
-    this.document = new XmlDocument();
-    this.currentNode = this.document;
-    this.options = options;
-    this.scanner = new StringScanner(normalizeXmlString(xml));
+    let doc = this.document = new XmlDocument();
-    this.consumeProlog();
+    this.currentNode = doc;
+    this.options = options;
+    this.scanner = new StringScanner(xml);
-    if (!this.consumeElement()) {
-      throw this.error('Root element is missing or invalid');
+    if (this.options.includeOffsets) {
+      doc.start = 0;
+      doc.end = xml.length;
     }
-    while (this.consumeMisc()) {} // eslint-disable-line no-empty
-    if (!this.scanner.isEnd) {
-      throw this.error('Extra content at the end of the document');
-    }
+    this.parse();
   }
   /**
    * Adds the given `XmlNode` as a child of `this.currentNode`.
    */
-  addNode(node: XmlNode) {
+  addNode(node: XmlNode, charIndex: number) {
     node.parent = this.currentNode;
+    if (this.options.includeOffsets) {
+      node.start = this.scanner.charIndexToByteIndex(charIndex);
+      node.end = this.scanner.charIndexToByteIndex();
+    }
     // @ts-expect-error: XmlDocument has a more limited set of possible children
     // than XmlElement so TypeScript is unhappy, but we always do the right
     // thing.
     this.currentNode.children.push(node);
+    return true;
   }
   /**
    * Adds the given _text_ to the document, either by appending it to a
    * preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
    */
-  addText(text: string) {
+  addText(text: string, charIndex: number) {
     let { children } = this.currentNode;
     let { length } = children;
+    text = normalizeLineBreaks(text);
     if (length > 0) {
       let prevNode = children[length - 1];
-      if (prevNode instanceof XmlText) {
+      if (prevNode?.type === XmlNode.TYPE_TEXT) {
+        let textNode = prevNode as XmlText;
         // The previous node is a text node, so we can append to it and avoid
         // creating another node.
-        prevNode.text += text;
-        return;
+        textNode.text += text;
+        if (this.options.includeOffsets) {
+          textNode.end = this.scanner.charIndexToByteIndex();
+        }
+        return true;
       }
     }
-    this.addNode(new XmlText(text));
+    return this.addNode(new XmlText(text), charIndex);
   }
   /**
@@ -159,7 +171,7 @@ export class Parser {
       : syntax.attValueCharSingleQuote;
     matchLoop: while (!scanner.isEnd) {
-      chars = scanner.consumeMatch(regex);
+      chars = scanner.consumeUntilMatch(regex);
       if (chars) {
         this.validateChars(chars);
@@ -178,7 +190,7 @@ export class Parser {
         case '<':
           throw this.error('Unescaped `<` is not allowed in an attribute value');
-        case emptyString:
+        default:
           break matchLoop;
       }
     }
@@ -199,25 +211,22 @@ export class Parser {
    */
   consumeCdataSection(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<![CDATA[')) {
+    if (!scanner.consumeString('<![CDATA[')) {
       return false;
     }
     let text = scanner.consumeUntilString(']]>');
     this.validateChars(text);
-    if (!scanner.consumeStringFast(']]>')) {
+    if (!scanner.consumeString(']]>')) {
       throw this.error('Unclosed CDATA section');
     }
-    if (this.options.preserveCdata) {
-      this.addNode(new XmlCdata(text));
-    } else {
-      this.addText(text);
-    }
-    return true;
+    return this.options.preserveCdata
+      ? this.addNode(new XmlCdata(normalizeLineBreaks(text)), startIndex)
+      : this.addText(text, startIndex);
   }
   /**
@@ -228,6 +237,7 @@ export class Parser {
    */
   consumeCharData(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     let charData = scanner.consumeUntilMatch(syntax.endCharData);
     if (!charData) {
@@ -240,8 +250,7 @@ export class Parser {
       throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
     }
-    this.addText(charData);
-    return true;
+    return this.addText(charData, startIndex);
   }
   /**
@@ -252,15 +261,16 @@ export class Parser {
    */
   consumeComment(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<!--')) {
+    if (!scanner.consumeString('<!--')) {
       return false;
     }
     let content = scanner.consumeUntilString('--');
     this.validateChars(content);
-    if (!scanner.consumeStringFast('-->')) {
+    if (!scanner.consumeString('-->')) {
       if (scanner.peek(2) === '--') {
         throw this.error("The string `--` isn't allowed inside a comment");
       }
@@ -268,11 +278,9 @@ export class Parser {
       throw this.error('Unclosed comment');
     }
-    if (this.options.preserveComments) {
-      this.addNode(new XmlComment(content.trim()));
-    }
-    return true;
+    return this.options.preserveComments
+      ? this.addNode(new XmlComment(normalizeLineBreaks(content)), startIndex)
+      : true;
   }
   /**
@@ -285,14 +293,12 @@ export class Parser {
    * @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
    */
   consumeContentReference(): boolean {
+    let startIndex = this.scanner.charIndex;
     let ref = this.consumeReference();
-    if (ref) {
-      this.addText(ref);
-      return true;
-    }
-    return false;
+    return ref
+      ? this.addText(ref, startIndex)
+      : false;
   }
   /**
@@ -306,25 +312,68 @@ export class Parser {
    */
   consumeDoctypeDeclaration(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<!DOCTYPE')
-        || !this.consumeWhitespace()) {
+    if (!scanner.consumeString('<!DOCTYPE')) {
       return false;
     }
-    scanner.consumeMatch(/[^[>]+/y);
+    let name = this.consumeWhitespace()
+      && this.consumeName();
-    if (scanner.consumeMatch(/\[[\s\S]+?\][\x20\t\r\n]*>/y)) {
-      return true;
+    if (!name) {
+      throw this.error('Expected a name');
+    }
+    let publicId;
+    let systemId;
+    if (this.consumeWhitespace()) {
+      if (scanner.consumeString('PUBLIC')) {
+        publicId = this.consumeWhitespace()
+          && this.consumePubidLiteral();
+        if (publicId === false) {
+          throw this.error('Expected a public identifier');
+        }
+        this.consumeWhitespace();
+      }
+      if (publicId !== undefined || scanner.consumeString('SYSTEM')) {
+        this.consumeWhitespace();
+        systemId = this.consumeSystemLiteral();
+        if (systemId === false) {
+          throw this.error('Expected a system identifier');
+        }
+        this.consumeWhitespace();
+      }
+    }
+    let internalSubset;
+    if (scanner.consumeString('[')) {
+      // The internal subset may contain comments that contain `]` characters,
+      // so we can't use `consumeUntilString()` here.
+      internalSubset = scanner.consumeUntilMatch(/\][\x20\t\r\n]*>/);
+      if (!scanner.consumeString(']')) {
+        throw this.error('Unclosed internal subset');
+      }
+      this.consumeWhitespace();
     }
-    if (!scanner.consumeStringFast('>')) {
+    if (!scanner.consumeString('>')) {
       throw this.error('Unclosed doctype declaration');
     }
-    return true;
-  }
+    return this.options.preserveDocumentType
+      ? this.addNode(new XmlDocumentType(name, publicId, systemId, internalSubset), startIndex)
+      : true;
+    }
   /**
    * Consumes an element if possible.
@@ -334,27 +383,27 @@ export class Parser {
    */
   consumeElement(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<')) {
+    if (!scanner.consumeString('<')) {
       return false;
     }
     let name = this.consumeName();
     if (!name) {
-      scanner.reset(mark);
+      scanner.reset(startIndex);
       return false;
     }
     let attributes = this.consumeAttributes();
-    let isEmpty = Boolean(scanner.consumeStringFast('/>'));
+    let isEmpty = !!scanner.consumeString('/>');
     let element = new XmlElement(name, attributes);
     element.parent = this.currentNode;
     if (!isEmpty) {
-      if (!scanner.consumeStringFast('>')) {
+      if (!scanner.consumeString('>')) {
         throw this.error(`Unclosed start tag for element \`${name}\``);
       }
@@ -373,7 +422,7 @@ export class Parser {
       let endTagMark = scanner.charIndex;
       let endTagName;
-      if (!scanner.consumeStringFast('</')
+      if (!scanner.consumeString('</')
           || !(endTagName = this.consumeName())
           || endTagName !== name) {
@@ -383,15 +432,14 @@ export class Parser {
       this.consumeWhitespace();
-      if (!scanner.consumeStringFast('>')) {
+      if (!scanner.consumeString('>')) {
         throw this.error(`Unclosed end tag for element ${name}`);
       }
       this.currentNode = element.parent;
     }
-    this.addNode(element);
-    return true;
+    return this.addNode(element, startIndex);
   }
   /**
@@ -403,7 +451,7 @@ export class Parser {
   consumeEqual(): boolean {
     this.consumeWhitespace();
-    if (this.scanner.consumeStringFast('=')) {
+    if (this.scanner.consumeString('=')) {
       this.consumeWhitespace();
       return true;
     }
@@ -443,9 +491,9 @@ export class Parser {
    */
   consumeProcessingInstruction(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<?')) {
+    if (!scanner.consumeString('<?')) {
       return false;
     }
@@ -453,7 +501,7 @@ export class Parser {
     if (name) {
       if (name.toLowerCase() === 'xml') {
-        scanner.reset(mark);
+        scanner.reset(startIndex);
         throw this.error("XML declaration isn't allowed here");
       }
     } else {
@@ -461,9 +509,8 @@ export class Parser {
     }
     if (!this.consumeWhitespace()) {
-      if (scanner.consumeStringFast('?>')) {
-        this.addNode(new XmlProcessingInstruction(name));
-        return true;
+      if (scanner.consumeString('?>')) {
+        return this.addNode(new XmlProcessingInstruction(name), startIndex);
       }
       throw this.error('Whitespace is required after a processing instruction name');
@@ -472,12 +519,11 @@ export class Parser {
     let content = scanner.consumeUntilString('?>');
     this.validateChars(content);
-    if (!scanner.consumeStringFast('?>')) {
+    if (!scanner.consumeString('?>')) {
       throw this.error('Unterminated processing instruction');
     }
-    this.addNode(new XmlProcessingInstruction(name, content));
-    return true;
+    return this.addNode(new XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
   }
   /**
@@ -488,7 +534,7 @@ export class Parser {
    */
   consumeProlog(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
     this.consumeXmlDeclaration();
@@ -498,7 +544,29 @@ export class Parser {
       while (this.consumeMisc()) {} // eslint-disable-line no-empty
     }
-    return mark < scanner.charIndex;
+    return startIndex < scanner.charIndex;
+  }
+  /**
+   * Consumes a public identifier literal if possible.
+   *
+   * @returns
+   *   Value of the public identifier literal minus quotes, or `false` if
+   *   nothing was consumed. An empty string indicates that a public id literal
+   *   was consumed but was empty.
+   *
+   * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-PubidLiteral
+   */
+  consumePubidLiteral(): string | false {
+    let startIndex = this.scanner.charIndex;
+    let value = this.consumeSystemLiteral();
+    if (value !== false && !/^[-\x20\r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*$/.test(value)) {
+      this.scanner.reset(startIndex);
+      throw this.error('Invalid character in public identifier');
+    }
+    return value;
   }
   /**
@@ -516,7 +584,7 @@ export class Parser {
   consumeReference(): string | false {
     let { scanner } = this;
-    if (!scanner.consumeStringFast('&')) {
+    if (!scanner.consumeString('&')) {
       return false;
     }
@@ -596,7 +664,7 @@ export class Parser {
    */
   consumeSystemLiteral(): string | false {
     let { scanner } = this;
-    let quote = scanner.consumeStringFast('"') || scanner.consumeStringFast("'");
+    let quote = scanner.consumeString('"') || scanner.consumeString("'");
     if (!quote) {
       return false;
@@ -605,7 +673,7 @@ export class Parser {
     let value = scanner.consumeUntilString(quote);
     this.validateChars(value);
-    if (!scanner.consumeStringFast(quote)) {
+    if (!scanner.consumeString(quote)) {
       throw this.error('Missing end quote');
     }
@@ -619,7 +687,7 @@ export class Parser {
    * @see https://www.w3.org/TR/2008/REC-xml-20081126/#white
    */
   consumeWhitespace(): boolean {
-    return Boolean(this.scanner.consumeMatchFn(syntax.isWhitespace));
+    return !!this.scanner.consumeMatchFn(syntax.isWhitespace);
   }
   /**
@@ -630,8 +698,9 @@ export class Parser {
    */
   consumeXmlDeclaration(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<?xml')) {
+    if (!scanner.consumeString('<?xml')) {
       return false;
     }
@@ -639,7 +708,7 @@ export class Parser {
       throw this.error('Invalid XML declaration');
     }
-    let version = Boolean(scanner.consumeStringFast('version'))
+    let version = !!scanner.consumeString('version')
       && this.consumeEqual()
       && this.consumeSystemLiteral();
@@ -649,16 +718,22 @@ export class Parser {
       throw this.error('Invalid character in version number');
     }
+    let encoding;
+    let standalone;
     if (this.consumeWhitespace()) {
-      let encoding = Boolean(scanner.consumeStringFast('encoding'))
+      encoding = !!scanner.consumeString('encoding')
         && this.consumeEqual()
         && this.consumeSystemLiteral();
       if (encoding) {
+        if (!/^[A-Za-z][\w.-]*$/.test(encoding)) {
+          throw this.error('Invalid character in encoding name');
+        }
         this.consumeWhitespace();
       }
-      let standalone = Boolean(scanner.consumeStringFast('standalone'))
+      standalone = !!scanner.consumeString('standalone')
         && this.consumeEqual()
         && this.consumeSystemLiteral();
@@ -671,69 +746,43 @@ export class Parser {
       }
     }
-    if (!scanner.consumeStringFast('?>')) {
+    if (!scanner.consumeString('?>')) {
       throw this.error('Invalid or unclosed XML declaration');
     }
-    return true;
+    return this.options.preserveXmlDeclaration
+      ? this.addNode(new XmlDeclaration(
+          version,
+          encoding || undefined,
+          (standalone as 'yes' | 'no' | false) || undefined,
+        ), startIndex)
+      : true;
   }
   /**
-   * Throws an error at the current scanner position.
+   * Returns an `XmlError` for the current scanner position.
    */
   error(message: string) {
-    let { charIndex, string: xml } = this.scanner;
-    let column = 1;
-    let excerpt = '';
-    let line = 1;
-    // Find the line and column where the error occurred.
-    for (let i = 0; i < charIndex; ++i) {
-      let char = xml[i];
-      if (char === '\n') {
-        column = 1;
-        excerpt = '';
-        line += 1;
-      } else {
-        column += 1;
-        excerpt += char;
-      }
-    }
-    let eol = xml.indexOf('\n', charIndex);
-    excerpt += eol === -1
-      ? xml.slice(charIndex)
-      : xml.slice(charIndex, eol);
+    let { scanner } = this;
+    return new XmlError(message, scanner.charIndex, scanner.string);
+  }
-    let excerptStart = 0;
+  /**
+   * Parses the XML input.
+   */
+  parse() {
+    this.scanner.consumeString('\uFEFF'); // byte order mark
+    this.consumeProlog();
-    // Keep the excerpt below 50 chars, but always keep the error position in
-    // view.
-    if (excerpt.length > 50) {
-      if (column < 40) {
-        excerpt = excerpt.slice(0, 50);
-      } else {
-        excerptStart = column - 20;
-        excerpt = excerpt.slice(excerptStart, column + 30);
-      }
+    if (!this.consumeElement()) {
+      throw this.error('Root element is missing or invalid');
     }
-    let err = new Error(
-      `${message} (line ${line}, column ${column})\n`
-        + `  ${excerpt}\n`
-        + ' '.repeat(column - excerptStart + 1) + '^\n',
-    );
-    Object.assign(err, {
-      column,
-      excerpt,
-      line,
-      pos: charIndex,
-    });
+    while (this.consumeMisc()) {} // eslint-disable-line no-empty
-    return err;
+    if (!this.scanner.isEnd) {
+      throw this.error('Extra content at the end of the document');
+    }
   }
   /**
@@ -761,15 +810,19 @@ export class Parser {
 // -- Private Functions --------------------------------------------------------
 /**
- * Normalizes the given XML string by stripping a byte order mark (if present)
- * and replacing CRLF sequences and lone CR characters with LF characters.
+ * Normalizes line breaks in the given text by replacing CRLF sequences and lone
+ * CR characters with LF characters.
  */
-function normalizeXmlString(xml: string): string {
-  if (xml[0] === '\uFEFF') {
-    xml = xml.slice(1);
+function normalizeLineBreaks(text: string): string {
+  let i = 0;
+  while ((i = text.indexOf('\r', i)) !== -1) {
+    text = text[i + 1] === '\n'
+      ? text.slice(0, i) + text.slice(i + 1)
+      : text.slice(0, i) + '\n' + text.slice(i + 1);
   }
-  return xml.replace(/\r\n?/g, '\n');
+  return text;
 }
 // -- Types --------------------------------------------------------------------
@@ -782,6 +835,14 @@ export type ParserOptions = {
    */
   ignoreUndefinedEntities?: boolean;
+  /**
+   * When `true`, the starting and ending byte offsets of each node in the input
+   * string will be made available via `start` and `end` properties on the node.
+   *
+   * @default false
+   */
+  includeOffsets?: boolean;
   /**
    * When `true`, CDATA sections will be preserved in the document as `XmlCdata`
    * nodes. Otherwise CDATA sections will be represented as `XmlText` nodes,
@@ -799,6 +860,32 @@ export type ParserOptions = {
    */
   preserveComments?: boolean;
+  /**
+   * When `true`, a document type declaration (if present) will be preserved in
+   * the document as an `XmlDocumentType` node. Otherwise the declaration will
+   * not be included in the node tree.
+   *
+   * Note that when this is `true` and a document type declaration is present,
+   * the DTD will precede the root node in the node tree (normally the root
+   * node would be first).
+   *
+   * @default false
+   */
+  preserveDocumentType?: boolean;
+  /**
+   * When `true`, an XML declaration (if present) will be preserved in the
+   * document as an `XmlDeclaration` node. Otherwise the declaration will not be
+   * included in the node tree.
+   *
+   * Note that when this is `true` and an XML declaration is present, the
+   * XML declaration will be the first child of the document (normally the root
+   * node would be first).
+   *
+   * @default false
+   */
+  preserveXmlDeclaration?: boolean;
   /**
    * When an undefined named entity is encountered, this function will be called
    * with the entity as its only argument. It should return a string value with