npm - @rgrove/parse-xml - Versions diffs - 4.0.1 → 4.1.0 - Mend

@rgrove/parse-xml 4.0.1 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

package/README.md +40 -25
package/dist/browser.js +642 -223
package/dist/browser.js.map +4 -4
package/dist/global.min.js +9 -8
package/dist/global.min.js.map +4 -4
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +7 -1
package/dist/index.js.map +1 -1
package/dist/lib/Parser.d.ts +49 -6
package/dist/lib/Parser.d.ts.map +1 -1
package/dist/lib/Parser.js +133 -102
package/dist/lib/Parser.js.map +1 -1
package/dist/lib/StringScanner.d.ts +5 -5
package/dist/lib/StringScanner.d.ts.map +1 -1
package/dist/lib/StringScanner.js +9 -9
package/dist/lib/StringScanner.js.map +1 -1
package/dist/lib/XmlDeclaration.d.ts +30 -0
package/dist/lib/XmlDeclaration.d.ts.map +1 -0
package/dist/lib/XmlDeclaration.js +36 -0
package/dist/lib/XmlDeclaration.js.map +1 -0
package/dist/lib/XmlDocument.d.ts +4 -2
package/dist/lib/XmlDocument.d.ts.map +1 -1
package/dist/lib/XmlDocument.js.map +1 -1
package/dist/lib/XmlDocumentType.d.ts +37 -0
package/dist/lib/XmlDocumentType.d.ts.map +1 -0
package/dist/lib/XmlDocumentType.js +39 -0
package/dist/lib/XmlDocumentType.js.map +1 -0
package/dist/lib/XmlError.d.ts +24 -0
package/dist/lib/XmlError.d.ts.map +1 -0
package/dist/lib/XmlError.js +52 -0
package/dist/lib/XmlError.js.map +1 -0
package/dist/lib/XmlNode.d.ts +20 -1
package/dist/lib/XmlNode.d.ts.map +1 -1
package/dist/lib/XmlNode.js +28 -3
package/dist/lib/XmlNode.js.map +1 -1
package/dist/lib/syntax.d.ts.map +1 -1
package/dist/lib/syntax.js +1 -1
package/dist/lib/syntax.js.map +1 -1
package/dist/lib/types.d.ts +2 -2
package/dist/lib/types.d.ts.map +1 -1
package/package.json +20 -18
package/src/index.ts +3 -0
package/src/lib/Parser.ts +195 -118
package/src/lib/StringScanner.ts +10 -10
package/src/lib/XmlDeclaration.ts +58 -0
package/src/lib/XmlDocument.ts +4 -2
package/src/lib/XmlDocumentType.ts +67 -0
package/src/lib/XmlError.ts +80 -0
package/src/lib/XmlNode.ts +33 -3
package/src/lib/syntax.ts +1 -1

package/src/lib/Parser.ts CHANGED

@@ -2,14 +2,15 @@ import { StringScanner } from './StringScanner.js';
 import * as syntax from './syntax.js';
 import { XmlCdata } from './XmlCdata.js';
 import { XmlComment } from './XmlComment.js';
+import { XmlDeclaration } from './XmlDeclaration.js';
 import { XmlDocument } from './XmlDocument.js';
+import { XmlDocumentType } from './XmlDocumentType.js';
 import { XmlElement } from './XmlElement.js';
+import { XmlError } from './XmlError.js';
+import { XmlNode } from './XmlNode.js';
 import { XmlProcessingInstruction } from './XmlProcessingInstruction.js';
 import { XmlText } from './XmlText.js';
-import type { XmlNode } from './XmlNode.js';
 const emptyString = '';
 /**
@@ -29,11 +30,18 @@ export class Parser {
    * @param options Parser options.
    */
   constructor(xml: string, options: ParserOptions = {}) {
-    this.document = new XmlDocument();
-    this.currentNode = this.document;
+    let doc = this.document = new XmlDocument();
+    let scanner = this.scanner = new StringScanner(xml);
+    this.currentNode = doc;
     this.options = options;
-    this.scanner = new StringScanner(normalizeXmlString(xml));
+    if (this.options.includeOffsets) {
+      doc.start = 0;
+      doc.end = xml.length;
+    }
+    scanner.consumeStringFast('\uFEFF'); // byte order mark
     this.consumeProlog();
     if (!this.consumeElement()) {
@@ -42,7 +50,7 @@ export class Parser {
     while (this.consumeMisc()) {} // eslint-disable-line no-empty
-    if (!this.scanner.isEnd) {
+    if (!scanner.isEnd) {
       throw this.error('Extra content at the end of the document');
     }
   }
@@ -50,35 +58,50 @@ export class Parser {
   /**
    * Adds the given `XmlNode` as a child of `this.currentNode`.
    */
-  addNode(node: XmlNode) {
+  addNode(node: XmlNode, charIndex: number) {
     node.parent = this.currentNode;
+    if (this.options.includeOffsets) {
+      node.start = this.scanner.charIndexToByteIndex(charIndex);
+      node.end = this.scanner.charIndexToByteIndex();
+    }
     // @ts-expect-error: XmlDocument has a more limited set of possible children
     // than XmlElement so TypeScript is unhappy, but we always do the right
     // thing.
     this.currentNode.children.push(node);
+    return true;
   }
   /**
    * Adds the given _text_ to the document, either by appending it to a
    * preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
    */
-  addText(text: string) {
+  addText(text: string, charIndex: number) {
     let { children } = this.currentNode;
     let { length } = children;
+    text = normalizeLineBreaks(text);
     if (length > 0) {
       let prevNode = children[length - 1];
-      if (prevNode instanceof XmlText) {
+      if (prevNode?.type === XmlNode.TYPE_TEXT) {
+        let textNode = prevNode as XmlText;
         // The previous node is a text node, so we can append to it and avoid
         // creating another node.
-        prevNode.text += text;
-        return;
+        textNode.text += text;
+        if (this.options.includeOffsets) {
+          textNode.end = this.scanner.charIndexToByteIndex();
+        }
+        return true;
       }
     }
-    this.addNode(new XmlText(text));
+    return this.addNode(new XmlText(text), charIndex);
   }
   /**
@@ -199,6 +222,7 @@ export class Parser {
    */
   consumeCdataSection(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     if (!scanner.consumeStringFast('<![CDATA[')) {
       return false;
@@ -211,13 +235,9 @@ export class Parser {
       throw this.error('Unclosed CDATA section');
     }
-    if (this.options.preserveCdata) {
-      this.addNode(new XmlCdata(text));
-    } else {
-      this.addText(text);
-    }
-    return true;
+    return this.options.preserveCdata
+      ? this.addNode(new XmlCdata(normalizeLineBreaks(text)), startIndex)
+      : this.addText(text, startIndex);
   }
   /**
@@ -228,6 +248,7 @@ export class Parser {
    */
   consumeCharData(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     let charData = scanner.consumeUntilMatch(syntax.endCharData);
     if (!charData) {
@@ -240,8 +261,7 @@ export class Parser {
       throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
     }
-    this.addText(charData);
-    return true;
+    return this.addText(charData, startIndex);
   }
   /**
@@ -252,6 +272,7 @@ export class Parser {
    */
   consumeComment(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     if (!scanner.consumeStringFast('<!--')) {
       return false;
@@ -268,11 +289,9 @@ export class Parser {
       throw this.error('Unclosed comment');
     }
-    if (this.options.preserveComments) {
-      this.addNode(new XmlComment(content.trim()));
-    }
-    return true;
+    return this.options.preserveComments
+      ? this.addNode(new XmlComment(normalizeLineBreaks(content)), startIndex)
+      : true;
   }
   /**
@@ -285,14 +304,12 @@ export class Parser {
    * @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
    */
   consumeContentReference(): boolean {
+    let startIndex = this.scanner.charIndex;
     let ref = this.consumeReference();
-    if (ref) {
-      this.addText(ref);
-      return true;
-    }
-    return false;
+    return ref
+      ? this.addText(ref, startIndex)
+      : false;
   }
   /**
@@ -306,25 +323,68 @@ export class Parser {
    */
   consumeDoctypeDeclaration(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
-    if (!scanner.consumeStringFast('<!DOCTYPE')
-        || !this.consumeWhitespace()) {
+    if (!scanner.consumeStringFast('<!DOCTYPE')) {
       return false;
     }
-    scanner.consumeMatch(/[^[>]+/y);
+    let name = this.consumeWhitespace()
+      && this.consumeName();
-    if (scanner.consumeMatch(/\[[\s\S]+?\][\x20\t\r\n]*>/y)) {
-      return true;
+    if (!name) {
+      throw this.error('Expected a name');
+    }
+    let publicId;
+    let systemId;
+    if (this.consumeWhitespace()) {
+      if (scanner.consumeStringFast('PUBLIC')) {
+        publicId = this.consumeWhitespace()
+          && this.consumePubidLiteral();
+        if (publicId === false) {
+          throw this.error('Expected a public identifier');
+        }
+        this.consumeWhitespace();
+      }
+      if (publicId !== undefined || scanner.consumeStringFast('SYSTEM')) {
+        this.consumeWhitespace();
+        systemId = this.consumeSystemLiteral();
+        if (systemId === false) {
+          throw this.error('Expected a system identifier');
+        }
+        this.consumeWhitespace();
+      }
+    }
+    let internalSubset;
+    if (scanner.consumeStringFast('[')) {
+      // The internal subset may contain comments that contain `]` characters,
+      // so we can't use `consumeUntilString()` here.
+      internalSubset = scanner.consumeUntilMatch(/\][\x20\t\r\n]*>/);
+      if (!scanner.consumeStringFast(']')) {
+        throw this.error('Unclosed internal subset');
+      }
+      this.consumeWhitespace();
     }
     if (!scanner.consumeStringFast('>')) {
       throw this.error('Unclosed doctype declaration');
     }
-    return true;
-  }
+    return this.options.preserveDocumentType
+      ? this.addNode(new XmlDocumentType(name, publicId, systemId, internalSubset), startIndex)
+      : true;
+    }
   /**
    * Consumes an element if possible.
@@ -334,7 +394,7 @@ export class Parser {
    */
   consumeElement(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
     if (!scanner.consumeStringFast('<')) {
       return false;
@@ -343,12 +403,12 @@ export class Parser {
     let name = this.consumeName();
     if (!name) {
-      scanner.reset(mark);
+      scanner.reset(startIndex);
       return false;
     }
     let attributes = this.consumeAttributes();
-    let isEmpty = Boolean(scanner.consumeStringFast('/>'));
+    let isEmpty = !!scanner.consumeStringFast('/>');
     let element = new XmlElement(name, attributes);
     element.parent = this.currentNode;
@@ -390,8 +450,7 @@ export class Parser {
       this.currentNode = element.parent;
     }
-    this.addNode(element);
-    return true;
+    return this.addNode(element, startIndex);
   }
   /**
@@ -443,7 +502,7 @@ export class Parser {
    */
   consumeProcessingInstruction(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
     if (!scanner.consumeStringFast('<?')) {
       return false;
@@ -453,7 +512,7 @@ export class Parser {
     if (name) {
       if (name.toLowerCase() === 'xml') {
-        scanner.reset(mark);
+        scanner.reset(startIndex);
         throw this.error("XML declaration isn't allowed here");
       }
     } else {
@@ -462,8 +521,7 @@ export class Parser {
     if (!this.consumeWhitespace()) {
       if (scanner.consumeStringFast('?>')) {
-        this.addNode(new XmlProcessingInstruction(name));
-        return true;
+        return this.addNode(new XmlProcessingInstruction(name), startIndex);
       }
       throw this.error('Whitespace is required after a processing instruction name');
@@ -476,8 +534,7 @@ export class Parser {
       throw this.error('Unterminated processing instruction');
     }
-    this.addNode(new XmlProcessingInstruction(name, content));
-    return true;
+    return this.addNode(new XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
   }
   /**
@@ -488,7 +545,7 @@ export class Parser {
    */
   consumeProlog(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
     this.consumeXmlDeclaration();
@@ -498,7 +555,29 @@ export class Parser {
       while (this.consumeMisc()) {} // eslint-disable-line no-empty
     }
-    return mark < scanner.charIndex;
+    return startIndex < scanner.charIndex;
+  }
+  /**
+   * Consumes a public identifier literal if possible.
+   *
+   * @returns
+   *   Value of the public identifier literal minus quotes, or `false` if
+   *   nothing was consumed. An empty string indicates that a public id literal
+   *   was consumed but was empty.
+   *
+   * @see https://www.w3.org/TR/2008/REC-xml-20081126/#NT-PubidLiteral
+   */
+  consumePubidLiteral(): string | false {
+    let startIndex = this.scanner.charIndex;
+    let value = this.consumeSystemLiteral();
+    if (value !== false && !/^[-\x20\r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*$/.test(value)) {
+      this.scanner.reset(startIndex);
+      throw this.error('Invalid character in public identifier');
+    }
+    return value;
   }
   /**
@@ -619,7 +698,7 @@ export class Parser {
    * @see https://www.w3.org/TR/2008/REC-xml-20081126/#white
    */
   consumeWhitespace(): boolean {
-    return Boolean(this.scanner.consumeMatchFn(syntax.isWhitespace));
+    return !!this.scanner.consumeMatchFn(syntax.isWhitespace);
   }
   /**
@@ -630,6 +709,7 @@ export class Parser {
    */
   consumeXmlDeclaration(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     if (!scanner.consumeStringFast('<?xml')) {
       return false;
@@ -639,7 +719,7 @@ export class Parser {
       throw this.error('Invalid XML declaration');
     }
-    let version = Boolean(scanner.consumeStringFast('version'))
+    let version = !!scanner.consumeStringFast('version')
       && this.consumeEqual()
       && this.consumeSystemLiteral();
@@ -649,8 +729,11 @@ export class Parser {
       throw this.error('Invalid character in version number');
     }
+    let encoding;
+    let standalone;
     if (this.consumeWhitespace()) {
-      let encoding = Boolean(scanner.consumeStringFast('encoding'))
+      encoding = !!scanner.consumeStringFast('encoding')
         && this.consumeEqual()
         && this.consumeSystemLiteral();
@@ -658,7 +741,7 @@ export class Parser {
         this.consumeWhitespace();
       }
-      let standalone = Boolean(scanner.consumeStringFast('standalone'))
+      standalone = !!scanner.consumeStringFast('standalone')
         && this.consumeEqual()
         && this.consumeSystemLiteral();
@@ -675,65 +758,21 @@ export class Parser {
       throw this.error('Invalid or unclosed XML declaration');
     }
-    return true;
+    return this.options.preserveXmlDeclaration
+      ? this.addNode(new XmlDeclaration(
+          version,
+          encoding || undefined,
+          (standalone as 'yes' | 'no' | false) || undefined,
+        ), startIndex)
+      : true;
   }
   /**
-   * Throws an error at the current scanner position.
+   * Returns an `XmlError` for the current scanner position.
    */
   error(message: string) {
-    let { charIndex, string: xml } = this.scanner;
-    let column = 1;
-    let excerpt = '';
-    let line = 1;
-    // Find the line and column where the error occurred.
-    for (let i = 0; i < charIndex; ++i) {
-      let char = xml[i];
-      if (char === '\n') {
-        column = 1;
-        excerpt = '';
-        line += 1;
-      } else {
-        column += 1;
-        excerpt += char;
-      }
-    }
-    let eol = xml.indexOf('\n', charIndex);
-    excerpt += eol === -1
-      ? xml.slice(charIndex)
-      : xml.slice(charIndex, eol);
-    let excerptStart = 0;
-    // Keep the excerpt below 50 chars, but always keep the error position in
-    // view.
-    if (excerpt.length > 50) {
-      if (column < 40) {
-        excerpt = excerpt.slice(0, 50);
-      } else {
-        excerptStart = column - 20;
-        excerpt = excerpt.slice(excerptStart, column + 30);
-      }
-    }
-    let err = new Error(
-      `${message} (line ${line}, column ${column})\n`
-        + `  ${excerpt}\n`
-        + ' '.repeat(column - excerptStart + 1) + '^\n',
-    );
-    Object.assign(err, {
-      column,
-      excerpt,
-      line,
-      pos: charIndex,
-    });
-    return err;
+    let { scanner } = this;
+    return new XmlError(message, scanner.charIndex, scanner.string);
   }
   /**
@@ -761,15 +800,19 @@ export class Parser {
 // -- Private Functions --------------------------------------------------------
 /**
- * Normalizes the given XML string by stripping a byte order mark (if present)
- * and replacing CRLF sequences and lone CR characters with LF characters.
+ * Normalizes line breaks in the given text by replacing CRLF sequences and lone
+ * CR characters with LF characters.
  */
-function normalizeXmlString(xml: string): string {
-  if (xml[0] === '\uFEFF') {
-    xml = xml.slice(1);
+function normalizeLineBreaks(text: string): string {
+  let i = 0;
+  while ((i = text.indexOf('\r', i)) !== -1) {
+    text = text[i + 1] === '\n'
+      ? text.slice(0, i) + text.slice(i + 1)
+      : text.slice(0, i) + '\n' + text.slice(i + 1);
   }
-  return xml.replace(/\r\n?/g, '\n');
+  return text;
 }
 // -- Types --------------------------------------------------------------------
@@ -782,6 +825,14 @@ export type ParserOptions = {
    */
   ignoreUndefinedEntities?: boolean;
+  /**
+   * When `true`, the starting and ending byte offsets of each node in the input
+   * string will be made available via `start` and `end` properties on the node.
+   *
+   * @default false
+   */
+  includeOffsets?: boolean;
   /**
    * When `true`, CDATA sections will be preserved in the document as `XmlCdata`
    * nodes. Otherwise CDATA sections will be represented as `XmlText` nodes,
@@ -799,6 +850,32 @@ export type ParserOptions = {
    */
   preserveComments?: boolean;
+  /**
+   * When `true`, a document type declaration (if present) will be preserved in
+   * the document as an `XmlDocumentType` node. Otherwise the declaration will
+   * not be included in the node tree.
+   *
+   * Note that when this is `true` and a document type declaration is present,
+   * the DTD will precede the root node in the node tree (normally the root
+   * node would be first).
+   *
+   * @default false
+   */
+  preserveDocumentType?: boolean;
+  /**
+   * When `true`, an XML declaration (if present) will be preserved in the
+   * document as an `XmlDeclaration` node. Otherwise the declaration will not be
+   * included in the node tree.
+   *
+   * Note that when this is `true` and an XML declaration is present, the
+   * XML declaration will be the first child of the document (normally the root
+   * node would be first).
+   *
+   * @default false
+   */
+  preserveXmlDeclaration?: boolean;
   /**
    * When an undefined named entity is encountered, this function will be called
    * with the entity as its only argument. It should return a string value with

package/src/lib/StringScanner.ts CHANGED

@@ -42,16 +42,6 @@ export class StringScanner {
   // -- Protected Methods ------------------------------------------------------
-  /**
-   * Returns the byte index of the given character index in the string. The two
-   * may differ in strings that contain multibyte characters.
-   */
-  protected charIndexToByteIndex(charIndex: number = this.charIndex): number {
-    return this.multiByteMode
-      ? (this.charsToBytes as number[])[charIndex] ?? Infinity
-      : charIndex;
-  }
   /**
    * Returns the number of characters in the given string, which may differ from
    * the byte length if the string contains multibyte characters.
@@ -75,6 +65,16 @@ export class StringScanner {
     this.charIndex = Math.min(this.charCount, this.charIndex + count);
   }
+  /**
+   * Returns the byte index of the given character index in the string. The two
+   * may differ in strings that contain multibyte characters.
+   */
+  charIndexToByteIndex(charIndex: number = this.charIndex): number {
+    return this.multiByteMode
+      ? (this.charsToBytes as number[])[charIndex] ?? Infinity
+      : charIndex;
+  }
   /**
    * Consumes and returns the given number of characters if possible, advancing
    * the scanner and stopping if the end of the string is reached.

package/src/lib/XmlDeclaration.ts ADDED

@@ -0,0 +1,58 @@
+import { XmlNode } from './XmlNode.js';
+/**
+ * An XML declaration within an XML document.
+ *
+ * @example
+ *
+ * ```xml
+ * <?xml version="1.0" encoding="UTF-8"?>
+ * ```
+ */
+export class XmlDeclaration extends XmlNode {
+  /**
+   * Value of the encoding declaration in this XML declaration, or `null` if no
+   * encoding declaration was present.
+   */
+  encoding: string | null;
+  /**
+   * Value of the standalone declaration in this XML declaration, or `null` if
+   * no standalone declaration was present.
+   */
+  standalone: 'yes' | 'no' | null;
+  /**
+   * Value of the version declaration in this XML declaration.
+   */
+  version: string;
+  constructor(
+    version: string,
+    encoding?: string,
+    standalone?: typeof XmlDeclaration.prototype.standalone,
+  ) {
+    super();
+    this.version = version;
+    this.encoding = encoding ?? null;
+    this.standalone = standalone ?? null;
+  }
+  override get type() {
+    return XmlNode.TYPE_XML_DECLARATION;
+  }
+  override toJSON() {
+    let json = XmlNode.prototype.toJSON.call(this);
+    json.version = this.version;
+    for (let key of ['encoding', 'standalone'] as const) {
+      if (this[key] !== null) {
+        json[key] = this[key];
+      }
+    }
+    return json;
+  }
+}

package/src/lib/XmlDocument.ts CHANGED

@@ -2,6 +2,8 @@ import { XmlElement } from './XmlElement.js';
 import { XmlNode } from './XmlNode.js';
 import type { XmlComment } from './XmlComment.js';
+import type { XmlDeclaration } from './XmlDeclaration.js';
+import type { XmlDocumentType } from './XmlDocumentType.js';
 import type { XmlProcessingInstruction } from './XmlProcessingInstruction.js';
 /**
@@ -12,9 +14,9 @@ export class XmlDocument extends XmlNode {
   /**
    * Child nodes of this document.
    */
-  readonly children: Array<XmlComment | XmlProcessingInstruction | XmlElement>;
+  readonly children: Array<XmlComment | XmlDeclaration | XmlDocumentType | XmlProcessingInstruction | XmlElement>;
-  constructor(children: Array<XmlComment | XmlElement | XmlProcessingInstruction> = []) {
+  constructor(children: Array<XmlComment | XmlDeclaration | XmlDocumentType | XmlElement | XmlProcessingInstruction> = []) {
     super();
     this.children = children;
   }