npm - @wdprlib/parser - Versions diffs - 3.1.2 → 3.2.0 - Mend

@wdprlib/parser 3.1.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124) hide show

package/dist/index.cjs +295 -118
package/dist/index.js +272 -95
package/package.json +5 -3
package/src/index.ts +163 -0
package/src/lexer/index.ts +20 -0
package/src/lexer/lexer.ts +687 -0
package/src/lexer/tokens.ts +141 -0
package/src/parser/constants.ts +173 -0
package/src/parser/depth.ts +251 -0
package/src/parser/index.ts +18 -0
package/src/parser/parse.ts +315 -0
package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
package/src/parser/postprocess/index.ts +15 -0
package/src/parser/postprocess/spanStrip.ts +697 -0
package/src/parser/preprocess/expr.ts +265 -0
package/src/parser/preprocess/index.ts +38 -0
package/src/parser/preprocess/typography.ts +67 -0
package/src/parser/preprocess/utils.ts +250 -0
package/src/parser/preprocess/whitespace.ts +111 -0
package/src/parser/rules/block/align.ts +282 -0
package/src/parser/rules/block/bibliography.ts +359 -0
package/src/parser/rules/block/block-list.ts +689 -0
package/src/parser/rules/block/blockquote.ts +238 -0
package/src/parser/rules/block/center.ts +87 -0
package/src/parser/rules/block/clear-float.ts +75 -0
package/src/parser/rules/block/code.ts +187 -0
package/src/parser/rules/block/collapsible.ts +337 -0
package/src/parser/rules/block/comment.ts +73 -0
package/src/parser/rules/block/content-separator.ts +79 -0
package/src/parser/rules/block/definition-list.ts +270 -0
package/src/parser/rules/block/div.ts +400 -0
package/src/parser/rules/block/embed-block.ts +153 -0
package/src/parser/rules/block/footnoteblock.ts +200 -0
package/src/parser/rules/block/heading.ts +142 -0
package/src/parser/rules/block/horizontal-rule.ts +61 -0
package/src/parser/rules/block/html.ts +222 -0
package/src/parser/rules/block/iframe.ts +239 -0
package/src/parser/rules/block/iftags.ts +150 -0
package/src/parser/rules/block/include.ts +179 -0
package/src/parser/rules/block/index.ts +127 -0
package/src/parser/rules/block/list.ts +244 -0
package/src/parser/rules/block/math.ts +183 -0
package/src/parser/rules/block/module/backlinks/index.ts +31 -0
package/src/parser/rules/block/module/backlinks/types.ts +21 -0
package/src/parser/rules/block/module/categories/index.ts +34 -0
package/src/parser/rules/block/module/categories/types.ts +21 -0
package/src/parser/rules/block/module/css/index.ts +37 -0
package/src/parser/rules/block/module/iftags/condition.ts +109 -0
package/src/parser/rules/block/module/iftags/index.ts +26 -0
package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
package/src/parser/rules/block/module/iftags/types.ts +63 -0
package/src/parser/rules/block/module/include/index.ts +20 -0
package/src/parser/rules/block/module/include/resolve.ts +556 -0
package/src/parser/rules/block/module/index.ts +122 -0
package/src/parser/rules/block/module/join/index.ts +34 -0
package/src/parser/rules/block/module/join/types.ts +23 -0
package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
package/src/parser/rules/block/module/listpages/extract.ts +410 -0
package/src/parser/rules/block/module/listpages/index.ts +83 -0
package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
package/src/parser/rules/block/module/listpages/parser.ts +106 -0
package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
package/src/parser/rules/block/module/listpages/types.ts +513 -0
package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
package/src/parser/rules/block/module/listusers/extract.ts +45 -0
package/src/parser/rules/block/module/listusers/index.ts +36 -0
package/src/parser/rules/block/module/listusers/parser.ts +54 -0
package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
package/src/parser/rules/block/module/listusers/types.ts +93 -0
package/src/parser/rules/block/module/mapping.ts +61 -0
package/src/parser/rules/block/module/page-tree/index.ts +38 -0
package/src/parser/rules/block/module/page-tree/types.ts +29 -0
package/src/parser/rules/block/module/rate/index.ts +28 -0
package/src/parser/rules/block/module/rate/types.ts +19 -0
package/src/parser/rules/block/module/resolve.ts +411 -0
package/src/parser/rules/block/module/types-common.ts +59 -0
package/src/parser/rules/block/module/types.ts +61 -0
package/src/parser/rules/block/module/utils.ts +43 -0
package/src/parser/rules/block/module/walk.ts +380 -0
package/src/parser/rules/block/module.ts +164 -0
package/src/parser/rules/block/orphan-li.ts +177 -0
package/src/parser/rules/block/paragraph.ts +157 -0
package/src/parser/rules/block/table-block.ts +726 -0
package/src/parser/rules/block/table.ts +441 -0
package/src/parser/rules/block/tabview.ts +331 -0
package/src/parser/rules/block/toc.ts +129 -0
package/src/parser/rules/block/utils.ts +615 -0
package/src/parser/rules/index.ts +49 -0
package/src/parser/rules/inline/anchor-name.ts +154 -0
package/src/parser/rules/inline/anchor.ts +327 -0
package/src/parser/rules/inline/bibcite.ts +153 -0
package/src/parser/rules/inline/bold.ts +86 -0
package/src/parser/rules/inline/color.ts +140 -0
package/src/parser/rules/inline/comment.ts +90 -0
package/src/parser/rules/inline/equation-ref.ts +115 -0
package/src/parser/rules/inline/expr.ts +526 -0
package/src/parser/rules/inline/footnote.ts +223 -0
package/src/parser/rules/inline/guillemet.ts +64 -0
package/src/parser/rules/inline/html.ts +132 -0
package/src/parser/rules/inline/image.ts +328 -0
package/src/parser/rules/inline/index.ts +150 -0
package/src/parser/rules/inline/italic.ts +74 -0
package/src/parser/rules/inline/line-break.ts +326 -0
package/src/parser/rules/inline/link-anchor.ts +147 -0
package/src/parser/rules/inline/link-single.ts +164 -0
package/src/parser/rules/inline/link-star.ts +134 -0
package/src/parser/rules/inline/link-triple.ts +267 -0
package/src/parser/rules/inline/math-inline.ts +126 -0
package/src/parser/rules/inline/monospace.ts +78 -0
package/src/parser/rules/inline/raw.ts +262 -0
package/src/parser/rules/inline/size.ts +244 -0
package/src/parser/rules/inline/span.ts +424 -0
package/src/parser/rules/inline/strikethrough.ts +115 -0
package/src/parser/rules/inline/subscript.ts +84 -0
package/src/parser/rules/inline/superscript.ts +84 -0
package/src/parser/rules/inline/text.ts +84 -0
package/src/parser/rules/inline/underline.ts +127 -0
package/src/parser/rules/inline/user.ts +147 -0
package/src/parser/rules/inline/utils.ts +344 -0
package/src/parser/rules/types.ts +252 -0
package/src/parser/rules/utils.ts +155 -0
package/src/parser/toc.ts +130 -0

package/src/parser/rules/block/table.ts ADDED Viewed

@@ -0,0 +1,441 @@
+/**
+ *
+ * Block rule for Wikidot pipe-syntax tables.
+ *
+ * Wikidot tables are written using `||` delimiters at the start of a line:
+ *
+ * ```
+ * || Cell 1 || Cell 2 ||
+ * || Cell 3 || Cell 4 ||
+ * ```
+ *
+ * Cell variants:
+ * - `||` -- normal cell (`<td>`, TABLE_MARKER)
+ * - `||~` -- header cell (`<th>`, TABLE_HEADER)
+ * - `||<` -- left-aligned cell (TABLE_LEFT)
+ * - `||>` -- right-aligned cell (TABLE_RIGHT)
+ * - `||=` -- center-aligned cell (TABLE_CENTER)
+ *
+ * Colspan is achieved by using multiple consecutive `||` before content:
+ * `||||` = colspan 2, `||||||` = colspan 3, etc.
+ *
+ * Key Wikidot behaviour:
+ * - Cells MUST be terminated by another `||` (or variant). Unterminated
+ *   cells (reaching end of line without a closing `||`) are discarded.
+ * - If all cells in a row are unterminated, one empty cell is kept.
+ * - Content within cells supports inline markup (bold, links, etc.).
+ * - Leading and trailing whitespace in cell content is trimmed.
+ *
+ * The table element carries `_source: "pipe"` in its attributes to
+ * distinguish it from block-syntax tables (`[[table]]`).
+ *
+ * @module
+ */
+import type { Element, TableData, TableRow, TableCell, Alignment } from "@wdprlib/ast";
+import type { BlockRule, ParseContext, RuleResult } from "../types";
+import { currentToken } from "../types";
+import type { TokenType } from "../../../lexer/tokens";
+import { canApplyInlineRule } from "../inline/utils";
+/**
+ * Column delimiter token types: each one either opens a table cell or
+ * closes the cell before it.
+ */
+const TABLE_COL_TOKENS: TokenType[] = ["TABLE_MARKER", "TABLE_HEADER", "TABLE_LEFT", "TABLE_CENTER", "TABLE_RIGHT"];
+/**
+ * Reports whether `type` is listed in `TABLE_COL_TOKENS`.
+ *
+ * @param type - Token type to classify.
+ * @returns `true` for a table column delimiter, `false` for anything else.
+ */
+function isTableColToken(type: TokenType): boolean {
+  return TABLE_COL_TOKENS.some((candidate) => candidate === type);
+}
+/**
+ * Properties a cell receives from its opening delimiter run, i.e. the
+ * column delimiter tokens that precede the cell content.
+ */
+interface CellStart {
+  /** Number of delimiter tokens in the opening run; used as the cell's column span. */
+  colspan: number;
+  /** `true` when the run ends with the header delimiter (`||~`). */
+  header: boolean;
+  /** Alignment requested by a styled delimiter (`||<`, `||=`, `||>`); absent otherwise. */
+  align?: Alignment;
+}
+/**
+ * Block rule that turns a run of pipe-syntax lines into one `table` element.
+ *
+ * How it works:
+ * - The rule bails out unless the current token is a column delimiter
+ *   flagged as line start.
+ * - Every following line that also opens with a column delimiter is
+ *   handed to `parseTableRow()` and contributes one row.
+ * - The first token that is missing, not at line start, or not a column
+ *   delimiter ends the table.
+ * - The resulting element carries `_source: "pipe"` in its attributes.
+ */
+export const tableRule: BlockRule = {
+  name: "table",
+  startTokens: ["TABLE_MARKER", "TABLE_HEADER", "TABLE_LEFT", "TABLE_CENTER", "TABLE_RIGHT"],
+  requiresLineStart: true,
+  parse(ctx: ParseContext): RuleResult<Element> {
+    const head = currentToken(ctx);
+    const opensTable = head.lineStart && isTableColToken(head.type);
+    if (!opensTable) {
+      return { success: false };
+    }
+    const rows: TableRow[] = [];
+    // A single cursor is enough: the consumed total is its distance from ctx.pos.
+    let cursor = ctx.pos;
+    let lead = ctx.tokens[cursor];
+    while (lead && lead.lineStart && isTableColToken(lead.type)) {
+      const parsedRow = parseTableRow(ctx, cursor);
+      rows.push(parsedRow.row);
+      cursor += parsedRow.consumed;
+      lead = ctx.tokens[cursor];
+    }
+    const data: TableData = { attributes: { _source: "pipe" }, rows };
+    return {
+      success: true,
+      elements: [{ element: "table", data }],
+      consumed: cursor - ctx.pos,
+    };
+  },
+};
+/**
+ * Reads the delimiter run that opens a cell and derives the cell's
+ * alignment, header flag, and colspan from it.
+ *
+ * A run is any number of plain TABLE_MARKER tokens (`||`), optionally
+ * closed by exactly one styled token (`||~`, `||<`, `||=`, `||>`). Every
+ * token in the run, styled or not, adds one to the colspan, so the colspan
+ * always equals the number of tokens consumed.
+ *
+ * @param ctx      - Parse context.
+ * @param startPos - Token index of the first delimiter token.
+ * @returns The cell properties and consumed count, or `null` when no
+ *          delimiter token is found at `startPos`.
+ */
+function parseCellStart(
+  ctx: ParseContext,
+  startPos: number,
+): { cellStart: CellStart; consumed: number } | null {
+  let cursor = startPos;
+  let align: Alignment | undefined;
+  let header = false;
+  // Plain markers stack up to widen the span.
+  while (ctx.tokens[cursor]?.type === "TABLE_MARKER") {
+    cursor++;
+  }
+  // At most one styled delimiter may close the run.
+  switch (ctx.tokens[cursor]?.type) {
+    case "TABLE_HEADER":
+      header = true;
+      cursor++;
+      break;
+    case "TABLE_LEFT":
+      align = "left";
+      cursor++;
+      break;
+    case "TABLE_CENTER":
+      align = "center";
+      cursor++;
+      break;
+    case "TABLE_RIGHT":
+      align = "right";
+      cursor++;
+      break;
+    default:
+      break;
+  }
+  const span = cursor - startPos;
+  if (span === 0) {
+    return null;
+  }
+  return {
+    cellStart: { align, header, colspan: span },
+    consumed: span,
+  };
+}
+/**
+ * Parses one table row: a single line of `||`-delimited cells.
+ *
+ * The row is read cell by cell until a NEWLINE, EOF, or the end of the
+ * token list. A cell is kept only when `parseTableCell()` reports it as
+ * properly terminated; a delimiter run directly followed by the line end
+ * closes the row without producing a cell. A trailing NEWLINE is consumed
+ * as part of the row. A row that ends up with no cells receives a single
+ * empty placeholder cell.
+ *
+ * @param ctx      - Parse context.
+ * @param startPos - Token index at the first cell delimiter of the row.
+ * @returns The parsed row and consumed token count.
+ */
+function parseTableRow(ctx: ParseContext, startPos: number): { row: TableRow; consumed: number } {
+  // True when the index is past the tokens or sits on NEWLINE / EOF.
+  const atLineEnd = (index: number): boolean => {
+    const tok = ctx.tokens[index];
+    return !tok || tok.type === "NEWLINE" || tok.type === "EOF";
+  };
+  const kept: TableCell[] = [];
+  let cursor = startPos;
+  while (!atLineEnd(cursor)) {
+    const opener = parseCellStart(ctx, cursor);
+    if (opener === null) {
+      // Not a cell delimiter: the row ends here.
+      break;
+    }
+    cursor += opener.consumed;
+    if (atLineEnd(cursor)) {
+      // The delimiter run was the row's closing `||`.
+      break;
+    }
+    const parsed = parseTableCell(ctx, cursor, opener.cellStart);
+    cursor += parsed.consumed;
+    // Wikidot behavior: cells without closing || are discarded
+    if (parsed.terminatedProperly) {
+      kept.push(parsed.cell);
+    }
+  }
+  // The line's NEWLINE belongs to the row.
+  if (ctx.tokens[cursor]?.type === "NEWLINE") {
+    cursor++;
+  }
+  // Wikidot behavior: a row such as "|| Missing end" still yields one empty cell.
+  if (kept.length === 0) {
+    kept.push({
+      header: false,
+      "column-span": 1,
+      align: null,
+      attributes: {},
+      elements: [],
+    });
+  }
+  return {
+    row: { attributes: {}, cells: kept },
+    consumed: cursor - startPos,
+  };
+}
+/**
+ * Parses the content of one table cell, starting right after its opening
+ * delimiter run.
+ *
+ * Leading WHITESPACE tokens are skipped. Content is then collected until a
+ * column delimiter, NEWLINE, EOF, or the end of the token list is reached:
+ * - WHITESPACE + UNDERSCORE followed by NEWLINE or EOF becomes a
+ *   `line-break` element, which lets a cell continue on the next line.
+ * - Any other WHITESPACE token is kept as a text element.
+ * - Every other token is offered to the inline rules in order; the first
+ *   rule that succeeds and consumes at least one token wins. If none does,
+ *   the token's value is emitted as plain text.
+ *
+ * A cell counts as terminated only when collection stopped on a column
+ * delimiter. Unterminated cells are returned with no elements and no
+ * alignment (`terminatedProperly: false`), matching Wikidot behaviour;
+ * their tokens are still counted in `consumed`.
+ *
+ * @param ctx       - Parse context.
+ * @param startPos  - Token index after the cell-start delimiter.
+ * @param cellStart - Properties from the cell-start delimiter sequence.
+ * @returns The parsed cell, consumed count, and termination status.
+ */
+function parseTableCell(
+  ctx: ParseContext,
+  startPos: number,
+  cellStart: CellStart,
+): { cell: TableCell; consumed: number; terminatedProperly: boolean } {
+  let pos = startPos;
+  const children: Element[] = [];
+  // Skip leading whitespace
+  while (ctx.tokens[pos]?.type === "WHITESPACE") {
+    pos++;
+  }
+  const { inlineRules } = ctx;
+  // Parse inline content until next table column token or newline
+  while (pos < ctx.tokens.length) {
+    const token = ctx.tokens[pos];
+    if (!token || token.type === "NEWLINE" || token.type === "EOF") {
+      break;
+    }
+    // Stop at table column tokens
+    if (isTableColToken(token.type)) {
+      break;
+    }
+    if (token.type === "WHITESPACE") {
+      // Underscore line-break pattern: WHITESPACE + UNDERSCORE + NEWLINE/EOF.
+      // Wikidot processes " _\n" before table parsing, replacing it with <br />.
+      // This allows cell content to continue on the next line.
+      const underscore = ctx.tokens[pos + 1];
+      const lineEnd = ctx.tokens[pos + 2];
+      if (
+        underscore?.type === "UNDERSCORE" &&
+        lineEnd &&
+        (lineEnd.type === "NEWLINE" || lineEnd.type === "EOF")
+      ) {
+        children.push({ element: "line-break" });
+        // Swallow the NEWLINE together with " _", but leave an EOF sentinel
+        // in place: no other path in this file consumes EOF, and stepping
+        // over it would report a position past the end of the token list.
+        pos += lineEnd.type === "NEWLINE" ? 3 : 2;
+        continue;
+      }
+      // Interior whitespace is kept as text; the edges are trimmed later.
+      children.push({ element: "text", data: token.value });
+      pos++;
+      continue;
+    }
+    // Try each inline rule
+    const inlineCtx: ParseContext = { ...ctx, pos };
+    let matched = false;
+    for (const rule of inlineRules) {
+      if (!canApplyInlineRule(rule, token)) {
+        continue;
+      }
+      const result = rule.parse(inlineCtx);
+      // A "successful" match that consumes nothing would never advance
+      // `pos` and would spin this loop forever, so it is treated as a miss.
+      if (result.success && result.consumed > 0) {
+        children.push(...result.elements);
+        pos += result.consumed;
+        matched = true;
+        break;
+      }
+    }
+    if (!matched) {
+      // Fallback to text
+      children.push({ element: "text", data: token.value });
+      pos++;
+    }
+  }
+  // Check if cell is properly terminated with table column token
+  // Wikidot behavior: cells without proper termination have empty content
+  // (named `terminator` so it does not shadow the imported `currentToken`).
+  const terminator = ctx.tokens[pos];
+  const terminatedProperly = terminator ? isTableColToken(terminator.type) : false;
+  // Trim leading/trailing whitespace from children
+  const trimmedChildren = terminatedProperly ? trimElements(children) : [];
+  return {
+    cell: {
+      header: cellStart.header,
+      "column-span": cellStart.colspan,
+      align: terminatedProperly ? (cellStart.align ?? null) : null,
+      attributes: {},
+      elements: trimmedChildren,
+    },
+    // Every advance above moved `pos`, so the distance is the consumed count.
+    consumed: pos - startPos,
+    terminatedProperly,
+  };
+}
+/**
+ * Strips whitespace from both edges of an element list.
+ *
+ * From the front, string-valued text elements that are empty after
+ * `trimStart()` are dropped; the first one with remaining content is
+ * replaced by a new text element holding the trimmed string, and scanning
+ * stops. The same is then done from the back with `trimEnd()`. Scanning
+ * also stops at the first element that is not a string-valued text
+ * element, and such elements are left untouched. The input array is never
+ * modified.
+ *
+ * @param elements - The element array to trim.
+ * @returns A new array with edge whitespace removed.
+ */
+function trimElements(elements: Element[]): Element[] {
+  const copy = [...elements];
+  // Advance `lo` past whitespace-only text at the front.
+  let lo = 0;
+  for (; lo < copy.length; lo++) {
+    const el = copy[lo];
+    if (el?.element !== "text" || typeof el.data !== "string") {
+      break;
+    }
+    const rest = el.data.trimStart();
+    if (rest !== "") {
+      copy[lo] = { element: "text", data: rest };
+      break;
+    }
+  }
+  // Pull `hi` back over whitespace-only text at the end.
+  let hi = copy.length;
+  for (; hi > lo; hi--) {
+    const el = copy[hi - 1];
+    if (el?.element !== "text" || typeof el.data !== "string") {
+      break;
+    }
+    const rest = el.data.trimEnd();
+    if (rest !== "") {
+      copy[hi - 1] = { element: "text", data: rest };
+      break;
+    }
+  }
+  return copy.slice(lo, hi);
+}