npm - @wdprlib/parser - Versions diffs - 3.1.1 → 3.2.0 - Mend

@wdprlib/parser 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124) hide show

package/dist/index.cjs +312 -121
package/dist/index.js +289 -98
package/package.json +5 -3
package/src/index.ts +163 -0
package/src/lexer/index.ts +20 -0
package/src/lexer/lexer.ts +687 -0
package/src/lexer/tokens.ts +141 -0
package/src/parser/constants.ts +173 -0
package/src/parser/depth.ts +251 -0
package/src/parser/index.ts +18 -0
package/src/parser/parse.ts +315 -0
package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
package/src/parser/postprocess/index.ts +15 -0
package/src/parser/postprocess/spanStrip.ts +697 -0
package/src/parser/preprocess/expr.ts +265 -0
package/src/parser/preprocess/index.ts +38 -0
package/src/parser/preprocess/typography.ts +67 -0
package/src/parser/preprocess/utils.ts +250 -0
package/src/parser/preprocess/whitespace.ts +111 -0
package/src/parser/rules/block/align.ts +282 -0
package/src/parser/rules/block/bibliography.ts +359 -0
package/src/parser/rules/block/block-list.ts +689 -0
package/src/parser/rules/block/blockquote.ts +238 -0
package/src/parser/rules/block/center.ts +87 -0
package/src/parser/rules/block/clear-float.ts +75 -0
package/src/parser/rules/block/code.ts +187 -0
package/src/parser/rules/block/collapsible.ts +337 -0
package/src/parser/rules/block/comment.ts +73 -0
package/src/parser/rules/block/content-separator.ts +79 -0
package/src/parser/rules/block/definition-list.ts +270 -0
package/src/parser/rules/block/div.ts +400 -0
package/src/parser/rules/block/embed-block.ts +153 -0
package/src/parser/rules/block/footnoteblock.ts +200 -0
package/src/parser/rules/block/heading.ts +142 -0
package/src/parser/rules/block/horizontal-rule.ts +61 -0
package/src/parser/rules/block/html.ts +222 -0
package/src/parser/rules/block/iframe.ts +239 -0
package/src/parser/rules/block/iftags.ts +150 -0
package/src/parser/rules/block/include.ts +179 -0
package/src/parser/rules/block/index.ts +127 -0
package/src/parser/rules/block/list.ts +244 -0
package/src/parser/rules/block/math.ts +183 -0
package/src/parser/rules/block/module/backlinks/index.ts +31 -0
package/src/parser/rules/block/module/backlinks/types.ts +21 -0
package/src/parser/rules/block/module/categories/index.ts +34 -0
package/src/parser/rules/block/module/categories/types.ts +21 -0
package/src/parser/rules/block/module/css/index.ts +37 -0
package/src/parser/rules/block/module/iftags/condition.ts +109 -0
package/src/parser/rules/block/module/iftags/index.ts +26 -0
package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
package/src/parser/rules/block/module/iftags/types.ts +63 -0
package/src/parser/rules/block/module/include/index.ts +20 -0
package/src/parser/rules/block/module/include/resolve.ts +556 -0
package/src/parser/rules/block/module/index.ts +122 -0
package/src/parser/rules/block/module/join/index.ts +34 -0
package/src/parser/rules/block/module/join/types.ts +23 -0
package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
package/src/parser/rules/block/module/listpages/extract.ts +410 -0
package/src/parser/rules/block/module/listpages/index.ts +83 -0
package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
package/src/parser/rules/block/module/listpages/parser.ts +106 -0
package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
package/src/parser/rules/block/module/listpages/types.ts +513 -0
package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
package/src/parser/rules/block/module/listusers/extract.ts +45 -0
package/src/parser/rules/block/module/listusers/index.ts +36 -0
package/src/parser/rules/block/module/listusers/parser.ts +54 -0
package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
package/src/parser/rules/block/module/listusers/types.ts +93 -0
package/src/parser/rules/block/module/mapping.ts +61 -0
package/src/parser/rules/block/module/page-tree/index.ts +38 -0
package/src/parser/rules/block/module/page-tree/types.ts +29 -0
package/src/parser/rules/block/module/rate/index.ts +28 -0
package/src/parser/rules/block/module/rate/types.ts +19 -0
package/src/parser/rules/block/module/resolve.ts +411 -0
package/src/parser/rules/block/module/types-common.ts +59 -0
package/src/parser/rules/block/module/types.ts +61 -0
package/src/parser/rules/block/module/utils.ts +43 -0
package/src/parser/rules/block/module/walk.ts +380 -0
package/src/parser/rules/block/module.ts +164 -0
package/src/parser/rules/block/orphan-li.ts +177 -0
package/src/parser/rules/block/paragraph.ts +157 -0
package/src/parser/rules/block/table-block.ts +726 -0
package/src/parser/rules/block/table.ts +441 -0
package/src/parser/rules/block/tabview.ts +331 -0
package/src/parser/rules/block/toc.ts +129 -0
package/src/parser/rules/block/utils.ts +615 -0
package/src/parser/rules/index.ts +49 -0
package/src/parser/rules/inline/anchor-name.ts +154 -0
package/src/parser/rules/inline/anchor.ts +327 -0
package/src/parser/rules/inline/bibcite.ts +153 -0
package/src/parser/rules/inline/bold.ts +86 -0
package/src/parser/rules/inline/color.ts +140 -0
package/src/parser/rules/inline/comment.ts +90 -0
package/src/parser/rules/inline/equation-ref.ts +115 -0
package/src/parser/rules/inline/expr.ts +526 -0
package/src/parser/rules/inline/footnote.ts +223 -0
package/src/parser/rules/inline/guillemet.ts +64 -0
package/src/parser/rules/inline/html.ts +132 -0
package/src/parser/rules/inline/image.ts +328 -0
package/src/parser/rules/inline/index.ts +150 -0
package/src/parser/rules/inline/italic.ts +74 -0
package/src/parser/rules/inline/line-break.ts +326 -0
package/src/parser/rules/inline/link-anchor.ts +147 -0
package/src/parser/rules/inline/link-single.ts +164 -0
package/src/parser/rules/inline/link-star.ts +134 -0
package/src/parser/rules/inline/link-triple.ts +267 -0
package/src/parser/rules/inline/math-inline.ts +126 -0
package/src/parser/rules/inline/monospace.ts +78 -0
package/src/parser/rules/inline/raw.ts +262 -0
package/src/parser/rules/inline/size.ts +244 -0
package/src/parser/rules/inline/span.ts +424 -0
package/src/parser/rules/inline/strikethrough.ts +115 -0
package/src/parser/rules/inline/subscript.ts +84 -0
package/src/parser/rules/inline/superscript.ts +84 -0
package/src/parser/rules/inline/text.ts +84 -0
package/src/parser/rules/inline/underline.ts +127 -0
package/src/parser/rules/inline/user.ts +147 -0
package/src/parser/rules/inline/utils.ts +344 -0
package/src/parser/rules/types.ts +252 -0
package/src/parser/rules/utils.ts +155 -0
package/src/parser/toc.ts +130 -0

package/src/parser/rules/types.ts ADDED Viewed

@@ -0,0 +1,252 @@
+import type { Token, TokenType } from "../../lexer";
+import type { Version, WikitextSettings, Diagnostic } from "@wdprlib/ast";
+import type { Element, CodeBlockData, TocEntry } from "@wdprlib/ast";
+/**
+ * Per-scope state propagated by spread + override semantics.
+ *
+ * Every field is `readonly` so a rule cannot accidentally mutate the
+ * parent scope by writing through a shared reference. Updates must be
+ * expressed as a replacement: `ctx.scope = { ...ctx.scope, X: ... }`
+ * (or, more commonly, by constructing a new child context with the
+ * desired scope override).
+ *
+ * The motivation is to keep speculative parse rollback safe: when a
+ * block rule fails, any scope it built up is discarded with the failed
+ * context. A shared-state design that mutates fields in place does not
+ * survive rollback — grouping per-scope fields here and forbidding
+ * nested mutation makes the semantics explicit at the type level.
+ *
+ * `readonly` is a compile-time restriction only; nothing in this
+ * declaration freezes the object at runtime.
+ *
+ * `footnoteBlockParsed` is the only required field. The other three
+ * fields are optional and may be omitted when building a scope object.
+ */
+export interface ScopeContext {
+  /**
+   * Close condition for the current block. The paragraph parser calls
+   * it to decide when to stop collecting inline content.
+   *
+   * Receives the current parse context and returns `true` when the
+   * enclosing block should be considered closed.
+   */
+  readonly blockCloseCondition?: (ctx: ParseContext) => boolean;
+  /**
+   * Block names excluded from paragraph-boundary detection. When a
+   * BLOCK_OPEN/BLOCK_END_OPEN for an excluded name appears at line
+   * start, the inline parser does NOT treat it as a paragraph break.
+   * Used by `[[collapsible]]` to prevent nested `[[collapsible]]` from
+   * splitting paragraphs.
+   *
+   * Typed as `ReadonlySet`, so the set cannot be extended in place
+   * through this reference; build a new set to add names.
+   */
+  readonly excludedBlockNames?: ReadonlySet<string>;
+  /**
+   * Budget for div nesting: tracks how many more nested divs can open.
+   * When 0, the div rule fails (innermost excess opens become text).
+   * `undefined` means "not yet calculated" (top-level or non-div context).
+   */
+  readonly divClosesBudget?: number;
+  /**
+   * Used by the footnote-block rule to reject duplicate occurrences.
+   *
+   * **Scope is per spread copy of `ParseContext`, not document-global.**
+   * `parseBlocksUntil` creates a fresh `{ ...ctx, pos, ... }` on every
+   * iteration, so the flag does not propagate between sibling rules in
+   * a body, between sibling bodies, or up to the top-level parser.
+   *
+   * Practical effect today:
+   * - Two `[[footnoteblock]]` at the top level: the second one fails
+   *   (the top-level dispatch hands the parser's own `ctx` to rules,
+   *   so mutations are visible to the next top-level iteration).
+   * - Two `[[footnoteblock]]` inside the same body, or across nested
+   *   bodies: both currently succeed, even though Wikidot's
+   *   "first-only" rule should reject the duplicate.
+   *
+   * Fixing the cross-scope case requires either an AST-level dedup pass
+   * after parsing (similar to the auto-append walk) or a shared-state
+   * design with proper rollback for speculative parses. Tracked
+   * separately; this flag intentionally keeps the original primitive
+   * semantics to avoid regressing the top-level duplicate-rejection
+   * test fixtures.
+   *
+   * The auto-append decision in `Parser.parse` deliberately ignores
+   * this flag and walks the final AST instead — see `containsFootnoteBlock`.
+   *
+   * NOTE(review): this field is declared `readonly`, so the "mutations"
+   * mentioned above presumably mean replacing `ctx.scope` on the shared
+   * top-level context rather than writing this field in place — confirm
+   * against the footnote-block rule.
+   */
+  readonly footnoteBlockParsed: boolean;
+}
+/**
+ * Parser context passed to rules.
+ *
+ * Fields are grouped by lifecycle:
+ * - Static config (`tokens`, `version`, `trackPositions`, `settings`,
+ *   and the rule arrays `blockRules`, `blockFallbackRule`,
+ *   `inlineRules`): constructor-fixed.
+ * - `pos`: per-scope cursor; kept top-level for ergonomics because
+ *   every rule spread overrides it.
+ * - Accumulators (`footnotes`, `tocEntries`, `codeBlocks`, `htmlBlocks`,
+ *   `bibcites`, `diagnostics`): reference-shared via array identity
+ *   across spreads, so a push made through one copy of the context is
+ *   seen through every other copy holding the same array.
+ * - `scope`: per-scope state explicitly grouped; see {@link ScopeContext}.
+ *
+ * Unlike the fields of `ScopeContext`, none of the properties declared
+ * here are marked `readonly`.
+ */
+export interface ParseContext {
+  tokens: Token[];
+  pos: number; // index into `tokens` of the token currently being examined
+  version: Version;
+  trackPositions: boolean;
+  settings: WikitextSettings;
+  // Accumulators that feed the SyntaxTree output
+  footnotes: Element[][];
+  tocEntries: TocEntry[];
+  codeBlocks: CodeBlockData[];
+  htmlBlocks: string[];
+  // Bibliography citation labels collected during parsing
+  bibcites: string[];
+  // Rule tables (injected here, rather than imported by rules, to avoid a circular dependency)
+  blockRules: BlockRule[];
+  blockFallbackRule: BlockRule;
+  inlineRules: InlineRule[];
+  // Diagnostics collected during parsing
+  diagnostics: Diagnostic[];
+  // Per-scope state (readonly fields, immutable-replace semantics).
+  scope: ScopeContext;
+}
+/**
+ * Result of a rule attempt, discriminated on `success`.
+ *
+ * - `{ success: true, elements, consumed }`: the rule matched.
+ *   `elements` may hold zero, one, or several nodes. `consumed` is
+ *   presumably the number of tokens the rule advanced past — TODO
+ *   confirm against the callers that apply it to `pos`.
+ * - `{ success: false }`: the rule did not match. This variant carries
+ *   no other fields, so check `success` before reading `elements`.
+ *
+ * During migration: T can be either internal AST node or Element
+ */
+export type RuleResult<T> = { success: true; elements: T[]; consumed: number } | { success: false };
+/**
+ * Block rule interface.
+ *
+ * `name`, `startTokens`, `requiresLineStart` and `parse` are required.
+ * `isStartPattern` and `preservesPrecedingLineBreak` are optional hooks
+ * that a rule may leave undefined.
+ */
+export interface BlockRule {
+  /** Rule name for debugging */
+  name: string;
+  /** Token types that can start this rule */
+  startTokens: TokenType[];
+  /** Whether this rule requires line start */
+  requiresLineStart: boolean;
+  /**
+   * Try to parse this block.
+   *
+   * @param ctx - Parse context positioned where the attempt should start
+   * @returns A successful result carrying the produced elements, or
+   *   `{ success: false }` when the rule does not apply
+   */
+  parse(ctx: ParseContext): RuleResult<Element>;
+  /**
+   * Check if tokens at the given position match this rule's start pattern.
+   * Used by inline parser to determine behavior before a block boundary
+   * (e.g. whether to generate a trailing line-break).
+   *
+   * Takes the position as an explicit `pos` argument in addition to
+   * the context.
+   */
+  isStartPattern?(ctx: ParseContext, pos: number): boolean;
+  /**
+   * When true, a single newline before this block becomes a line-break.
+   * Wikidot's Divalign expands content inline, so \n before nested blocks
+   * becomes <br />. Other blocks (Code, Div, etc.) suppress this.
+   *
+   * NOTE(review): an omitted value is presumably treated the same as
+   * `false` — confirm in the inline parser.
+   */
+  preservesPrecedingLineBreak?: boolean;
+}
+/**
+ * Inline rule interface.
+ *
+ * A reduced form of `BlockRule`: it has the same `name`, `startTokens`
+ * and `parse` members but declares no line-start requirement and no
+ * optional look-ahead hooks.
+ */
+export interface InlineRule {
+  /** Rule name for debugging */
+  name: string;
+  /** Token types that can start this rule */
+  startTokens: TokenType[];
+  /**
+   * Try to parse this inline element.
+   *
+   * @param ctx - Parse context positioned where the attempt should start
+   * @returns A successful result carrying the produced elements, or
+   *   `{ success: false }` when the rule does not apply
+   */
+  parse(ctx: ParseContext): RuleResult<Element>;
+}
+/**
+ * Helper to get current token.
+ *
+ * @param ctx - Parse context whose `tokens` and `pos` are read
+ * @returns The token at `ctx.pos`, or a freshly built EOF token when
+ *   that slot is `undefined`/`null` (for example when `pos` is past the
+ *   end of the array), so callers never receive `undefined`
+ */
+export function currentToken(ctx: ParseContext): Token {
+  return ctx.tokens[ctx.pos] ?? eofToken();
+}
+/**
+ * Helper to peek ahead without moving the cursor.
+ *
+ * @param ctx - Parse context whose `tokens` and `pos` are read
+ * @param n - Offset from `ctx.pos`; defaults to 1 (the next token)
+ * @returns The token at `ctx.pos + n`, or a freshly built EOF token
+ *   when that slot is `undefined`/`null`
+ */
+export function peekToken(ctx: ParseContext, n = 1): Token {
+  return ctx.tokens[ctx.pos + n] ?? eofToken();
+}
+/**
+ * Helper to check token type.
+ *
+ * @param ctx - Parse context whose current token is inspected
+ * @param type - Token type to compare against
+ * @returns `true` when the current token (or the EOF stand-in returned
+ *   by `currentToken` when out of range) has exactly this type
+ */
+export function checkToken(ctx: ParseContext, type: TokenType): boolean {
+  return currentToken(ctx).type === type;
+}
+/**
+ * Helper to check if at end.
+ *
+ * @param ctx - Parse context whose `tokens` and `pos` are read
+ * @returns `true` when `ctx.pos` is at or beyond `ctx.tokens.length`,
+ *   or when the current token has type "EOF"
+ */
+export function isAtEnd(ctx: ParseContext): boolean {
+  return ctx.pos >= ctx.tokens.length || currentToken(ctx).type === "EOF";
+}
+/**
+ * Create EOF token.
+ *
+ * Module-private helper used as the fallback by `currentToken` and
+ * `peekToken`. Each call builds a new object with type "EOF", an empty
+ * value, `lineStart` false, and a position whose start and end are
+ * both line 0, column 0, offset 0 — it does not reflect any real
+ * location in the input.
+ *
+ * @returns A new synthetic EOF token
+ */
+function eofToken(): Token {
+  return {
+    type: "EOF",
+    value: "",
+    position: { start: { line: 0, column: 0, offset: 0 }, end: { line: 0, column: 0, offset: 0 } },
+    lineStart: false,
+  };
+}
+/**
+ * Check if closing marker exists before newline.
+ * If markerValue is provided, also check that the token value matches.
+ *
+ * The scan starts at `ctx.pos` itself (that token is examined too) and
+ * moves forward one token at a time. The end-of-line test runs before
+ * the marker test, so a `markerType` of "NEWLINE" or "EOF" can never
+ * match. `ctx` is only read; the cursor is not advanced.
+ *
+ * @param ctx - Parse context supplying `tokens` and the starting `pos`
+ * @param markerType - Token type of the closing marker to look for
+ * @param markerValue - Optional exact token value the marker must have
+ * @returns `true` if a matching marker is found before a NEWLINE, an
+ *   EOF token, a missing token, or the end of the token array;
+ *   otherwise `false`
+ */
+export function hasClosingMarkerBeforeNewline(
+  ctx: ParseContext,
+  markerType: TokenType,
+  markerValue?: string,
+): boolean {
+  let pos = ctx.pos; // local cursor; ctx.pos is left untouched
+  while (pos < ctx.tokens.length) {
+    const token = ctx.tokens[pos];
+    if (!token || token.type === "NEWLINE" || token.type === "EOF") {
+      return false; // the line ended before any matching marker was seen
+    }
+    if (token.type === markerType) {
+      if (markerValue === undefined || token.value === markerValue) {
+        return true;
+      }
+    }
+    pos++;
+  }
+  return false;
+}
+/**
+ * Check if closing marker exists before paragraph break (double newline).
+ * Allows inline formatting to span multiple lines within a paragraph.
+ *
+ * A paragraph break is a NEWLINE token that is followed, after skipping
+ * any run of WHITESPACE tokens, by another NEWLINE, an EOF token, or
+ * the end of the token array. A NEWLINE followed by anything else does
+ * not stop the scan. The scan starts at `ctx.pos` itself and only reads
+ * `ctx`; the cursor is not advanced.
+ *
+ * @param ctx - Parse context supplying `tokens` and the starting `pos`
+ * @param markerType - Token type of the closing marker to look for
+ * @param markerValue - Optional exact token value the marker must have
+ * @returns `true` if a matching marker is found before a paragraph
+ *   break, an EOF token, a missing token, or the end of the token
+ *   array; otherwise `false`
+ */
+export function hasClosingMarkerBeforeParagraphBreak(
+  ctx: ParseContext,
+  markerType: TokenType,
+  markerValue?: string,
+): boolean {
+  let pos = ctx.pos; // local cursor; ctx.pos is left untouched
+  while (pos < ctx.tokens.length) {
+    const token = ctx.tokens[pos];
+    if (!token || token.type === "EOF") {
+      return false;
+    }
+    // Check for paragraph break (NEWLINE followed by NEWLINE after optional whitespace)
+    if (token.type === "NEWLINE") {
+      let lookAhead = 1; // offset from pos of the first token after the NEWLINE
+      while (ctx.tokens[pos + lookAhead]?.type === "WHITESPACE") {
+        lookAhead++;
+      }
+      if (
+        ctx.tokens[pos + lookAhead]?.type === "NEWLINE" ||
+        ctx.tokens[pos + lookAhead]?.type === "EOF" ||
+        !ctx.tokens[pos + lookAhead]
+      ) {
+        return false; // Paragraph break - stop
+      }
+    }
+    if (token.type === markerType) {
+      if (markerValue === undefined || token.value === markerValue) {
+        return true;
+      }
+    }
+    pos++;
+  }
+  return false;
+}

package/src/parser/rules/utils.ts ADDED Viewed

@@ -0,0 +1,155 @@
+/**
+ * Common utilities shared between block and inline rules
+ */
+import type { ParseContext } from "./types";
+// =============================================================================
+// Attribute Safety
+// =============================================================================
+/**
+ * Allowlist of attribute names, all lower-case, that
+ * `filterUnsafeAttributes` lets through after lower-casing the incoming
+ * name. Event handler attributes (on*) are blocked entirely: none are
+ * listed here, and the filter also drops any name starting with "on"
+ * before this set is consulted.
+ *
+ * The list includes URL-bearing names (`href`, `src`, `srcset`,
+ * `poster`, `background`, `cite`) and `style`; the filter in this file
+ * checks names only and copies those values through unchanged.
+ * NOTE(review): value-level sanitizing (e.g. URL schemes) presumably
+ * happens elsewhere — confirm.
+ */
+const SAFE_ATTRIBUTES = new Set([
+  "accept",
+  "align",
+  "alt",
+  "autocapitalize",
+  "autoplay",
+  "background",
+  "bgcolor",
+  "border",
+  "buffered",
+  "checked",
+  "cite",
+  "class",
+  "cols",
+  "colspan",
+  "contenteditable",
+  "controls",
+  "coords",
+  "datetime",
+  "decoding",
+  "default",
+  "dir",
+  "dirname",
+  "disabled",
+  "download",
+  "draggable",
+  "for",
+  "form",
+  "headers",
+  "height",
+  "hidden",
+  "high",
+  "href",
+  "hreflang",
+  "id",
+  "inputmode",
+  "ismap",
+  "itemprop",
+  "kind",
+  "label",
+  "lang",
+  "list",
+  "loop",
+  "low",
+  "max",
+  "maxlength",
+  "min",
+  "minlength",
+  "multiple",
+  "muted",
+  "name",
+  "optimum",
+  "pattern",
+  "placeholder",
+  "poster",
+  "preload",
+  "readonly",
+  "required",
+  "reversed",
+  "role",
+  "rows",
+  "rowspan",
+  "scope",
+  "selected",
+  "shape",
+  "size",
+  "sizes",
+  "span",
+  "spellcheck",
+  "src",
+  "srclang",
+  "srcset",
+  "start",
+  "step",
+  "style",
+  "tabindex",
+  "target",
+  "title",
+  "translate",
+  "type",
+  "usemap",
+  "value",
+  "width",
+  "wrap",
+]);
+/**
+ * Filter unsafe HTML attributes (blocks event handlers, allows safe attributes + aria-* / data-*)
+ *
+ * Names are compared case-insensitively, but each surviving entry keeps
+ * the key exactly as the caller spelled it. Checks run in this order:
+ * 1. Any name starting with "on" is dropped (a plain prefix test).
+ * 2. Names starting with "aria-" or "data-" are kept with their value
+ *    unchanged.
+ * 3. Any other name must be in `SAFE_ATTRIBUTES`, otherwise it is dropped.
+ * 4. For `id`, the value gets a "u-" prefix unless it already starts
+ *    with "u-"; all other allowed values are copied unchanged.
+ *
+ * @param attrs - Attribute name/value pairs to filter; not modified
+ * @returns A new object containing only the attributes that passed
+ */
+export function filterUnsafeAttributes(attrs: Record<string, string>): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const [key, value] of Object.entries(attrs)) {
+    const lower = key.toLowerCase();
+    if (lower.startsWith("on")) continue; // event handlers are never allowed
+    if (lower.startsWith("aria-") || lower.startsWith("data-")) {
+      result[key] = value;
+      continue;
+    }
+    if (!SAFE_ATTRIBUTES.has(lower)) continue; // not on the allowlist
+    // Wikidot prefixes user-set IDs with "u-"
+    if (lower === "id") {
+      result[key] = value.startsWith("u-") ? value : `u-${value}`;
+      continue;
+    }
+    result[key] = value;
+  }
+  return result;
+}
+// =============================================================================
+// Block Name Parsing
+// =============================================================================
+/**
+ * Parse block name from tokens (handles [[name or [[/name)
+ * Handles underscore suffix like "div_" which may be tokenized as [IDENTIFIER "div"] [UNDERSCORE "_"]
+ *
+ * Only `ctx.tokens` is read; `ctx.pos` is neither read nor changed. The
+ * name is lower-cased. At most one trailing UNDERSCORE token is folded
+ * into the name.
+ *
+ * @param ctx - Parse context supplying the token array
+ * @param startPos - Index of the token expected to hold the block name
+ *   (presumably the token right after the opening bracket token —
+ *   confirm against callers)
+ * @returns The lower-cased name and the number of tokens it spans
+ *   (1, or 2 when an UNDERSCORE token immediately follows), or `null`
+ *   when the token at `startPos` is missing or is neither TEXT nor
+ *   IDENTIFIER
+ */
+export function parseBlockName(
+  ctx: ParseContext,
+  startPos: number,
+): { name: string; consumed: number } | null {
+  let pos = startPos;
+  let consumed = 0;
+  // Wikidot does NOT allow whitespace between [[ and block name
+  // e.g. [[ code ]] is treated as plain text, not a code block
+  const token = ctx.tokens[pos];
+  if (!token || (token.type !== "TEXT" && token.type !== "IDENTIFIER")) {
+    return null;
+  }
+  // Base name
+  let name = token.value.toLowerCase();
+  consumed++;
+  pos++;
+  // Check for underscore suffix (e.g., "div_" -> "div" + "_")
+  if (ctx.tokens[pos]?.type === "UNDERSCORE") {
+    name += "_";
+    consumed++;
+  }
+  return { name, consumed };
+}

package/src/parser/toc.ts ADDED Viewed

@@ -0,0 +1,130 @@
+/**
+ *
+ * Table of Contents (TOC) generation for Wikidot markup.
+ *
+ * Converts a flat array of `TocEntry` items (collected from heading elements
+ * during parsing) into nested bullet-list `Element` nodes suitable for rendering
+ * as `[[toc]]`. Uses the depth module to transform flat heading levels (h1-h6)
+ * into a properly nested list hierarchy.
+ *
+ * Each TOC entry becomes an anchor link (`#toc0`, `#toc1`, ...) pointing to the
+ * corresponding heading in the rendered page, matching Wikidot's original
+ * anchor naming scheme.
+ *
+ * @module
+ */
+import type { Element, TocEntry, ListItem } from "@wdprlib/ast";
+import { processDepths, type DepthList, type DepthItem } from "./depth";
+/**
+ * Sequential counter for generating unique TOC anchor IDs.
+ *
+ * Wikidot assigns sequential `#toc0`, `#toc1`, ... anchors to headings in
+ * document order. This class maintains a monotonically increasing counter
+ * that is shared across all TOC trees to ensure globally unique anchors.
+ *
+ * The counter starts at 0 for every new instance and is only ever
+ * advanced through `next()`; there is no way to reset or read it
+ * without incrementing.
+ */
+class TocIndexer {
+  private index = 0;
+  /**
+   * Returns the next sequential index and advances the counter.
+   * @returns The current index value (0-based) before incrementing
+   */
+  next(): number {
+    return this.index++;
+  }
+}
+/**
+ * Build a nested bullet-list Element from depth-processed TOC items.
+ *
+ * Each item in the depth list is converted to a `ListItem`, with nested lists
+ * becoming sub-list items and leaf items becoming anchor links. Items are
+ * converted in array order via `buildTocListItem`, which is what advances
+ * the shared indexer. The returned list always has an empty `attributes`
+ * object.
+ *
+ * @param indexer - Shared counter for generating sequential `#tocN` anchors
+ * @param items - Depth-processed list of heading text strings
+ * @returns A `list` Element with type "bullet" containing the TOC hierarchy
+ */
+function buildTocList(indexer: TocIndexer, items: DepthList<null, string>): Element {
+  const listItems: ListItem[] = items.map((item) => buildTocListItem(indexer, item));
+  return {
+    element: "list",
+    data: {
+      type: "bullet",
+      attributes: {},
+      items: listItems,
+    },
+  };
+}
+/**
+ * Build a single TOC list item from a depth item.
+ *
+ * For leaf items, creates an anchor link element with a `#tocN` href.
+ * For nested list items, recursively builds a sub-list.
+ *
+ * Only leaf items call `indexer.next()`; a nested list consumes no index
+ * of its own, so anchor numbers follow the order in which leaves are
+ * visited (children left to right, depth first).
+ *
+ * @param indexer - Shared counter for generating sequential `#tocN` anchors
+ * @param item - A single depth item (either a leaf heading or a nested list)
+ * @returns A `ListItem` for inclusion in the TOC list: a "sub-list" item
+ *   for nested lists, or an "elements" item wrapping one link element
+ *   whose label text is the item's `value`
+ */
+function buildTocListItem(indexer: TocIndexer, item: DepthItem<null, string>): ListItem {
+  if (item.kind === "list") {
+    return {
+      "item-type": "sub-list",
+      element: "list",
+      data: {
+        type: "bullet",
+        attributes: {},
+        items: item.children.map((child) => buildTocListItem(indexer, child)),
+      },
+    };
+  }
+  // item.kind === "item"
+  const anchor = `#toc${indexer.next()}`; // consumes one index per leaf heading
+  const linkElement: Element = {
+    element: "link",
+    data: {
+      type: "table-of-contents",
+      link: anchor,
+      extra: null,
+      label: { text: item.value },
+      target: null,
+    },
+  };
+  return {
+    "item-type": "elements",
+    attributes: {},
+    elements: [linkElement],
+  };
+}
+/**
+ * Convert flat TocEntry[] to nested List elements
+ *
+ * Each entry's 1-based `level` is turned into a 0-based depth, the
+ * entries are grouped into trees by `processDepths`, and every tree is
+ * rendered with `buildTocList`. A single `TocIndexer` is created per
+ * call and shared by all trees, so anchor numbering restarts at
+ * `#toc0` on each call and continues across trees within it.
+ *
+ * @param entries - Flat list of TOC entries with level and text
+ * @returns Array of List elements (usually one, but can be multiple if levels reset);
+ *   an empty array when `entries` is empty
+ */
+export function buildTableOfContents(entries: TocEntry[]): Element[] {
+  if (entries.length === 0) {
+    return [];
+  }
+  // Convert entries to depth-annotated items
+  // level is 1-based (h1=1, h2=2, ...), convert to 0-based depth
+  const depthItems = entries.map((entry) => ({
+    depth: entry.level - 1,
+    ltype: null as null, // We don't differentiate list types for TOC
+    value: entry.text,
+  }));
+  // Process into nested structure
+  const trees = processDepths<null, string>(null, depthItems);
+  // Build List elements from each tree
+  const indexer = new TocIndexer(); // one counter shared by every tree of this call
+  return trees.map((tree) => buildTocList(indexer, tree.list));
+}