npm - @beyondwork/docx-react-component - Versions diffs - 1.0.76 → 1.0.78 - Mend

@beyondwork/docx-react-component 1.0.76 → 1.0.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

package/package.json +1 -1
package/src/api/v3/ai/resolve.ts +104 -4
package/src/io/ooxml/parse-bookmark-references.ts +123 -0
package/src/io/ooxml/parse-footnotes.ts +26 -3
package/src/io/ooxml/parse-headers-footers.ts +96 -1
package/src/io/ooxml/parse-main-document.ts +256 -4
package/src/io/ooxml/parse-shapes.ts +58 -4
package/src/io/ooxml/table-opaque-preservation.ts +70 -5
package/src/model/canonical-document.ts +45 -8
package/src/runtime/scopes/action-validation.ts +39 -12
package/src/runtime/scopes/index.ts +3 -0
package/src/runtime/scopes/resolve-reference.ts +99 -43
package/src/session/import/loader-types.ts +26 -0
package/src/session/import/loader.ts +12 -2
package/src/ui-tailwind/chrome-overlay/tw-chrome-overlay.tsx +12 -0
package/src/ui-tailwind/editor-surface/perf-probe.ts +3 -0
package/src/ui-tailwind/editor-surface/pm-decorations.ts +44 -0
package/src/ui-tailwind/editor-surface/preserve-position.ts +28 -9
package/src/ui-tailwind/editor-surface/tw-prosemirror-surface.tsx +13 -13
package/src/ui-tailwind/page-stack/tw-active-band-ribbon.tsx +229 -0
package/src/ui-tailwind/page-stack/tw-page-chrome-entry.tsx +15 -1
package/src/ui-tailwind/page-stack/tw-page-footer-band.tsx +18 -0
package/src/ui-tailwind/page-stack/tw-page-header-band.tsx +20 -0
package/src/ui-tailwind/page-stack/tw-page-stack-chrome-layer.tsx +10 -0
package/src/ui-tailwind/tw-review-workspace.tsx +56 -6

package/src/io/ooxml/parse-main-document.ts CHANGED Viewed

@@ -31,6 +31,11 @@ import type {
 } from "../../model/canonical-document.ts";
 import type { OpcRelationship } from "./part-manifest.ts";
 import { SCOPE_MARKER_BOOKMARK_PREFIX } from "./parse-scope-markers.ts";
+import {
+  scanBookmarkReferences,
+  isRetainedBookmarkName,
+  type BookmarkReferenceScan,
+} from "./parse-bookmark-references.ts";
 import {
   parseInlineMediaXml,
   type InlineMediaPart,
@@ -253,8 +258,39 @@ export interface ParseMainDocumentOptions {
    *
    * Set to `false` to preserve the pre-strip behavior exactly — every
    * cosmetic marker becomes an `opaque_inline` with its source XML.
+   *
+   * **Phase 2 (Slice B):** the same flag also gates orphan-bookmark
+   * stripping. When enabled, a pre-pass scans the source XML for
+   * bookmark references (`<w:hyperlink w:anchor>` /
+   * `<w:instrText>REF/PAGEREF/NOTEREF/TOC</w:instrText>`); bookmarks
+   * whose name is NOT referenced AND not in
+   * {@link retainedBookmarkNames} drop at the four emission sites.
+   * Aggregate counts surface alongside the cosmetic-marker counts on
+   * `skippedCosmeticMarkerCounts.bookmarkStart` /
+   * `skippedCosmeticMarkerCounts.bookmarkEnd`.
    */
   stripCosmeticMarkers?: boolean;
+  /**
+   * Phase 2 bookmark-strip allowlist. When `stripCosmeticMarkers` is
+   * enabled (any value other than `false`), the parser retains every
+   * bookmark whose name is referenced by a `<w:hyperlink>` /
+   * `<w:instrText>`, plus every name listed here. Use this when the
+   * host depends on a stable host-authored bookmark name (e.g.
+   * `placeholder_party_name`, `signature_block_2`) that the
+   * automatic scan can't infer is load-bearing.
+   *
+   * Default: `[]`. Always-retained regardless of this list:
+   *   - `_Toc*` (when any TOC field exists)
+   *   - any name explicitly cited by a `<w:hyperlink w:anchor>` or
+   *     `<w:instrText>` field instruction
+   *   - `bw:scope:*` (workflow scope markers — converted to
+   *     `scope_marker_*` by `rewriteScopeMarkerBookmarks` BEFORE the
+   *     strip runs; listed here as defense-in-depth)
+   *   - everything (defensive blanket-retain) when the document
+   *     contains a `<w:dataBinding>` element whose xpath could
+   *     reference bookmarks via paths we cannot statically analyze
+   */
+  retainedBookmarkNames?: ReadonlyArray<string>;
 }
 export type ParsedBlockNode =
@@ -717,6 +753,16 @@ export function setActiveParseTelemetryBus(bus: ParseTelemetryBus | undefined):
 interface CosmeticStripContext {
   readonly strip: boolean;
   readonly counts: Record<string, number>;
+  /** Phase 2 — bookmark-reference scan. `null` when strip is off OR
+   *  when the entry point did not run the scan (e.g. an internal call
+   *  with no XML available). */
+  readonly bookmarkScan: BookmarkReferenceScan | null;
+  /** Phase 2 — per-parse set of bookmark IDs that the bookmarkStart
+   *  decision marked as stripped. The bookmarkEnd site keys off this
+   *  set (since `<w:bookmarkEnd>` carries only the id, not the name).
+   *  Bookmarks can span paragraphs, so this MUST be request-scoped, not
+   *  per-paragraph. */
+  readonly strippedBookmarkIds: Set<string>;
 }
 let activeCosmeticStripContext: CosmeticStripContext | null = null;
@@ -730,6 +776,47 @@ function shouldStripCosmeticMarker(): boolean {
   return activeCosmeticStripContext?.strip === true;
 }
+/**
+ * Phase 2 — true when the bookmark with this `name` should be RETAINED
+ * (its name is referenced by a hyperlink/field in the doc, on the
+ * caller-supplied allowlist, a `bw:scope:*` prefix, or covered by the
+ * `_Toc*` blanket-retain when a TOC field exists). When the bookmark
+ * has no name (`""`), retain — only the body-walker's `bkId` path
+ * reaches this helper, and unnamed bookmarks have no consumer that
+ * could reference them by name so retention is a no-op.
+ */
+function shouldRetainBookmark(name: string): boolean {
+  const ctx = activeCosmeticStripContext;
+  if (!ctx || !ctx.strip) return true; // strip disabled — always retain
+  if (!ctx.bookmarkScan) return true;  // scan absent — defensive retain
+  if (name === "") return true;        // no name → no consumer can reference
+  return isRetainedBookmarkName(name, ctx.bookmarkScan);
+}
+/**
+ * Phase 2 — record that the bookmark with this `bookmarkId` was
+ * stripped at its `bookmarkStart` site so the matching `bookmarkEnd`
+ * (which carries only the id, not the name) can find the decision.
+ */
+function noteStrippedBookmarkId(bookmarkId: string): void {
+  if (!activeCosmeticStripContext) return;
+  activeCosmeticStripContext.strippedBookmarkIds.add(bookmarkId);
+}
+/**
+ * Phase 2 — true when this `bookmarkEnd` corresponds to a `bookmarkStart`
+ * that was previously stripped. When the id is unknown (cross-fragment
+ * end without a matching start in the same parse) the answer is `false`
+ * — we keep the end conservatively rather than risk dropping a load-
+ * bearing pair we didn't see start.
+ */
+function shouldStripBookmarkEndId(bookmarkId: string): boolean {
+  if (!activeCosmeticStripContext) return false;
+  if (!activeCosmeticStripContext.strip) return false;
+  if (bookmarkId === "") return false;
+  return activeCosmeticStripContext.strippedBookmarkIds.has(bookmarkId);
+}
 export function parseMainDocumentXml(
   xml: string,
   relationships: readonly OpcRelationship[] = [],
@@ -739,9 +826,14 @@ export function parseMainDocumentXml(
   parseOptions: ParseMainDocumentOptions = {},
 ): ParsedMainDocument {
   activeChartPartLookup = chartPartLookup;
+  const stripEnabled = parseOptions.stripCosmeticMarkers !== false;
   const stripContext: CosmeticStripContext = {
-    strip: parseOptions.stripCosmeticMarkers !== false,
+    strip: stripEnabled,
     counts: Object.create(null) as Record<string, number>,
+    bookmarkScan: stripEnabled
+      ? scanBookmarkReferences(xml, parseOptions.retainedBookmarkNames ?? [])
+      : null,
+    strippedBookmarkIds: new Set<string>(),
   };
   activeCosmeticStripContext = stripContext;
   const bus = activeParseTelemetryBus;
@@ -848,7 +940,13 @@ function parseMainDocumentXmlInner(
   const allBlocks = bodyElement.children
     .filter((node): node is XmlElementNode => node.type === "element")
-    .map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath));
+    .map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath))
+    // Phase 2 (Slice B) — body-level bookmarkStart/End that the strip
+    // dropped come back from `parseBodyChild` as `opaque_block` with
+    // empty `rawXml` (sentinel — see `parseBodyChild` body-level
+    // bookmark cases). Filter them out here so downstream consumers
+    // don't see empty blocks.
+    .filter((block) => !(block.type === "opaque_block" && block.rawXml === ""));
   // The last body-level sectPr is the final section properties (not an intermediate section break).
   // Extract it from the blocks list and store it separately.
@@ -1071,6 +1169,48 @@ function parseBodyChild(
     return parseSectionBreakElement(node, sourceXml);
   }
+  // Body-level <w:bookmarkStart> / <w:bookmarkEnd> — bookmarks that
+  // span across paragraph boundaries land directly under <w:body>
+  // rather than inside a <w:p>. Pre-Phase-2 these flowed through the
+  // `nodeType !== "p"` default below and became opaque_block (the
+  // dominant 184-of-185 opaque source on EU IT Services Agreement
+  // per `enumerate-opaque-fragments`). Phase 2 strip applies the same
+  // RETAIN-vs-STRIP decision here as at sites 1+2 inside paragraphs.
+  if (nodeType === "bookmarkStart") {
+    const bkId = node.attributes["w:id"] ?? node.attributes.id ?? "";
+    const bkName = node.attributes["w:name"] ?? node.attributes.name ?? "";
+    if (
+      shouldStripCosmeticMarker() &&
+      bkId &&
+      bkName &&
+      !shouldRetainBookmark(bkName)
+    ) {
+      noteStrippedCosmeticMarker("bookmarkStart");
+      noteStrippedBookmarkId(bkId);
+      // Return an `opaque_block` with empty `rawXml` as a "stripped"
+      // sentinel: the body walk filters these out right after mapping,
+      // so downstream consumers never see an empty block. (Returning
+      // `null` would change the body-walker's signature; the block-
+      // shaped sentinel preserves the existing iteration contract.)
+      return { type: "opaque_block", rawXml: "" };
+    }
+    return {
+      type: "opaque_block",
+      rawXml: sourceXml.slice(node.start, node.end),
+    };
+  }
+  if (nodeType === "bookmarkEnd") {
+    const bkEndId = node.attributes["w:id"] ?? node.attributes.id ?? "";
+    if (shouldStripBookmarkEndId(bkEndId)) {
+      noteStrippedCosmeticMarker("bookmarkEnd");
+      return { type: "opaque_block", rawXml: "" };
+    }
+    return {
+      type: "opaque_block",
+      rawXml: sourceXml.slice(node.start, node.end),
+    };
+  }
   if (nodeType !== "p") {
     return {
       type: "opaque_block",
@@ -1186,6 +1326,18 @@ function parseBodyChild(
         const bkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
         const bkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
         if (bkId) {
+          // Phase 2 (Slice B) — orphan-bookmark strip. When the strip
+          // is on AND the name is not load-bearing per the reference
+          // scan, drop the bookmark. The bookmarkEnd site finds the
+          // matching id via `shouldStripBookmarkEndId`.
+          if (bkName && !shouldRetainBookmark(bkName)) {
+            noteStrippedCosmeticMarker("bookmarkStart");
+            noteStrippedBookmarkId(bkId);
+            flushActiveComplexField(children, () => {
+              activeComplexField = null;
+            }, activeComplexField);
+            break;
+          }
           const bookmarkNode = {
             type: "bookmark_start",
             bookmarkId: bkId,
@@ -1207,6 +1359,15 @@ function parseBodyChild(
       case "bookmarkEnd": {
         const bkEndId = child.attributes["w:id"] ?? child.attributes.id ?? "";
         if (bkEndId) {
+          // Phase 2 — strip the matching bookmarkEnd if its bookmarkStart
+          // was previously stripped (id-keyed pairing across the parse).
+          if (shouldStripBookmarkEndId(bkEndId)) {
+            noteStrippedCosmeticMarker("bookmarkEnd");
+            flushActiveComplexField(children, () => {
+              activeComplexField = null;
+            }, activeComplexField);
+            break;
+          }
           const bookmarkNode = {
             type: "bookmark_end",
             bookmarkId: bkEndId,
@@ -2755,14 +2916,53 @@ function parseRevisionContainer(
         ];
       case "commentRangeStart":
       case "commentRangeEnd":
-      case "bookmarkStart":
-      case "bookmarkEnd":
         return [
           {
             type: "opaque_inline",
             rawXml: sourceXml.slice(node.start, node.end),
           },
         ];
+      case "bookmarkStart": {
+        // Site 3 (Slice B) — nested-context bookmarkStart. Pre-Slice-B
+        // ALL nested-context bookmarks fell through to opaque_inline
+        // (the 216-opaque source on CCEP). When the strip is on AND the
+        // name isn't load-bearing, drop the start + record the id so
+        // the matching end (Site 4) drops too.
+        const nestedBkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
+        const nestedBkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
+        if (
+          shouldStripCosmeticMarker() &&
+          nestedBkId &&
+          nestedBkName &&
+          !shouldRetainBookmark(nestedBkName)
+        ) {
+          noteStrippedCosmeticMarker("bookmarkStart");
+          noteStrippedBookmarkId(nestedBkId);
+          break;
+        }
+        return [
+          {
+            type: "opaque_inline",
+            rawXml: sourceXml.slice(node.start, node.end),
+          },
+        ];
+      }
+      case "bookmarkEnd": {
+        // Site 4 (Slice B) — nested-context bookmarkEnd. Strip iff its
+        // matching start (any site) was previously stripped.
+        const nestedBkEndId =
+          child.attributes["w:id"] ?? child.attributes.id ?? "";
+        if (shouldStripBookmarkEndId(nestedBkEndId)) {
+          noteStrippedCosmeticMarker("bookmarkEnd");
+          break;
+        }
+        return [
+          {
+            type: "opaque_inline",
+            rawXml: sourceXml.slice(node.start, node.end),
+          },
+        ];
+      }
       case "permStart":
         result.push(parsePermStartNode(child, sourceXml));
         break;
@@ -2817,6 +3017,18 @@ function parseHyperlink(
   const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
+  // Slice A — local field-bracket state machine. CCEP TOC hyperlinks
+  // wrap their entries with a `PAGEREF` field whose runs carry
+  // `<w:fldChar w:fldCharType="begin|separate|end"/>` markers and a
+  // `<w:instrText>` instruction. Pre-strip behavior bailed the whole
+  // hyperlink to opaque_inline because `parseRunContentOnly` returned
+  // `supported: false` on those markers (~48 CCEP opaques). The local
+  // bracketMode is independent of the body-walker's `activeComplexField`:
+  // hyperlink children are a self-contained sub-walk that doesn't
+  // escape the hyperlink scope.
+  type FieldBracketMode = "outside" | "instruction" | "result";
+  let bracketMode: FieldBracketMode = "outside";
   for (const child of node.children) {
     if (child.type !== "element") {
       continue;
@@ -2829,9 +3041,29 @@ function parseHyperlink(
       };
     }
+    // Pre-scan for fldChar bracket transitions. Per OOXML grammar, a
+    // run carrying a fldChar carries no other displayable content;
+    // skip the whole run after updating bracketMode.
+    const fldChar = child.children.find(
+      (c): c is XmlElementNode =>
+        c.type === "element" && localName(c.name) === "fldChar",
+    );
+    if (fldChar) {
+      const fldType =
+        fldChar.attributes["w:fldCharType"] ?? fldChar.attributes.fldCharType;
+      if (fldType === "begin") bracketMode = "instruction";
+      else if (fldType === "separate") bracketMode = "result";
+      else if (fldType === "end") bracketMode = "outside";
+      continue;
+    }
     const run = parseRunContentOnly(child, sourceXml, {
       allowDeletedText: options.allowDeletedText,
       preserveUnsupportedReviewMarkup: options.preserveUnsupportedReviewMarkup,
+      // Tolerate `<w:instrText>` siblings inside hyperlink runs — the
+      // bracket-state machine above takes care of dropping them via
+      // `bracketMode === "instruction"` below.
+      allowFieldMarkers: true,
     });
     if (!run.supported) {
       return {
@@ -2840,6 +3072,8 @@ function parseHyperlink(
       };
     }
+    // Drop nodes during the field-instruction segment; keep result + outside.
+    if (bracketMode === "instruction") continue;
     children.push(...run.nodes);
   }
@@ -2857,6 +3091,16 @@ function parseRunContentOnly(
   options: {
     allowDeletedText?: boolean;
     preserveUnsupportedReviewMarkup?: boolean;
+    /**
+     * Slice A — gracefully skip `<w:fldChar>` / `<w:instrText>` children
+     * instead of bailing to `supported: false`. The hyperlink path
+     * (`parseHyperlink`) opts in so TOC `PAGEREF` field markers inside
+     * hyperlink runs no longer trip exit-B. The body-walker callers do
+     * NOT opt in — they have their own `activeComplexField` state machine
+     * that handles these markers semantically and bailing here is
+     * load-bearing for that machine to see the markers in `parseRun`.
+     */
+    allowFieldMarkers?: boolean;
   } = {},
 ): RunParseResult {
   const marksResult = readRunMarks(node, _sourceXml);
@@ -2940,6 +3184,14 @@ function parseRunContentOnly(
         }
         break;
       }
+      case "fldChar":
+      case "instrText":
+        // Slice A — graceful skip when caller opts in (hyperlink path).
+        // Otherwise fall through to default and bail (body-walker path,
+        // which uses its own activeComplexField state machine in
+        // parseRun to handle these markers).
+        if (options.allowFieldMarkers) break;
+        return { nodes: [], supported: false };
       default:
         return { nodes: [], supported: false };
     }

package/src/io/ooxml/parse-shapes.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  * preserved in the canonical node's rawXml field for lossless round-trip export.
  */
-import type { ShapeContent } from "../../model/canonical-document.ts";
+import type { BlockNode, ShapeContent } from "../../model/canonical-document.ts";
 import { parseFill } from "./parse-fill.ts";
 import {
   type XmlElementNode,
@@ -32,6 +32,15 @@ export interface ParsedWpsShape {
   text?: string;
   /** Raw txbxContent XML for structured re-rendering. */
   txbxContentXml?: string;
+  /**
+   * Parsed block-level structure from `w:txbxContent`, populated when a
+   * `blockParser` callback is supplied (coord-02 §14 / coord-11 §22 —
+   * headers/footers need access to shape-inside text like the CCEP
+   * "Copyright CCEP STRICTLY CONFIDENTIAL" red band, which lives in
+   * shape-textbox paragraphs). Same shape + semantics as
+   * `ShapeContent.txbxBlocks` on the drawing-frame path.
+   */
+  txbxBlocks?: ReadonlyArray<BlockNode>;
   /** DrawML geometry preset, e.g. "rect", "roundRect". */
   geometry?: string;
   /** Original drawing XML for lossless round-trip export. */
@@ -65,7 +74,10 @@ export type ParsedShape = ParsedWpsShape | ParsedWordArt | ParsedVmlShape;
  *
  * Returns null if the drawing does not contain a WPS shape.
  */
-export function parseShapeXml(drawingXml: string): ParsedWpsShape | ParsedWordArt | null {
+export function parseShapeXml(
+  drawingXml: string,
+  blockParser?: TxbxBlockParser,
+): ParsedWpsShape | ParsedWordArt | null {
   const root = parseXml(drawingXml);
   const graphicData = findFirstDescendant(root, "graphicData");
   if (!graphicData) return null;
@@ -104,11 +116,37 @@ export function parseShapeXml(drawingXml: string): ParsedWpsShape | ParsedWordAr
   // Extract raw txbxContent XML for structured re-rendering of text boxes
   const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
+  // Coord-02 §14 / coord-11 §22 follow-up (2026-04-24): when a
+  // blockParser is supplied, recurse into the txbxContent to produce a
+  // structured block representation. Without this, shape-textbox
+  // content (CCEP "Copyright CCEP STRICTLY CONFIDENTIAL" footer band)
+  // is reachable only via the `.text` summary string — L03 cascade +
+  // L11 render can't walk runs/marks.
+  let txbxBlocks: ReadonlyArray<BlockNode> | undefined;
+  if (txbxContentXml && blockParser) {
+    try {
+      // The `blockParser` callback is supplied by parse-main-document.ts
+      // as a thin wrapper over `parseBlockStreamFromXml`. That function
+      // returns `ParsedBlockNode[]` — structurally identical to canonical
+      // `BlockNode[]` at runtime for shape-textbox content (verified on
+      // CCEP SOW footer fixture 2026-04-24: paragraph + text + TextMark
+      // shapes land end-to-end with zero `ParsedBlockNode`-only fields
+      // surfaced). The cast is safe here because the runtime output IS
+      // canonical; a structural `as unknown as BlockNode[]` preserves
+      // type safety at every consumer site (L03 cascade, L11 render,
+      // validator walk).
+      txbxBlocks = blockParser(txbxContentXml) as unknown as ReadonlyArray<BlockNode>;
+    } catch {
+      txbxBlocks = undefined;
+    }
+  }
   return {
     type: "shape",
     ...(isTextBox ? { isTextBox: true } : {}),
     ...(text ? { text } : {}),
     ...(txbxContentXml ? { txbxContentXml } : {}),
+    ...(txbxBlocks && txbxBlocks.length > 0 ? { txbxBlocks } : {}),
     ...(prst ? { geometry: prst } : {}),
     rawXml: drawingXml,
   };
@@ -186,6 +224,17 @@ function extractAllText(node: XmlElementNode): string {
 // txbxContentXml, optional recursive txbxBlocks).
 // ───────────────────────────────────────────────────────────────────────────
+/**
+ * Callback signature for the txbx-content block parser supplied by
+ * parse-main-document.ts / parse-headers-footers.ts. The actual
+ * implementation wraps `parseBlockStreamFromXml` which returns
+ * `ParsedBlockNode[]`; its runtime output is canonical `BlockNode[]`
+ * for shape-textbox content (no `ParsedBlockNode`-only fields surface
+ * at the shape boundary — verified on CCEP SOW footer fixture
+ * 2026-04-24). The structural `unknown` return keeps the parse layer
+ * layer-pure; `parseShapeContent` + `parseShapeXml` cast to canonical
+ * `BlockNode[]` at the assembly seam.
+ */
 export type TxbxBlockParser = (xml: string) => ReadonlyArray<{ type: string; [key: string]: unknown }>;
 export function parseShapeContent(
@@ -212,10 +261,15 @@ export function parseShapeContent(
   const txbxContent = txbx ? findFirstDescendant(txbx, "txbxContent") : undefined;
   const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
-  let txbxBlocks: ReadonlyArray<{ type: string; [key: string]: unknown }> | undefined;
+  let txbxBlocks: ReadonlyArray<BlockNode> | undefined;
   if (txbxContentXml && blockParser) {
     try {
-      txbxBlocks = blockParser(txbxContentXml);
+      // See `TxbxBlockParser` doc above: runtime output is canonical
+      // `BlockNode[]` for shape-textbox content (verified on CCEP SOW
+      // footer fixture 2026-04-24). Cast at the assembly seam so
+      // downstream consumers (L03, L11, validator) get canonical types
+      // without local `as unknown` ceremony.
+      txbxBlocks = blockParser(txbxContentXml) as unknown as ReadonlyArray<BlockNode>;
     } catch {
       // Preserve-only fallback: keep txbxContentXml for serialization; leave
       // txbxBlocks undefined so consumers know recursion did not succeed.

package/src/io/ooxml/table-opaque-preservation.ts CHANGED Viewed

@@ -55,8 +55,21 @@ import { classifyFieldInstruction } from "./parse-fields.ts";
 /**
  * Field families safe enough to leave a `<w:tbl>` in structured
- * canonical form. Widening this set commits L06 / L08 to cell-level
- * edit semantics for that family — don't expand opportunistically.
+ * canonical form.
+ *
+ * The principle: the body-direct paragraph parser accepts every
+ * field family and emits a typed `FieldInlineNode` — classified
+ * families get a refresh slot, preserve-only families round-trip
+ * via the shared `FieldInlineNode` shape. Cells inside a `<w:tbl>`
+ * run through the same parser + serializer. Flattening the whole
+ * table to `opaque_block` because one cell carries a field is
+ * over-conservative: preserve-only fields round-trip identically
+ * whether they sit in a body paragraph or a table cell.
+ *
+ * This set is consulted AFTER `isWellFormedFieldInstruction` — for
+ * the rare case where a field instruction doesn't start with a
+ * recognizable OOXML family identifier, we still accept it iff the
+ * classifier happened to recognize it.
  */
 export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
   "REF",
@@ -65,6 +78,18 @@ export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
   "TOC",
   "PAGE",
   "NUMPAGES",
+  "STYLEREF",
+  "SECTIONPAGES",
+  "DATE",
+  "TIME",
+  "AUTHOR",
+  "FILENAME",
+  "MERGEFIELD",
+  "IF",
+  "SEQ",
+  "INDEX",
+  "TC",
+  "FORMULA",
 ]);
 /**
@@ -127,23 +152,63 @@ export function extractComplexFieldInstructionsFromRaw(rawXml: string): string[]
  * `FORMDROPDOWN`. These are fully supported by the body-direct
  * paragraph parser via `parseFFDataFromFldChar` but classify as
  * `UNKNOWN` under `FIELD_FAMILY_PATTERN` (which targets data-field
- * families like REF / TOC / MERGEFIELD). Short-circuiting them lets
- * form-field cells stay in structured canonical tables instead of
- * flattening the entire table to `opaque_block`. Coord-01 §11, 2026-04-24.
+ * families like REF / TOC / MERGEFIELD). Kept as a named helper for
+ * readability; `isWellFormedFieldInstruction` would also accept them
+ * via the generic identifier pattern, but the named check documents
+ * the carve-out's origin (coord-01 §11, 2026-04-24).
  */
 export function isLegacyFormFieldInstruction(instruction: string): boolean {
   return /^\s*(FORMTEXT|FORMCHECKBOX|FORMDROPDOWN)\b/i.test(instruction);
 }
+/**
+ * Matches any well-formed OOXML field instruction. OOXML field
+ * instructions (ECMA-376 §17.16) begin with an ALL-CAPS family name
+ * — `REF`, `PAGE`, `TOC`, `MERGEFIELD`, `DOCPROPERTY`,
+ * `LISTNUM`, and so on through the full §17.16 catalog (60+
+ * families). Pattern-matching the family-name prefix lets us
+ * recognize every standard field shape WITHOUT adding each one to
+ * the L02 `PreserveOnlyFieldFamily` union (which would require a
+ * cross-lane slice) OR to `FIELD_FAMILY_PATTERN` (which expands
+ * classification-time behavior).
+ *
+ * The narrow `SAFE_TABLE_FIELD_FAMILIES` above is consulted as a
+ * fallback for the rare case of a field instruction that doesn't
+ * match the ALL-CAPS family-name prefix but DOES classify to a known
+ * family (e.g. a lower-case family name — the pattern tolerates
+ * leading whitespace but is case-sensitive).
+ *
+ * Rejection remains for:
+ *  - Instructions that don't start with a family-name shape — these
+ *    could be garbled / truncated / mid-field text; flattening the
+ *    table is the safe preservation path.
+ *  - Revision markup anywhere in the table (tracked changes —
+ *    unaffected by this check; handled by `RISKY_TABLE_MARKUP_RE`
+ *    below).
+ */
+const WELL_FORMED_FIELD_INSTRUCTION_RE = /^\s*[A-Z][A-Z0-9_]*\b/;
+export function isWellFormedFieldInstruction(instruction: string): boolean {
+  return WELL_FORMED_FIELD_INSTRUCTION_RE.test(instruction);
+}
 /**
  * Decides whether a single field instruction (either `w:instr`
  * attribute value or concatenated `instrText` run) is safe for
  * structured-table parsing. Used by the shared predicate below;
  * exposed for direct callers (the debug diagnostics script runs
  * this to classify source instructions alongside the canonical).
+ *
+ * Order matters: the well-formed prefix check covers every standard
+ * OOXML field family in one pass; the classifier fallback catches
+ * edge cases where `FIELD_FAMILY_PATTERN` happens to match but the
+ * prefix shape doesn't (unlikely, but the fallback keeps behavior
+ * consistent with classification for any instruction the classifier
+ * recognizes).
  */
 export function isSafeTableFieldInstruction(instruction: string): boolean {
   if (isLegacyFormFieldInstruction(instruction)) return true;
+  if (isWellFormedFieldInstruction(instruction)) return true;
   const family = classifyFieldInstruction(instruction).family;
   return SAFE_TABLE_FIELD_FAMILIES.has(family);
 }

package/src/model/canonical-document.ts CHANGED Viewed

@@ -1786,12 +1786,29 @@ export interface SmartArtPreviewNode {
 /**
  * Read-only rendering of a wps:wsp WordprocessingShape. Text content is
  * extracted for display. The original drawing XML is preserved in rawXml.
+ *
+ * When the shape is a text-box (`isTextBox: true`), the raw textbox XML
+ * is preserved in `txbxContentXml` for lossless round-trip, and the
+ * parsed block structure lands in `txbxBlocks` — canonical `BlockNode[]`
+ * with styles already resolved (coord-02 §14 / coord-11 §22 closed L01
+ * side 2026-04-24 in `7d87f1189`; L02 type-promoted 2026-04-24 once the
+ * runtime contract was confirmed canonical).
  */
 export interface ShapeNode {
   type: "shape";
   text?: string;
   geometry?: string;
   isTextBox?: boolean;
+  /** Raw `<w:txbxContent>` XML, preserved for serialization + round-trip. */
+  txbxContentXml?: string;
+  /**
+   * Parsed canonical block-level structure from `<w:txbxContent>`,
+   * populated when the parse path supplies a `blockParser` callback
+   * (headers/footers via `src/io/ooxml/parse-headers-footers.ts`;
+   * body via `src/io/ooxml/parse-main-document.ts`). Shape + semantics
+   * identical to `ShapeContent.txbxBlocks` on the drawing-frame path.
+   */
+  txbxBlocks?: ReadonlyArray<BlockNode>;
   rawXml: string;
 }
@@ -1971,14 +1988,16 @@ export interface ShapeContent {
    * Parsed block-level structure from `w:txbxContent`, populated when a
    * `blockParser` callback is supplied during parse (CO4 F3.3).
    *
-   * Type is deliberately structural (`{ type: string; ... }`) rather than
-   * canonical `BlockNode[]` because the recursion stops at the parse layer
-   * before the style + numbering normalization pass that converts
-   * `ParsedBlockNode` → canonical `BlockNode`. Consumers that need the fully
-   * normalized form run normalization on this subtree explicitly. Testing
-   * that `txbxBlocks.length > 0` proves the recursion executed.
+   * Canonical `BlockNode[]` — the parse path produces fully-normalized
+   * blocks (styles resolved, marks attached, no `ParsedBlockNode`-only
+   * fields at runtime). Verified on the CCEP SOW footer fixture 2026-04-24:
+   * paragraph + text + `TextMark` shapes land end-to-end. Type promoted
+   * 2026-04-24 from the earlier weakly-typed escape hatch once the L01
+   * shape-textbox parse (commit `7d87f1189`) confirmed the runtime
+   * contract — unblocks L03 cascade + L11 render walking `txbxBlocks`
+   * without `as unknown as BlockNode[]` casts at the consumer site.
    */
-  txbxBlocks?: ReadonlyArray<{ type: string; [key: string]: unknown }>;
+  txbxBlocks?: ReadonlyArray<BlockNode>;
   rawXml: string;
 }
@@ -2860,11 +2879,29 @@ function validateDocumentNode(
       return;
     case "chart_preview":
     case "smartart_preview":
-    case "shape":
     case "wordart":
     case "vml_shape":
       expectString(record.rawXml, `${path}.rawXml`, issues);
       return;
+    case "shape":
+      expectString(record.rawXml, `${path}.rawXml`, issues);
+      if (record.txbxBlocks !== undefined) {
+        if (!Array.isArray(record.txbxBlocks)) {
+          issues.push({
+            path: `${path}.txbxBlocks`,
+            message: "shape.txbxBlocks must be an array when present.",
+          });
+        } else {
+          // coord-02 §14 follow-up (2026-04-24): `ShapeNode.txbxBlocks`
+          // is canonical `BlockNode[]`. Walk it with the same validator
+          // used for top-level document content so run marks / paragraph
+          // structure / nested shapes all enforce the normal rules.
+          record.txbxBlocks.forEach((child, index) => {
+            validateDocumentNode(child, `${path}.txbxBlocks[${index}]`, issues);
+          });
+        }
+      }
+      return;
     case "drawing_frame": {
       const anchor = asPlainObject(record.anchor, `${path}.anchor`, issues);
       const content = asPlainObject(record.content, `${path}.content`, issues);