@beyondwork/docx-react-component 1.0.76 → 1.0.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,11 @@ import type {
31
31
  } from "../../model/canonical-document.ts";
32
32
  import type { OpcRelationship } from "./part-manifest.ts";
33
33
  import { SCOPE_MARKER_BOOKMARK_PREFIX } from "./parse-scope-markers.ts";
34
+ import {
35
+ scanBookmarkReferences,
36
+ isRetainedBookmarkName,
37
+ type BookmarkReferenceScan,
38
+ } from "./parse-bookmark-references.ts";
34
39
  import {
35
40
  parseInlineMediaXml,
36
41
  type InlineMediaPart,
@@ -253,8 +258,39 @@ export interface ParseMainDocumentOptions {
253
258
  *
254
259
  * Set to `false` to preserve the pre-strip behavior exactly — every
255
260
  * cosmetic marker becomes an `opaque_inline` with its source XML.
261
+ *
262
+ * **Phase 2 (Slice B):** the same flag also gates orphan-bookmark
263
+ * stripping. When enabled, a pre-pass scans the source XML for
264
+ * bookmark references (`<w:hyperlink w:anchor>` /
265
+ * `<w:instrText>REF/PAGEREF/NOTEREF/TOC</w:instrText>`); bookmarks
266
+ * whose name is NOT referenced AND not in
267
+ * {@link retainedBookmarkNames} drop at the bookmark emission sites (body-level, in-paragraph, nested-context).
268
+ * Aggregate counts surface alongside the cosmetic-marker counts on
269
+ * `skippedCosmeticMarkerCounts.bookmarkStart` /
270
+ * `skippedCosmeticMarkerCounts.bookmarkEnd`.
256
271
  */
257
272
  stripCosmeticMarkers?: boolean;
273
+ /**
274
+ * Phase 2 bookmark-strip allowlist. Whenever `stripCosmeticMarkers` is
274
+ * not `false`, the parser's reference scan retains every bookmark whose
275
+ * name is referenced by a `<w:hyperlink>` / `<w:instrText>`, plus any name
276
+ * listed here. Use this when the host depends on a stable host-
278
+ * authored bookmark name (e.g. `placeholder_party_name`,
279
+ * `signature_block_2`) that the automatic scan can't infer is
280
+ * load-bearing.
281
+ *
282
+ * Default: `[]`. Always-retained regardless of this list:
283
+ * - `_Toc*` (when any TOC field exists)
284
+ * - any name explicitly cited by a `<w:hyperlink w:anchor>` or
285
+ * `<w:instrText>` field instruction
286
+ * - `bw:scope:*` (workflow scope markers — converted to
287
+ * `scope_marker_*` by `rewriteScopeMarkerBookmarks` BEFORE the
288
+ * strip runs; listed here as defense-in-depth)
289
+ * - everything (defensive blanket-retain) when the document
290
+ * contains a `<w:dataBinding>` element whose xpath could
291
+ * reference bookmarks via paths we cannot statically analyze
292
+ */
293
+ retainedBookmarkNames?: ReadonlyArray<string>;
258
294
  }
259
295
 
260
296
  export type ParsedBlockNode =
@@ -717,6 +753,16 @@ export function setActiveParseTelemetryBus(bus: ParseTelemetryBus | undefined):
717
753
  interface CosmeticStripContext {
718
754
  readonly strip: boolean;
719
755
  readonly counts: Record<string, number>;
756
+ /** Phase 2 — bookmark-reference scan. `null` when strip is off OR
757
+ * when the entry point did not run the scan (e.g. an internal call
758
+ * with no XML available). */
759
+ readonly bookmarkScan: BookmarkReferenceScan | null;
760
+ /** Phase 2 — per-parse set of bookmark IDs that the bookmarkStart
761
+ * decision marked as stripped. The bookmarkEnd site keys off this
762
+ * set (since `<w:bookmarkEnd>` carries only the id, not the name).
763
+ * Bookmarks can span paragraphs, so this MUST be request-scoped, not
764
+ * per-paragraph. */
765
+ readonly strippedBookmarkIds: Set<string>;
720
766
  }
721
767
  let activeCosmeticStripContext: CosmeticStripContext | null = null;
722
768
 
@@ -730,6 +776,47 @@ function shouldStripCosmeticMarker(): boolean {
730
776
  return activeCosmeticStripContext?.strip === true;
731
777
  }
732
778
 
779
+ /**
780
+ * Phase 2 — true when the bookmark with this `name` should be RETAINED
781
+ * (its name is referenced by a hyperlink/field in the doc, on the
782
+ * caller-supplied allowlist, a `bw:scope:*` prefix, or covered by the
783
+ * `_Toc*` blanket-retain when a TOC field exists). When the bookmark
784
+ * has no name (`""`), retain — the visible call sites already skip
785
+ * unnamed bookmarks before calling this helper, and no consumer can
786
+ * reference an unnamed bookmark by name, so this branch is defensive.
787
+ */
788
+ function shouldRetainBookmark(name: string): boolean {
789
+ const ctx = activeCosmeticStripContext;
790
+ if (!ctx || !ctx.strip) return true; // strip disabled — always retain
791
+ if (!ctx.bookmarkScan) return true; // scan absent — defensive retain
792
+ if (name === "") return true; // no name → no consumer can reference
793
+ return isRetainedBookmarkName(name, ctx.bookmarkScan);
794
+ }
795
+
796
+ /**
797
+ * Phase 2 — record that the bookmark with this `bookmarkId` was
798
+ * stripped at its `bookmarkStart` site so the matching `bookmarkEnd`
799
+ * (which carries only the id, not the name) can find the decision.
800
+ */
801
+ function noteStrippedBookmarkId(bookmarkId: string): void {
802
+ if (!activeCosmeticStripContext) return;
803
+ activeCosmeticStripContext.strippedBookmarkIds.add(bookmarkId);
804
+ }
805
+
806
+ /**
807
+ * Phase 2 — true when this `bookmarkEnd` corresponds to a `bookmarkStart`
808
+ * that was previously stripped. When the id is unknown (cross-fragment
809
+ * end without a matching start in the same parse) the answer is `false`
810
+ * — we keep the end conservatively rather than risk dropping a load-
811
+ * bearing pair we didn't see start.
812
+ */
813
+ function shouldStripBookmarkEndId(bookmarkId: string): boolean {
814
+ if (!activeCosmeticStripContext) return false;
815
+ if (!activeCosmeticStripContext.strip) return false;
816
+ if (bookmarkId === "") return false;
817
+ return activeCosmeticStripContext.strippedBookmarkIds.has(bookmarkId);
818
+ }
819
+
733
820
  export function parseMainDocumentXml(
734
821
  xml: string,
735
822
  relationships: readonly OpcRelationship[] = [],
@@ -739,9 +826,14 @@ export function parseMainDocumentXml(
739
826
  parseOptions: ParseMainDocumentOptions = {},
740
827
  ): ParsedMainDocument {
741
828
  activeChartPartLookup = chartPartLookup;
829
+ const stripEnabled = parseOptions.stripCosmeticMarkers !== false;
742
830
  const stripContext: CosmeticStripContext = {
743
- strip: parseOptions.stripCosmeticMarkers !== false,
831
+ strip: stripEnabled,
744
832
  counts: Object.create(null) as Record<string, number>,
833
+ bookmarkScan: stripEnabled
834
+ ? scanBookmarkReferences(xml, parseOptions.retainedBookmarkNames ?? [])
835
+ : null,
836
+ strippedBookmarkIds: new Set<string>(),
745
837
  };
746
838
  activeCosmeticStripContext = stripContext;
747
839
  const bus = activeParseTelemetryBus;
@@ -848,7 +940,13 @@ function parseMainDocumentXmlInner(
848
940
 
849
941
  const allBlocks = bodyElement.children
850
942
  .filter((node): node is XmlElementNode => node.type === "element")
851
- .map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath));
943
+ .map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath))
944
+ // Phase 2 (Slice B) — body-level bookmarkStart/End that the strip
945
+ // dropped come back from `parseBodyChild` as `opaque_block` with
946
+ // empty `rawXml` (sentinel — see `parseBodyChild` body-level
947
+ // bookmark cases). Filter them out here so downstream consumers
948
+ // don't see empty blocks.
949
+ .filter((block) => !(block.type === "opaque_block" && block.rawXml === ""));
852
950
 
853
951
  // The last body-level sectPr is the final section properties (not an intermediate section break).
854
952
  // Extract it from the blocks list and store it separately.
@@ -1071,6 +1169,48 @@ function parseBodyChild(
1071
1169
  return parseSectionBreakElement(node, sourceXml);
1072
1170
  }
1073
1171
 
1172
+ // Body-level <w:bookmarkStart> / <w:bookmarkEnd> — bookmarks that
1173
+ // span across paragraph boundaries land directly under <w:body>
1174
+ // rather than inside a <w:p>. Pre-Phase-2 these flowed through the
1175
+ // `nodeType !== "p"` default below and became opaque_block (the
1176
+ // dominant 184-of-185 opaque source on EU IT Services Agreement
1177
+ // per `enumerate-opaque-fragments`). Phase 2 strip applies the same
1178
+ // RETAIN-vs-STRIP decision here as at sites 1+2 inside paragraphs.
1179
+ if (nodeType === "bookmarkStart") {
1180
+ const bkId = node.attributes["w:id"] ?? node.attributes.id ?? "";
1181
+ const bkName = node.attributes["w:name"] ?? node.attributes.name ?? "";
1182
+ if (
1183
+ shouldStripCosmeticMarker() &&
1184
+ bkId &&
1185
+ bkName &&
1186
+ !shouldRetainBookmark(bkName)
1187
+ ) {
1188
+ noteStrippedCosmeticMarker("bookmarkStart");
1189
+ noteStrippedBookmarkId(bkId);
1190
+ // Returning an `opaque_block` with empty `rawXml` acts as a strip
1191
+ // sentinel: the body-level block collection filters such blocks out,
1192
+ // so downstream consumers never see them. (Returning `null`
1193
+ // would change the body-walker's signature; the block-shaped
1194
+ // sentinel preserves the existing iteration contract.)
1195
+ return { type: "opaque_block", rawXml: "" };
1196
+ }
1197
+ return {
1198
+ type: "opaque_block",
1199
+ rawXml: sourceXml.slice(node.start, node.end),
1200
+ };
1201
+ }
1202
+ if (nodeType === "bookmarkEnd") {
1203
+ const bkEndId = node.attributes["w:id"] ?? node.attributes.id ?? "";
1204
+ if (shouldStripBookmarkEndId(bkEndId)) {
1205
+ noteStrippedCosmeticMarker("bookmarkEnd");
1206
+ return { type: "opaque_block", rawXml: "" };
1207
+ }
1208
+ return {
1209
+ type: "opaque_block",
1210
+ rawXml: sourceXml.slice(node.start, node.end),
1211
+ };
1212
+ }
1213
+
1074
1214
  if (nodeType !== "p") {
1075
1215
  return {
1076
1216
  type: "opaque_block",
@@ -1186,6 +1326,18 @@ function parseBodyChild(
1186
1326
  const bkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
1187
1327
  const bkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
1188
1328
  if (bkId) {
1329
+ // Phase 2 (Slice B) — orphan-bookmark strip. When the strip
1330
+ // is on AND the name is not load-bearing per the reference
1331
+ // scan, drop the bookmark. The bookmarkEnd site finds the
1332
+ // matching id via `shouldStripBookmarkEndId`.
1333
+ if (bkName && !shouldRetainBookmark(bkName)) {
1334
+ noteStrippedCosmeticMarker("bookmarkStart");
1335
+ noteStrippedBookmarkId(bkId);
1336
+ flushActiveComplexField(children, () => {
1337
+ activeComplexField = null;
1338
+ }, activeComplexField);
1339
+ break;
1340
+ }
1189
1341
  const bookmarkNode = {
1190
1342
  type: "bookmark_start",
1191
1343
  bookmarkId: bkId,
@@ -1207,6 +1359,15 @@ function parseBodyChild(
1207
1359
  case "bookmarkEnd": {
1208
1360
  const bkEndId = child.attributes["w:id"] ?? child.attributes.id ?? "";
1209
1361
  if (bkEndId) {
1362
+ // Phase 2 — strip the matching bookmarkEnd if its bookmarkStart
1363
+ // was previously stripped (id-keyed pairing across the parse).
1364
+ if (shouldStripBookmarkEndId(bkEndId)) {
1365
+ noteStrippedCosmeticMarker("bookmarkEnd");
1366
+ flushActiveComplexField(children, () => {
1367
+ activeComplexField = null;
1368
+ }, activeComplexField);
1369
+ break;
1370
+ }
1210
1371
  const bookmarkNode = {
1211
1372
  type: "bookmark_end",
1212
1373
  bookmarkId: bkEndId,
@@ -2755,14 +2916,53 @@ function parseRevisionContainer(
2755
2916
  ];
2756
2917
  case "commentRangeStart":
2757
2918
  case "commentRangeEnd":
2758
- case "bookmarkStart":
2759
- case "bookmarkEnd":
2760
2919
  return [
2761
2920
  {
2762
2921
  type: "opaque_inline",
2763
2922
  rawXml: sourceXml.slice(node.start, node.end),
2764
2923
  },
2765
2924
  ];
2925
+ case "bookmarkStart": {
2926
+ // Site 3 (Slice B) — nested-context bookmarkStart. Pre-Slice-B
2927
+ // ALL nested-context bookmarks fell through to opaque_inline
2928
+ // (the 216-opaque source on CCEP). When the strip is on AND the
2929
+ // name isn't load-bearing, drop the start + record the id so
2930
+ // the matching end (Site 4) drops too.
2931
+ const nestedBkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
2932
+ const nestedBkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
2933
+ if (
2934
+ shouldStripCosmeticMarker() &&
2935
+ nestedBkId &&
2936
+ nestedBkName &&
2937
+ !shouldRetainBookmark(nestedBkName)
2938
+ ) {
2939
+ noteStrippedCosmeticMarker("bookmarkStart");
2940
+ noteStrippedBookmarkId(nestedBkId);
2941
+ break;
2942
+ }
2943
+ return [
2944
+ {
2945
+ type: "opaque_inline",
2946
+ rawXml: sourceXml.slice(node.start, node.end),
2947
+ },
2948
+ ];
2949
+ }
2950
+ case "bookmarkEnd": {
2951
+ // Site 4 (Slice B) — nested-context bookmarkEnd. Strip iff its
2952
+ // matching start (any site) was previously stripped.
2953
+ const nestedBkEndId =
2954
+ child.attributes["w:id"] ?? child.attributes.id ?? "";
2955
+ if (shouldStripBookmarkEndId(nestedBkEndId)) {
2956
+ noteStrippedCosmeticMarker("bookmarkEnd");
2957
+ break;
2958
+ }
2959
+ return [
2960
+ {
2961
+ type: "opaque_inline",
2962
+ rawXml: sourceXml.slice(node.start, node.end),
2963
+ },
2964
+ ];
2965
+ }
2766
2966
  case "permStart":
2767
2967
  result.push(parsePermStartNode(child, sourceXml));
2768
2968
  break;
@@ -2817,6 +3017,18 @@ function parseHyperlink(
2817
3017
 
2818
3018
  const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
2819
3019
 
3020
+ // Slice A — local field-bracket state machine. CCEP TOC hyperlinks
3021
+ // wrap their entries with a `PAGEREF` field whose runs carry
3022
+ // `<w:fldChar w:fldCharType="begin|separate|end"/>` markers and a
3023
+ // `<w:instrText>` instruction. Pre-strip behavior bailed the whole
3024
+ // hyperlink to opaque_inline because `parseRunContentOnly` returned
3025
+ // `supported: false` on those markers (~48 CCEP opaques). The local
3026
+ // bracketMode is independent of the body-walker's `activeComplexField`:
3027
+ // hyperlink children are a self-contained sub-walk that doesn't
3028
+ // escape the hyperlink scope.
3029
+ type FieldBracketMode = "outside" | "instruction" | "result";
3030
+ let bracketMode: FieldBracketMode = "outside";
3031
+
2820
3032
  for (const child of node.children) {
2821
3033
  if (child.type !== "element") {
2822
3034
  continue;
@@ -2829,9 +3041,29 @@ function parseHyperlink(
2829
3041
  };
2830
3042
  }
2831
3043
 
3044
+ // Pre-scan for fldChar bracket transitions. Word emits each fldChar
3045
+ // in a run of its own; any other content sharing that run is dropped,
3046
+ // because the whole run is skipped after updating bracketMode.
3047
+ const fldChar = child.children.find(
3048
+ (c): c is XmlElementNode =>
3049
+ c.type === "element" && localName(c.name) === "fldChar",
3050
+ );
3051
+ if (fldChar) {
3052
+ const fldType =
3053
+ fldChar.attributes["w:fldCharType"] ?? fldChar.attributes.fldCharType;
3054
+ if (fldType === "begin") bracketMode = "instruction";
3055
+ else if (fldType === "separate") bracketMode = "result";
3056
+ else if (fldType === "end") bracketMode = "outside";
3057
+ continue;
3058
+ }
3059
+
2832
3060
  const run = parseRunContentOnly(child, sourceXml, {
2833
3061
  allowDeletedText: options.allowDeletedText,
2834
3062
  preserveUnsupportedReviewMarkup: options.preserveUnsupportedReviewMarkup,
3063
+ // Tolerate `<w:instrText>` siblings inside hyperlink runs — the
3064
+ // bracket-state machine above takes care of dropping them via
3065
+ // `bracketMode === "instruction"` below.
3066
+ allowFieldMarkers: true,
2835
3067
  });
2836
3068
  if (!run.supported) {
2837
3069
  return {
@@ -2840,6 +3072,8 @@ function parseHyperlink(
2840
3072
  };
2841
3073
  }
2842
3074
 
3075
+ // Drop nodes during the field-instruction segment; keep result + outside.
3076
+ if (bracketMode === "instruction") continue;
2843
3077
  children.push(...run.nodes);
2844
3078
  }
2845
3079
 
@@ -2857,6 +3091,16 @@ function parseRunContentOnly(
2857
3091
  options: {
2858
3092
  allowDeletedText?: boolean;
2859
3093
  preserveUnsupportedReviewMarkup?: boolean;
3094
+ /**
3095
+ * Slice A — gracefully skip `<w:fldChar>` / `<w:instrText>` children
3096
+ * instead of bailing to `supported: false`. The hyperlink path
3097
+ * (`parseHyperlink`) opts in so TOC `PAGEREF` field markers inside
3098
+ * hyperlink runs no longer trip exit-B. The body-walker callers do
3099
+ * NOT opt in — they have their own `activeComplexField` state machine
3100
+ * that handles these markers semantically and bailing here is
3101
+ * load-bearing for that machine to see the markers in `parseRun`.
3102
+ */
3103
+ allowFieldMarkers?: boolean;
2860
3104
  } = {},
2861
3105
  ): RunParseResult {
2862
3106
  const marksResult = readRunMarks(node, _sourceXml);
@@ -2940,6 +3184,14 @@ function parseRunContentOnly(
2940
3184
  }
2941
3185
  break;
2942
3186
  }
3187
+ case "fldChar":
3188
+ case "instrText":
3189
+ // Slice A — graceful skip when caller opts in (hyperlink path).
3190
+ // Otherwise fall through to default and bail (body-walker path,
3191
+ // which uses its own activeComplexField state machine in
3192
+ // parseRun to handle these markers).
3193
+ if (options.allowFieldMarkers) break;
3194
+ return { nodes: [], supported: false };
2943
3195
  default:
2944
3196
  return { nodes: [], supported: false };
2945
3197
  }
@@ -10,7 +10,7 @@
10
10
  * preserved in the canonical node's rawXml field for lossless round-trip export.
11
11
  */
12
12
 
13
- import type { ShapeContent } from "../../model/canonical-document.ts";
13
+ import type { BlockNode, ShapeContent } from "../../model/canonical-document.ts";
14
14
  import { parseFill } from "./parse-fill.ts";
15
15
  import {
16
16
  type XmlElementNode,
@@ -32,6 +32,15 @@ export interface ParsedWpsShape {
32
32
  text?: string;
33
33
  /** Raw txbxContent XML for structured re-rendering. */
34
34
  txbxContentXml?: string;
35
+ /**
36
+ * Parsed block-level structure from `w:txbxContent`, populated when a
37
+ * `blockParser` callback is supplied (coord-02 §14 / coord-11 §22 —
38
+ * headers/footers need access to shape-inside text like the CCEP
39
+ * "Copyright CCEP STRICTLY CONFIDENTIAL" red band, which lives in
40
+ * shape-textbox paragraphs). Same shape + semantics as
41
+ * `ShapeContent.txbxBlocks` on the drawing-frame path.
42
+ */
43
+ txbxBlocks?: ReadonlyArray<BlockNode>;
35
44
  /** DrawML geometry preset, e.g. "rect", "roundRect". */
36
45
  geometry?: string;
37
46
  /** Original drawing XML for lossless round-trip export. */
@@ -65,7 +74,10 @@ export type ParsedShape = ParsedWpsShape | ParsedWordArt | ParsedVmlShape;
65
74
  *
66
75
  * Returns null if the drawing does not contain a WPS shape.
67
76
  */
68
- export function parseShapeXml(drawingXml: string): ParsedWpsShape | ParsedWordArt | null {
77
+ export function parseShapeXml(
78
+ drawingXml: string,
79
+ blockParser?: TxbxBlockParser,
80
+ ): ParsedWpsShape | ParsedWordArt | null {
69
81
  const root = parseXml(drawingXml);
70
82
  const graphicData = findFirstDescendant(root, "graphicData");
71
83
  if (!graphicData) return null;
@@ -104,11 +116,37 @@ export function parseShapeXml(drawingXml: string): ParsedWpsShape | ParsedWordAr
104
116
  // Extract raw txbxContent XML for structured re-rendering of text boxes
105
117
  const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
106
118
 
119
+ // Coord-02 §14 / coord-11 §22 follow-up (2026-04-24): when a
120
+ // blockParser is supplied, recurse into the txbxContent to produce a
121
+ // structured block representation. Without this, shape-textbox
122
+ // content (CCEP "Copyright CCEP STRICTLY CONFIDENTIAL" footer band)
123
+ // is reachable only via the `.text` summary string — L03 cascade +
124
+ // L11 render can't walk runs/marks.
125
+ let txbxBlocks: ReadonlyArray<BlockNode> | undefined;
126
+ if (txbxContentXml && blockParser) {
127
+ try {
128
+ // The `blockParser` callback is supplied by parse-main-document.ts
129
+ // as a thin wrapper over `parseBlockStreamFromXml`. That function
130
+ // returns `ParsedBlockNode[]` — structurally identical to canonical
131
+ // `BlockNode[]` at runtime for shape-textbox content (verified on
132
+ // CCEP SOW footer fixture 2026-04-24: paragraph + text + TextMark
133
+ // shapes land end-to-end with zero `ParsedBlockNode`-only fields
134
+ // surfaced). The cast is unchecked by the compiler and relies on the
135
+ // runtime output being canonical; doing `as unknown as BlockNode[]`
136
+ // once here spares every consumer site (L03 cascade, L11 render,
137
+ // validator walk) from repeating it.
138
+ txbxBlocks = blockParser(txbxContentXml) as unknown as ReadonlyArray<BlockNode>;
139
+ } catch {
140
+ txbxBlocks = undefined;
141
+ }
142
+ }
143
+
107
144
  return {
108
145
  type: "shape",
109
146
  ...(isTextBox ? { isTextBox: true } : {}),
110
147
  ...(text ? { text } : {}),
111
148
  ...(txbxContentXml ? { txbxContentXml } : {}),
149
+ ...(txbxBlocks && txbxBlocks.length > 0 ? { txbxBlocks } : {}),
112
150
  ...(prst ? { geometry: prst } : {}),
113
151
  rawXml: drawingXml,
114
152
  };
@@ -186,6 +224,17 @@ function extractAllText(node: XmlElementNode): string {
186
224
  // txbxContentXml, optional recursive txbxBlocks).
187
225
  // ───────────────────────────────────────────────────────────────────────────
188
226
 
227
+ /**
228
+ * Callback signature for the txbx-content block parser supplied by
229
+ * parse-main-document.ts / parse-headers-footers.ts. The actual
230
+ * implementation wraps `parseBlockStreamFromXml` which returns
231
+ * `ParsedBlockNode[]`; its runtime output is canonical `BlockNode[]`
232
+ * for shape-textbox content (no `ParsedBlockNode`-only fields surface
233
+ * at the shape boundary — verified on CCEP SOW footer fixture
234
+ * 2026-04-24). The structural (`unknown`-valued) return keeps the parse
235
+ * layer pure; `parseShapeContent` + `parseShapeXml` cast to canonical
236
+ * `BlockNode[]` at the assembly seam.
237
+ */
189
238
  export type TxbxBlockParser = (xml: string) => ReadonlyArray<{ type: string; [key: string]: unknown }>;
190
239
 
191
240
  export function parseShapeContent(
@@ -212,10 +261,15 @@ export function parseShapeContent(
212
261
  const txbxContent = txbx ? findFirstDescendant(txbx, "txbxContent") : undefined;
213
262
  const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
214
263
 
215
- let txbxBlocks: ReadonlyArray<{ type: string; [key: string]: unknown }> | undefined;
264
+ let txbxBlocks: ReadonlyArray<BlockNode> | undefined;
216
265
  if (txbxContentXml && blockParser) {
217
266
  try {
218
- txbxBlocks = blockParser(txbxContentXml);
267
+ // See `TxbxBlockParser` doc above: runtime output is canonical
268
+ // `BlockNode[]` for shape-textbox content (verified on CCEP SOW
269
+ // footer fixture 2026-04-24). Cast at the assembly seam so
270
+ // downstream consumers (L03, L11, validator) get canonical types
271
+ // without local `as unknown` ceremony.
272
+ txbxBlocks = blockParser(txbxContentXml) as unknown as ReadonlyArray<BlockNode>;
219
273
  } catch {
220
274
  // Preserve-only fallback: keep txbxContentXml for serialization; leave
221
275
  // txbxBlocks undefined so consumers know recursion did not succeed.
@@ -55,8 +55,21 @@ import { classifyFieldInstruction } from "./parse-fields.ts";
55
55
 
56
56
  /**
57
57
  * Field families safe enough to leave a `<w:tbl>` in structured
58
- * canonical form. Widening this set commits L06 / L08 to cell-level
59
- * edit semantics for that family — don't expand opportunistically.
58
+ * canonical form.
59
+ *
60
+ * The principle: the body-direct paragraph parser accepts every
61
+ * field family and emits a typed `FieldInlineNode` — classified
62
+ * families get a refresh slot, preserve-only families round-trip
63
+ * via the shared `FieldInlineNode` shape. Cells inside a `<w:tbl>`
64
+ * run through the same parser + serializer. Flattening the whole
65
+ * table to `opaque_block` because one cell carries a field is
66
+ * over-conservative: preserve-only fields round-trip identically
67
+ * whether they sit in a body paragraph or a table cell.
68
+ *
69
+ * This set is consulted AFTER `isWellFormedFieldInstruction` — for
70
+ * the rare case where a field instruction doesn't start with a
71
+ * recognizable OOXML family identifier, we still accept it iff the
72
+ * classifier happened to recognize it.
60
73
  */
61
74
  export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
62
75
  "REF",
@@ -65,6 +78,18 @@ export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
65
78
  "TOC",
66
79
  "PAGE",
67
80
  "NUMPAGES",
81
+ "STYLEREF",
82
+ "SECTIONPAGES",
83
+ "DATE",
84
+ "TIME",
85
+ "AUTHOR",
86
+ "FILENAME",
87
+ "MERGEFIELD",
88
+ "IF",
89
+ "SEQ",
90
+ "INDEX",
91
+ "TC",
92
+ "FORMULA",
68
93
  ]);
69
94
 
70
95
  /**
@@ -127,23 +152,63 @@ export function extractComplexFieldInstructionsFromRaw(rawXml: string): string[]
127
152
  * `FORMDROPDOWN`. These are fully supported by the body-direct
128
153
  * paragraph parser via `parseFFDataFromFldChar` but classify as
129
154
  * `UNKNOWN` under `FIELD_FAMILY_PATTERN` (which targets data-field
130
- * families like REF / TOC / MERGEFIELD). Short-circuiting them lets
131
- * form-field cells stay in structured canonical tables instead of
132
- * flattening the entire table to `opaque_block`. Coord-01 §11, 2026-04-24.
155
+ * families like REF / TOC / MERGEFIELD). Kept as a named helper for
156
+ * readability; `isWellFormedFieldInstruction` would also accept them
157
+ * via the generic identifier pattern, but the named check documents
158
+ * the carve-out's origin (coord-01 §11, 2026-04-24).
133
159
  */
134
160
  export function isLegacyFormFieldInstruction(instruction: string): boolean {
135
161
  return /^\s*(FORMTEXT|FORMCHECKBOX|FORMDROPDOWN)\b/i.test(instruction);
136
162
  }
137
163
 
164
+ /**
165
+ * Matches any well-formed OOXML field instruction. OOXML field
166
+ * instructions (ECMA-376 §17.16) begin with an ALL-CAPS family name
167
+ * — `REF`, `PAGE`, `TOC`, `MERGEFIELD`, `DOCPROPERTY`,
168
+ * `LISTNUM`, and so on through the full §17.16 catalog (60+
169
+ * families). Pattern-matching the family-name prefix lets us
170
+ * recognize every standard field shape WITHOUT adding each one to
171
+ * the L02 `PreserveOnlyFieldFamily` union (which would require a
172
+ * cross-lane slice) OR to `FIELD_FAMILY_PATTERN` (which expands
173
+ * classification-time behavior).
174
+ *
175
+ * The narrow `SAFE_TABLE_FIELD_FAMILIES` above is consulted as a
176
+ * fallback for the rare case of a field instruction that doesn't
177
+ * start with a family-name prefix but DOES classify to a known
178
+ * family (e.g. a lower-case family name — the pattern tolerates leading
179
+ * whitespace but is case-sensitive).
180
+ *
181
+ * Rejection remains for:
182
+ * - Instructions that don't start with a family-name shape — these
183
+ * could be garbled / truncated / mid-field text; flattening the
184
+ * table is the safe preservation path.
185
+ * - Revision markup anywhere in the table (tracked changes —
186
+ * unaffected by this check; handled by `RISKY_TABLE_MARKUP_RE`
187
+ * below).
188
+ */
189
+ const WELL_FORMED_FIELD_INSTRUCTION_RE = /^\s*[A-Z][A-Z0-9_]*\b/;
190
+
191
+ export function isWellFormedFieldInstruction(instruction: string): boolean {
192
+ return WELL_FORMED_FIELD_INSTRUCTION_RE.test(instruction);
193
+ }
194
+
138
195
  /**
139
196
  * Decides whether a single field instruction (either `w:instr`
140
197
  * attribute value or concatenated `instrText` run) is safe for
141
198
  * structured-table parsing. Used by the shared predicate below;
142
199
  * exposed for direct callers (the debug diagnostics script runs
143
200
  * this to classify source instructions alongside the canonical).
201
+ *
202
+ * Order matters: the well-formed prefix check covers every standard
203
+ * OOXML field family in one pass; the classifier fallback catches
204
+ * edge cases where `FIELD_FAMILY_PATTERN` happens to match but the
205
+ * prefix shape doesn't (unlikely, but the fallback keeps behavior
206
+ * consistent with classification for any instruction the classifier
207
+ * recognizes).
144
208
  */
145
209
  export function isSafeTableFieldInstruction(instruction: string): boolean {
146
210
  if (isLegacyFormFieldInstruction(instruction)) return true;
211
+ if (isWellFormedFieldInstruction(instruction)) return true;
147
212
  const family = classifyFieldInstruction(instruction).family;
148
213
  return SAFE_TABLE_FIELD_FAMILIES.has(family);
149
214
  }
@@ -1786,12 +1786,29 @@ export interface SmartArtPreviewNode {
1786
1786
  /**
1787
1787
  * Read-only rendering of a wps:wsp WordprocessingShape. Text content is
1788
1788
  * extracted for display. The original drawing XML is preserved in rawXml.
1789
+ *
1790
+ * When the shape is a text-box (`isTextBox: true`), the raw textbox XML
1791
+ * is preserved in `txbxContentXml` for lossless round-trip, and the
1792
+ * parsed block structure lands in `txbxBlocks` — canonical `BlockNode[]`
1793
+ * with styles already resolved (coord-02 §14 / coord-11 §22 closed L01
1794
+ * side 2026-04-24 in `7d87f1189`; L02 type-promoted 2026-04-24 once the
1795
+ * runtime contract was confirmed canonical).
1789
1796
  */
1790
1797
  export interface ShapeNode {
1791
1798
  type: "shape";
1792
1799
  text?: string;
1793
1800
  geometry?: string;
1794
1801
  isTextBox?: boolean;
1802
+ /** Raw `<w:txbxContent>` XML, preserved for serialization + round-trip. */
1803
+ txbxContentXml?: string;
1804
+ /**
1805
+ * Parsed canonical block-level structure from `<w:txbxContent>`,
1806
+ * populated when the parse path supplies a `blockParser` callback
1807
+ * (headers/footers via `src/io/ooxml/parse-headers-footers.ts`;
1808
+ * body via `src/io/ooxml/parse-main-document.ts`). Shape + semantics
1809
+ * identical to `ShapeContent.txbxBlocks` on the drawing-frame path.
1810
+ */
1811
+ txbxBlocks?: ReadonlyArray<BlockNode>;
1795
1812
  rawXml: string;
1796
1813
  }
1797
1814
 
@@ -1971,14 +1988,16 @@ export interface ShapeContent {
1971
1988
  * Parsed block-level structure from `w:txbxContent`, populated when a
1972
1989
  * `blockParser` callback is supplied during parse (CO4 F3.3).
1973
1990
  *
1974
- * Type is deliberately structural (`{ type: string; ... }`) rather than
1975
- * canonical `BlockNode[]` because the recursion stops at the parse layer
1976
- * before the style + numbering normalization pass that converts
1977
- * `ParsedBlockNode` to canonical `BlockNode`. Consumers that need the fully
1978
- * normalized form run normalization on this subtree explicitly. Testing
1979
- * that `txbxBlocks.length > 0` proves the recursion executed.
1991
+ * Canonical `BlockNode[]` — the parse path produces fully-normalized
1992
+ * blocks (styles resolved, marks attached, no `ParsedBlockNode`-only
1993
+ * fields at runtime). Verified on the CCEP SOW footer fixture 2026-04-24:
1994
+ * paragraph + text + `TextMark` shapes land end-to-end. Type promoted
1995
+ * 2026-04-24 from the earlier weakly-typed escape hatch once the L01
1996
+ * shape-textbox parse (commit `7d87f1189`) confirmed the runtime
1997
+ * contract — unblocks L03 cascade + L11 render walking `txbxBlocks`
1998
+ * without `as unknown as BlockNode[]` casts at the consumer site.
1980
1999
  */
1981
- txbxBlocks?: ReadonlyArray<{ type: string; [key: string]: unknown }>;
2000
+ txbxBlocks?: ReadonlyArray<BlockNode>;
1982
2001
  rawXml: string;
1983
2002
  }
1984
2003
 
@@ -2860,11 +2879,29 @@ function validateDocumentNode(
2860
2879
  return;
2861
2880
  case "chart_preview":
2862
2881
  case "smartart_preview":
2863
- case "shape":
2864
2882
  case "wordart":
2865
2883
  case "vml_shape":
2866
2884
  expectString(record.rawXml, `${path}.rawXml`, issues);
2867
2885
  return;
2886
+ case "shape":
2887
+ expectString(record.rawXml, `${path}.rawXml`, issues);
2888
+ if (record.txbxBlocks !== undefined) {
2889
+ if (!Array.isArray(record.txbxBlocks)) {
2890
+ issues.push({
2891
+ path: `${path}.txbxBlocks`,
2892
+ message: "shape.txbxBlocks must be an array when present.",
2893
+ });
2894
+ } else {
2895
+ // coord-02 §14 follow-up (2026-04-24): `ShapeNode.txbxBlocks`
2896
+ // is canonical `BlockNode[]`. Walk it with the same validator
2897
+ // used for top-level document content so run marks / paragraph
2898
+ // structure / nested shapes all enforce the normal rules.
2899
+ record.txbxBlocks.forEach((child, index) => {
2900
+ validateDocumentNode(child, `${path}.txbxBlocks[${index}]`, issues);
2901
+ });
2902
+ }
2903
+ }
2904
+ return;
2868
2905
  case "drawing_frame": {
2869
2906
  const anchor = asPlainObject(record.anchor, `${path}.anchor`, issues);
2870
2907
  const content = asPlainObject(record.content, `${path}.content`, issues);