@beyondwork/docx-react-component 1.0.71 → 1.0.73

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/README.md +964 -75
  2. package/package.json +1 -1
  3. package/src/api/public-types.ts +280 -1
  4. package/src/api/v3/_create.ts +16 -1
  5. package/src/api/v3/_runtime-handle.ts +2 -0
  6. package/src/api/v3/ai/evaluate.ts +113 -0
  7. package/src/api/v3/ai/outline.ts +140 -0
  8. package/src/api/v3/ai/policy.ts +31 -0
  9. package/src/api/v3/ai/replacement.ts +8 -0
  10. package/src/api/v3/ai/review.ts +342 -0
  11. package/src/api/v3/ai/stats.ts +62 -0
  12. package/src/api/v3/runtime/viewport.ts +181 -0
  13. package/src/api/v3/runtime/workflow.ts +114 -1
  14. package/src/api/v3/ui/_types.ts +35 -0
  15. package/src/api/v3/ui/chrome-preset-model.ts +6 -0
  16. package/src/api/v3/ui/index.ts +1 -0
  17. package/src/api/v3/ui/viewport.ts +112 -0
  18. package/src/compare/diff-engine.ts +2 -0
  19. package/src/core/commands/formatting-commands.ts +1 -0
  20. package/src/core/commands/table-structure-commands.ts +1 -0
  21. package/src/core/state/editor-state.ts +49 -6
  22. package/src/io/export/serialize-footnotes.ts +6 -0
  23. package/src/io/export/serialize-headers-footers.ts +7 -0
  24. package/src/io/export/serialize-main-document.ts +20 -0
  25. package/src/io/export/serialize-paragraph-formatting.ts +34 -0
  26. package/src/io/export/split-review-boundaries.ts +1 -0
  27. package/src/io/normalize/normalize-text.ts +49 -2
  28. package/src/io/ooxml/parse-headers-footers.ts +31 -0
  29. package/src/io/ooxml/parse-main-document.ts +148 -7
  30. package/src/io/ooxml/parse-paragraph-formatting.ts +105 -0
  31. package/src/model/canonical-document.ts +401 -1
  32. package/src/runtime/formatting/formatting-context.ts +2 -1
  33. package/src/runtime/geometry/overlay-rects.ts +7 -10
  34. package/src/runtime/layout/layout-engine-version.ts +278 -1
  35. package/src/runtime/layout/paginated-layout-engine.ts +181 -8
  36. package/src/runtime/layout/resolved-formatting-state.ts +108 -13
  37. package/src/runtime/markdown-sanitizer.ts +21 -4
  38. package/src/runtime/render/render-kernel.ts +21 -1
  39. package/src/runtime/scopes/action-validation.ts +30 -4
  40. package/src/runtime/scopes/audit-bundle.ts +8 -0
  41. package/src/runtime/scopes/compiler-service.ts +1 -0
  42. package/src/runtime/scopes/enumerate-scopes.ts +61 -3
  43. package/src/runtime/scopes/replacement/apply.ts +50 -3
  44. package/src/runtime/scopes/scope-kinds/paragraph.ts +170 -7
  45. package/src/runtime/scopes/semantic-scope-types.ts +27 -0
  46. package/src/runtime/surface-projection.ts +77 -0
  47. package/src/runtime/workflow/coordinator.ts +3 -0
  48. package/src/runtime/workflow/scope-writer.ts +34 -0
  49. package/src/session/export/embedded-reconstitute.ts +37 -3
  50. package/src/session/import/embedded-offload.ts +26 -1
  51. package/src/session/import/loader-types.ts +18 -0
  52. package/src/session/import/loader.ts +2 -0
  53. package/src/shell/media-previews.ts +8 -6
  54. package/src/ui/WordReviewEditor.tsx +1 -0
  55. package/src/ui/editor-surface-controller.tsx +11 -0
  56. package/src/ui/headless/selection-helpers.ts +2 -2
  57. package/src/ui/runtime-shortcut-dispatch.ts +4 -4
  58. package/src/ui-tailwind/chrome/tw-runtime-repl-dialog.tsx +22 -4
  59. package/src/ui-tailwind/chrome/tw-table-context-toolbar.tsx +11 -11
  60. package/src/ui-tailwind/chrome/tw-table-grip-layer.tsx +1 -1
  61. package/src/ui-tailwind/chrome-overlay/tw-chrome-overlay.tsx +5 -0
  62. package/src/ui-tailwind/chrome-overlay/tw-comment-balloon-layer.tsx +18 -1
  63. package/src/ui-tailwind/chrome-overlay/tw-page-stack-overlay-layer.tsx +22 -6
  64. package/src/ui-tailwind/chrome-overlay/tw-revision-margin-bar-layer.tsx +18 -1
  65. package/src/ui-tailwind/editor-surface/pm-page-break-decorations.ts +98 -3
  66. package/src/ui-tailwind/editor-surface/pm-schema.ts +50 -4
  67. package/src/ui-tailwind/editor-surface/pm-state-from-snapshot.ts +6 -0
  68. package/src/ui-tailwind/editor-surface/scroll-anchor.ts +8 -1
  69. package/src/ui-tailwind/editor-surface/search-plugin.ts +2 -4
  70. package/src/ui-tailwind/editor-surface/tw-page-block-view.helpers.ts +114 -0
  71. package/src/ui-tailwind/editor-surface/tw-page-block-view.tsx +12 -4
  72. package/src/ui-tailwind/editor-surface/tw-prosemirror-surface.tsx +29 -4
  73. package/src/ui-tailwind/index.ts +4 -2
  74. package/src/ui-tailwind/page-chrome-model.ts +5 -7
  75. package/src/ui-tailwind/page-stack/floating-image-overlay-model.ts +54 -34
  76. package/src/ui-tailwind/page-stack/tw-endnote-area.tsx +4 -1
  77. package/src/ui-tailwind/page-stack/tw-footnote-area.tsx +4 -1
  78. package/src/ui-tailwind/page-stack/tw-page-chrome-entry.tsx +10 -1
  79. package/src/ui-tailwind/page-stack/tw-page-footer-band.tsx +8 -1
  80. package/src/ui-tailwind/page-stack/tw-page-header-band.tsx +11 -1
  81. package/src/ui-tailwind/page-stack/tw-page-stack-chrome-layer.tsx +7 -1
  82. package/src/ui-tailwind/page-stack/tw-region-block-renderer.tsx +139 -10
  83. package/src/ui-tailwind/review/comment-markdown-renderer.tsx +1 -1
  84. package/src/ui-tailwind/review-workspace/page-chrome.ts +4 -4
  85. package/src/ui-tailwind/review-workspace/use-workspace-side-effects.ts +1 -1
  86. package/src/ui-tailwind/theme/editor-theme.css +15 -16
  87. package/src/ui-tailwind/tw-review-workspace.tsx +22 -14
@@ -18,6 +18,7 @@ import {
18
18
  } from "./table-properties-xml.ts";
19
19
  import { twip } from "./twip.ts";
20
20
  import { escapeXmlAttribute } from "./escape-xml-attribute.ts";
21
+ import { buildFrameXml } from "./serialize-paragraph-formatting.ts";
21
22
 
22
23
  export const WORD_FOOTNOTES_CONTENT_TYPE =
23
24
  "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml";
@@ -222,6 +223,11 @@ function buildParagraphPropertiesXml(paragraph: ParagraphNode): string {
222
223
  if (paragraph.styleId) {
223
224
  parts.push(`<w:pStyle w:val="${escapeXmlAttribute(paragraph.styleId)}"/>`);
224
225
  }
226
+ // Coord-04 §1.19.d — direct-paragraph framePr (footnotes path).
227
+ {
228
+ const frameXml = buildFrameXml(paragraph.frameProperties);
229
+ if (frameXml) parts.push(frameXml);
230
+ }
225
231
  if (paragraph.alignment) {
226
232
  parts.push(`<w:jc w:val="${escapeXmlAttribute(paragraph.alignment)}"/>`);
227
233
  }
@@ -18,6 +18,7 @@ import {
18
18
  } from "./table-properties-xml.ts";
19
19
  import { twip } from "./twip.ts";
20
20
  import { escapeXmlAttribute } from "./escape-xml-attribute.ts";
21
+ import { buildFrameXml } from "./serialize-paragraph-formatting.ts";
21
22
 
22
23
  export const WORD_HEADER_CONTENT_TYPE =
23
24
  "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
@@ -186,6 +187,11 @@ function buildParagraphPropertiesXml(paragraph: ParagraphNode): string {
186
187
  if (paragraph.styleId) {
187
188
  parts.push(`<w:pStyle w:val="${escapeXmlAttribute(paragraph.styleId)}"/>`);
188
189
  }
190
+ // Coord-04 §1.19.d — direct-paragraph framePr (headers/footers path).
191
+ {
192
+ const frameXml = buildFrameXml(paragraph.frameProperties);
193
+ if (frameXml) parts.push(frameXml);
194
+ }
189
195
  if (paragraph.spacing) {
190
196
  const s = paragraph.spacing;
191
197
  const attrs: string[] = [];
@@ -284,6 +290,7 @@ function serializeInlineNode(node: InlineNode): string {
284
290
  throw new Error(`Cannot safely serialize ${node.type} content in header/footer sub-parts.`);
285
291
  case "image":
286
292
  case "column_break":
293
+ case "page_break":
287
294
  case "symbol":
288
295
  default:
289
296
  throw new Error(`Cannot safely serialize ${node.type} content in header/footer sub-parts.`);
@@ -22,6 +22,7 @@ import { SCOPE_MARKER_BOOKMARK_PREFIX } from "../ooxml/parse-scope-markers.ts";
22
22
  import { getOpaqueFragment } from "../../preservation/store.ts";
23
23
  import { retainRelationshipsForFragment } from "../../preservation/relationship-retention.ts";
24
24
  import { serializeParagraphNumberingProperties } from "./serialize-numbering.ts";
25
+ import { buildFrameXml } from "./serialize-paragraph-formatting.ts";
25
26
  import {
26
27
  serializeTableCellPropertiesXml,
27
28
  serializeTablePropertiesXml,
@@ -581,6 +582,8 @@ function serializeTableInlineNode(
581
582
  return "<w:r><w:tab/></w:r>";
582
583
  case "column_break":
583
584
  return "<w:r><w:br w:type=\"column\"/></w:r>";
585
+ case "page_break":
586
+ return "<w:r><w:br w:type=\"page\"/></w:r>";
584
587
  case "hard_break":
585
588
  return "<w:r><w:br/></w:r>";
586
589
  case "symbol": {
@@ -714,6 +717,12 @@ function buildParagraphPropertiesXml(paragraph: ParagraphNode): string {
714
717
  pushOnOffParagraphProperty(children, "keepNext", paragraph.keepNext);
715
718
  pushOnOffParagraphProperty(children, "keepLines", paragraph.keepLines);
716
719
  pushOnOffParagraphProperty(children, "pageBreakBefore", paragraph.pageBreakBefore);
720
+ // ECMA-376 §17.3.1 canonical slot for framePr: between pageBreakBefore
721
+ // and pBdr. Coord-04 §1.19.d — direct-paragraph path.
722
+ {
723
+ const frameXml = buildFrameXml(paragraph.frameProperties);
724
+ if (frameXml) children.push(frameXml);
725
+ }
717
726
  pushOnOffParagraphProperty(children, "widowControl", paragraph.widowControl);
718
727
  if (paragraph.outlineLevel !== undefined) {
719
728
  children.push(`<w:outlineLvl w:val="${paragraph.outlineLevel}"/>`);
@@ -1010,6 +1019,17 @@ function serializeInlineNode(
1010
1019
  boundaries,
1011
1020
  };
1012
1021
  }
1022
+ case "page_break": {
1023
+ const xml = `<w:r><w:br w:type="page"/></w:r>`;
1024
+ const boundaries = new Map<number, number>();
1025
+ boundaries.set(cursor, xmlOffset);
1026
+ boundaries.set(cursor + 1, xmlOffset + xml.length);
1027
+ return {
1028
+ xml,
1029
+ cursor: cursor + 1,
1030
+ boundaries,
1031
+ };
1032
+ }
1013
1033
  case "hard_break": {
1014
1034
  const xml = serializeRun({ kind: "hard_break" });
1015
1035
  const boundaries = new Map<number, number>();
@@ -6,6 +6,7 @@
6
6
 
7
7
  import type {
8
8
  CanonicalParagraphFormatting,
9
+ FrameProperties,
9
10
  ParagraphBorders,
10
11
  ParagraphIndentation,
11
12
  ParagraphShading,
@@ -92,6 +93,34 @@ function buildSpacingXml(s: ParagraphSpacing | undefined): string {
92
93
  return attrs.length > 0 ? `<w:spacing ${attrs.join(" ")}/>` : "";
93
94
  }
94
95
 
96
+ export function buildFrameXml(f: FrameProperties | undefined): string {
97
+ if (!f) return "";
98
+ // Prefer parsed rawXml when available — preserves extension attributes
99
+ // (`w14:*`, `w15:*`, `mc:Ignorable`) that the typed field set doesn't
100
+ // cover. When rawXml isn't present (parser couldn't capture the source
101
+ // string), emit from typed fields only; extension attrs are lost in
102
+ // that round-trip path, but every CCEP-class framed paragraph we've
103
+ // seen uses only modelled attributes.
104
+ if (f.rawXml) return f.rawXml;
105
+ const attrs: string[] = [];
106
+ if (f.widthTwips !== undefined) attrs.push(`w:w="${f.widthTwips}"`);
107
+ if (f.heightTwips !== undefined) attrs.push(`w:h="${f.heightTwips}"`);
108
+ if (f.hRule) attrs.push(`w:hRule="${escXml(f.hRule)}"`);
109
+ if (f.xTwips !== undefined) attrs.push(`w:x="${f.xTwips}"`);
110
+ if (f.yTwips !== undefined) attrs.push(`w:y="${f.yTwips}"`);
111
+ if (f.xAlign) attrs.push(`w:xAlign="${escXml(f.xAlign)}"`);
112
+ if (f.yAlign) attrs.push(`w:yAlign="${escXml(f.yAlign)}"`);
113
+ if (f.hAnchor) attrs.push(`w:hAnchor="${escXml(f.hAnchor)}"`);
114
+ if (f.vAnchor) attrs.push(`w:vAnchor="${escXml(f.vAnchor)}"`);
115
+ if (f.wrap) attrs.push(`w:wrap="${escXml(f.wrap)}"`);
116
+ if (f.hSpaceTwips !== undefined) attrs.push(`w:hSpace="${f.hSpaceTwips}"`);
117
+ if (f.vSpaceTwips !== undefined) attrs.push(`w:vSpace="${f.vSpaceTwips}"`);
118
+ if (f.dropCap) attrs.push(`w:dropCap="${escXml(f.dropCap)}"`);
119
+ if (f.lines !== undefined) attrs.push(`w:lines="${f.lines}"`);
120
+ if (f.anchorLock !== undefined) attrs.push(`w:anchorLock="${f.anchorLock ? "1" : "0"}"`);
121
+ return attrs.length > 0 ? `<w:framePr ${attrs.join(" ")}/>` : "";
122
+ }
123
+
95
124
  function buildIndentXml(i: ParagraphIndentation | undefined): string {
96
125
  if (!i) return "";
97
126
  const attrs: string[] = [];
@@ -114,6 +143,11 @@ export function buildParagraphPropertiesXml(
114
143
  parts.push(toggleEl("keepLines", pPr.keepLines));
115
144
  parts.push(toggleEl("pageBreakBefore", pPr.pageBreakBefore));
116
145
 
146
+ // 2. framePr (ECMA-376 §17.3.1 canonical order slot, between pageBreakBefore
147
+ // and pBdr). Emit before pBdr so the OpenXML SDK validator accepts a framed
148
+ // paragraph that also carries borders (coord-04 §1.18.d).
149
+ parts.push(buildFrameXml(pPr.frameProperties));
150
+
117
151
  // 4. pBdr
118
152
  parts.push(buildBordersXml(pPr.borders));
119
153
 
@@ -340,6 +340,7 @@ function measureInlineNodeForReviewBoundaries(node: InlineNode): number {
340
340
  case "tab":
341
341
  case "hard_break":
342
342
  case "column_break":
343
+ case "page_break":
343
344
  case "footnote_ref":
344
345
  case "image":
345
346
  case "opaque_inline":
@@ -264,6 +264,7 @@ function normalizeParagraph(
264
264
  ...(paragraph.suppressLineNumbers !== undefined
265
265
  ? { suppressLineNumbers: paragraph.suppressLineNumbers }
266
266
  : {}),
267
+ ...(paragraph.frameProperties ? { frameProperties: paragraph.frameProperties } : {}),
267
268
  // A.7: preserve w14:paraId / w14:textId across import → export so
268
269
  // downstream tools that diff documents by paragraph id stay stable.
269
270
  ...(paragraph.wordExtensionIds
@@ -481,6 +482,17 @@ function normalizeInlineChildren(
481
482
  normalized.push({ type: "column_break" });
482
483
  state.cursor += 1;
483
484
  break;
485
+ case "page_break":
486
+ // coord-04 §1.18.5 follow-up: the fde93da3 cross-layer page_break
487
+ // ship added parse + surface-projection + pagination but missed
488
+ // this normalize-text switch. Without this case, every
489
+ // `<w:br w:type="page"/>` run parsed by L01 falls through and gets
490
+ // silently dropped during canonical assembly — so L04's
491
+ // `hasPageBreak` never fires on real documents. Mirrors the
492
+ // `column_break` branch.
493
+ normalized.push({ type: "page_break" });
494
+ state.cursor += 1;
495
+ break;
484
496
  case "chart_preview":
485
497
  registerComplexPreviewMedia(state, node);
486
498
  normalized.push({
@@ -704,9 +716,30 @@ function registerComplexPreviewMedia(
704
716
  function normalizeHyperlink(node: ParsedHyperlinkNode): {
705
717
  type: "hyperlink";
706
718
  href: string;
707
- children: Array<TextNode | { type: "hard_break" } | { type: "tab" }>;
719
+ children: Array<
720
+ | TextNode
721
+ | { type: "hard_break" }
722
+ | { type: "column_break" }
723
+ | { type: "page_break" }
724
+ | { type: "tab" }
725
+ | { type: "symbol"; char: string; font?: string; marks?: TextMark[] }
726
+ >;
708
727
  } {
709
- const children: Array<TextNode | { type: "hard_break" } | { type: "tab" }> = [];
728
+ // Canonical `HyperlinkNode.children` accepts the full inline-leaf set
729
+ // (TextNode | HardBreakNode | ColumnBreakNode | PageBreakNode | TabNode |
730
+ // SymbolNode). Matching the canonical shape here keeps rare
731
+ // hyperlink-inside-break patterns (a link spanning a column or page
732
+ // break in Word's output) from silently dropping at the normalize step —
733
+ // same class of drop that `coord-04 §1.19.b` fixed one level up in
734
+ // `normalizeInlineChildren`.
735
+ const children: Array<
736
+ | TextNode
737
+ | { type: "hard_break" }
738
+ | { type: "column_break" }
739
+ | { type: "page_break" }
740
+ | { type: "tab" }
741
+ | { type: "symbol"; char: string; font?: string; marks?: TextMark[] }
742
+ > = [];
710
743
 
711
744
  for (const child of node.children) {
712
745
  switch (child.type) {
@@ -732,6 +765,20 @@ function normalizeHyperlink(node: ParsedHyperlinkNode): {
732
765
  case "hard_break":
733
766
  children.push({ type: "hard_break" });
734
767
  break;
768
+ case "column_break":
769
+ children.push({ type: "column_break" });
770
+ break;
771
+ case "page_break":
772
+ children.push({ type: "page_break" });
773
+ break;
774
+ case "symbol":
775
+ children.push({
776
+ type: "symbol",
777
+ char: child.char,
778
+ ...(child.font ? { font: child.font } : {}),
779
+ ...(child.marks && child.marks.length > 0 ? { marks: child.marks } : {}),
780
+ });
781
+ break;
735
782
  }
736
783
  }
737
784
 
@@ -328,6 +328,37 @@ function parseParagraphElement(
328
328
  activeComplexField = null;
329
329
  }
330
330
  pushFieldNode(children, child, "simple");
331
+ } else if (name === "sdt") {
332
+ // coord-11 §22 — structured-document-tag wrapping run-level content
333
+ // inside a header/footer paragraph. Word commonly uses these to
334
+ // bundle the page-number field + decorative drawings (e.g. CCEP's
335
+ // footer "Copyright CCEP STRICTLY CONFIDENTIAL" red rectangle +
336
+ // "Page N" label both sit inside one `<w:sdt>` in footer1.xml).
337
+ // Without this case the sdt was silently dropped at the paragraph
338
+ // walker and every run it carried — including WPS shapes bearing
339
+ // the brand-strip text — never reached the canonical tree.
340
+ // Treat `<w:sdtContent>` as a transparent wrapper and re-process
341
+ // its `<w:r>` / `<w:hyperlink>` / bookmark / `<w:fldSimple>` children as if they
342
+ // were direct paragraph children.
343
+ const sdtContent = findChildElementOptional(child, "sdtContent");
344
+ if (sdtContent) {
345
+ for (const grandchild of sdtContent.children) {
346
+ if (grandchild.type !== "element") continue;
347
+ const gname = localName(grandchild.name);
348
+ if (gname === "r") {
349
+ activeComplexField = appendRunNodes(grandchild, children, activeComplexField, sourceXml, opts);
350
+ } else if (gname === "hyperlink") {
351
+ children.push(parseHyperlinkElement(grandchild, opts));
352
+ } else if (gname === "bookmarkStart" || gname === "bookmarkEnd") {
353
+ children.push(parseBookmarkElement(grandchild));
354
+ } else if (gname === "fldSimple") {
355
+ pushFieldNode(children, grandchild, "simple");
356
+ }
357
+ // Nested sdt / other elements ignored — deeper nesting is rare
358
+ // enough that opaque round-trip via the block-level sdt parser
359
+ // handles it if it matters.
360
+ }
361
+ }
331
362
  }
332
363
  }
333
364
 
@@ -1,6 +1,7 @@
1
1
  import type {
2
2
  BorderSpec,
3
3
  CellShading,
4
+ FrameProperties,
4
5
  TextMark,
5
6
  ParagraphBorders,
6
7
  ParagraphShading,
@@ -39,6 +40,7 @@ import { parseComplexContentXml, type ChartPartLookup } from "./parse-complex-co
39
40
  import { parseShapeXml, parseVmlXml } from "./parse-shapes.ts";
40
41
  import { parseObject } from "./parse-object.ts";
41
42
  import { parseDrawingFrame } from "./parse-drawing.ts";
43
+ import { readFrameProperties } from "./parse-paragraph-formatting.ts";
42
44
  import { classifyFieldInstruction } from "./parse-fields.ts";
43
45
  import { parseFFDataFromFldChar } from "./parse-ffdata.ts";
44
46
  import { resolveHighlightColor } from "./highlight-colors.ts";
@@ -217,6 +219,41 @@ function captureGrabBagFromContainer(
217
219
  export interface ParsedMainDocument {
218
220
  blocks: ParsedBlockNode[];
219
221
  finalSectionProperties?: SectionProperties;
222
+ /**
223
+ * Aggregate count of cosmetic markers stripped during parse (see
224
+ * {@link ParseMainDocumentOptions.stripCosmeticMarkers}). Keyed by
225
+ * local element name (e.g. `lastRenderedPageBreak`). Absent when no
226
+ * markers were stripped.
227
+ */
228
+ skippedCosmeticMarkerCounts?: Readonly<Record<string, number>>;
229
+ }
230
+
231
+ /**
232
+ * Cosmetic markers that Word re-inserts on reopen and that carry no
233
+ * contract semantics. Stripping them at parse time unblocks
234
+ * `replaceText` on ranges that today cross them as `opaque_inline`
235
+ * boundaries. See `docs/architecture/cosmetic-marker-strip.md`.
236
+ *
237
+ * This is the Phase 1 set. Bookmark-pair stripping (with reference
238
+ * scan) is Phase 2.
239
+ */
240
+ export const COSMETIC_MARKER_ELEMENT_NAMES: ReadonlySet<string> = new Set([
241
+ "lastRenderedPageBreak",
242
+ "proofErr",
243
+ "noBreakHyphen",
244
+ ]);
245
+
246
+ export interface ParseMainDocumentOptions {
247
+ /**
248
+ * When `true` (the default), drops `<w:lastRenderedPageBreak/>`,
249
+ * `<w:proofErr/>`, and `<w:noBreakHyphen/>` during the parse walk
250
+ * instead of emitting them as `opaque_inline` nodes. Counts are
251
+ * reported on {@link ParsedMainDocument.skippedCosmeticMarkerCounts}.
252
+ *
253
+ * Set to `false` to disable stripping — call sites that emit cosmetic
254
+ * markers then keep them as `opaque_inline` nodes with their source XML.
255
+ */
256
+ stripCosmeticMarkers?: boolean;
220
257
  }
221
258
 
222
259
  export type ParsedBlockNode =
@@ -256,6 +293,15 @@ export interface ParsedParagraphNode {
256
293
  bidi?: boolean;
257
294
  suppressLineNumbers?: boolean;
258
295
  cnfStyle?: string;
296
+ /**
297
+ * `<w:framePr>` declared directly on the paragraph's own `<w:pPr>`.
298
+ * Coord-04 §1.19.d step 2 (inline path). The style-cascade path
299
+ * flows through `CanonicalParagraphFormatting.frameProperties` on
300
+ * the style side; this slot captures the direct-override path so
301
+ * L02 `ParagraphNode.frameProperties` (added 2026-04-24 `4b3ea0b2`)
302
+ * can reach its canonical shape.
303
+ */
304
+ frameProperties?: FrameProperties;
259
305
  /** A.7: preserved w14 extension ids (paraId/textId). */
260
306
  wordExtensionIds?: {
261
307
  paraId?: string;
@@ -271,6 +317,7 @@ export type ParsedInlineNode =
271
317
  | ParsedTextNode
272
318
  | ParsedBreakNode
273
319
  | ParsedColumnBreakNode
320
+ | ParsedPageBreakNode
274
321
  | ParsedTabNode
275
322
  | ParsedSymbolNode
276
323
  | ParsedImageNode
@@ -306,6 +353,10 @@ export interface ParsedColumnBreakNode {
306
353
  type: "column_break";
307
354
  }
308
355
 
356
+ export interface ParsedPageBreakNode {
357
+ type: "page_break";
358
+ }
359
+
309
360
  export interface ParsedTabNode {
310
361
  type: "tab";
311
362
  }
@@ -350,7 +401,7 @@ export interface ParsedImageNode {
350
401
  export interface ParsedHyperlinkNode {
351
402
  type: "hyperlink";
352
403
  href: string;
353
- children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode>;
404
+ children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode>;
354
405
  rawXml: string;
355
406
  }
356
407
 
@@ -606,7 +657,7 @@ interface XmlTextNode {
606
657
  type XmlNode = XmlElementNode | XmlTextNode;
607
658
 
608
659
  interface RunParseResult {
609
- nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode>;
660
+ nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode>;
610
661
  supported: boolean;
611
662
  }
612
663
 
@@ -651,24 +702,61 @@ export function setActiveParseTelemetryBus(bus: ParseTelemetryBus | undefined):
651
702
  activeParseTelemetryBus = bus;
652
703
  }
653
704
 
705
+ /**
706
+ * Request-scoped cosmetic-marker strip context. Set by
707
+ * `parseMainDocumentXml` for the duration of a single parse; read at
708
+ * the four emission sites in `parseBodyChild` / `parseRun` /
709
+ * `parseRunContentOnly` / `parseRevisionContainer`. Using a module
710
+ * variable instead of threading the flag through ~15 intermediate
711
+ * function signatures keeps the call sites readable; the try/finally
712
+ * in the entry point ensures the variable never leaks across calls.
713
+ *
714
+ * Re-entrancy invariant matches `activeChartPartLookup` above.
715
+ */
716
+ interface CosmeticStripContext {
717
+ readonly strip: boolean;
718
+ readonly counts: Record<string, number>;
719
+ }
720
+ let activeCosmeticStripContext: CosmeticStripContext | null = null;
721
+
722
+ function noteStrippedCosmeticMarker(tag: string): void {
723
+ if (!activeCosmeticStripContext) return;
724
+ activeCosmeticStripContext.counts[tag] =
725
+ (activeCosmeticStripContext.counts[tag] ?? 0) + 1;
726
+ }
727
+
728
+ function shouldStripCosmeticMarker(): boolean {
729
+ return activeCosmeticStripContext?.strip === true;
730
+ }
731
+
654
732
  export function parseMainDocumentXml(
655
733
  xml: string,
656
734
  relationships: readonly OpcRelationship[] = [],
657
735
  mediaParts: ReadonlyMap<string, InlineMediaPart> = new Map(),
658
736
  sourcePartPath = "/word/document.xml",
659
737
  chartPartLookup?: ChartPartLookup,
738
+ parseOptions: ParseMainDocumentOptions = {},
660
739
  ): ParsedMainDocument {
661
740
  activeChartPartLookup = chartPartLookup;
741
+ const stripContext: CosmeticStripContext = {
742
+ strip: parseOptions.stripCosmeticMarkers !== false,
743
+ counts: Object.create(null) as Record<string, number>,
744
+ };
745
+ activeCosmeticStripContext = stripContext;
662
746
  const bus = activeParseTelemetryBus;
663
747
  const started = bus?.isEnabled("parse") ? performanceNow() : 0;
664
748
  try {
665
749
  const result = parseMainDocumentXmlInner(xml, relationships, mediaParts, sourcePartPath);
750
+ if (Object.keys(stripContext.counts).length > 0) {
751
+ result.skippedCosmeticMarkerCounts = Object.freeze({ ...stripContext.counts });
752
+ }
666
753
  if (bus?.isEnabled("parse")) {
667
754
  emitParseSummary(bus, result, sourcePartPath, performanceNow() - started);
668
755
  }
669
756
  return result;
670
757
  } finally {
671
758
  activeChartPartLookup = undefined;
759
+ activeCosmeticStripContext = null;
672
760
  }
673
761
  }
674
762
 
@@ -699,6 +787,13 @@ function emitParseSummary(
699
787
  blockCount: result.blocks.length,
700
788
  blockKindCounts: counts,
701
789
  ms,
790
+ // Strip counts are surfaced here (telemetry-only) rather than as a
791
+ // warning on `diagnostics.warnings` — the markers carry no
792
+ // contract semantics and surfacing them in the user-visible
793
+ // warnings feed would be noise. Available to debug UX / tests via
794
+ // the `parse` channel; absent when the feature is disabled or no
795
+ // markers were stripped.
796
+ skippedCosmeticMarkerCounts: result.skippedCosmeticMarkerCounts,
702
797
  },
703
798
  });
704
799
  }
@@ -999,6 +1094,7 @@ function parseBodyChild(
999
1094
  let bidi: ParsedParagraphNode["bidi"];
1000
1095
  let suppressLineNumbers: ParsedParagraphNode["suppressLineNumbers"];
1001
1096
  let cnfStyle: ParsedParagraphNode["cnfStyle"];
1097
+ let frameProperties: ParsedParagraphNode["frameProperties"];
1002
1098
  let sectionProperties: SectionProperties | undefined;
1003
1099
  let sectionPropertiesXml: string | undefined;
1004
1100
  let paragraphSupported = true;
@@ -1045,6 +1141,12 @@ function parseBodyChild(
1045
1141
  bidi = readOnOffParagraphProperty(child, "bidi");
1046
1142
  suppressLineNumbers = readOnOffParagraphProperty(child, "suppressLineNumbers");
1047
1143
  cnfStyle = readParagraphCnfStyle(child);
1144
+ {
1145
+ const framePrNode = child.children.find(
1146
+ (c): c is XmlElementNode => c.type === "element" && localName(c.name) === "framePr",
1147
+ );
1148
+ if (framePrNode) frameProperties = readFrameProperties(framePrNode);
1149
+ }
1048
1150
  sectionProperties = readSectionPropertiesFromPPr(child);
1049
1151
  sectionPropertiesXml = readSectionPropertiesXmlFromPPr(child, sourceXml);
1050
1152
  paragraphSupported = paragraphSupported && supportsParagraphProperties(child);
@@ -1143,6 +1245,10 @@ function parseBodyChild(
1143
1245
  flushActiveComplexField(children, () => {
1144
1246
  activeComplexField = null;
1145
1247
  }, activeComplexField);
1248
+ if (shouldStripCosmeticMarker()) {
1249
+ noteStrippedCosmeticMarker("proofErr");
1250
+ break;
1251
+ }
1146
1252
  children.push({
1147
1253
  type: "opaque_inline",
1148
1254
  rawXml: sourceXml.slice(child.start, child.end),
@@ -1230,6 +1336,7 @@ function parseBodyChild(
1230
1336
  ...(bidi !== undefined ? { bidi } : {}),
1231
1337
  ...(suppressLineNumbers !== undefined ? { suppressLineNumbers } : {}),
1232
1338
  ...(cnfStyle ? { cnfStyle } : {}),
1339
+ ...(frameProperties ? { frameProperties } : {}),
1233
1340
  ...(wordExtensionIds ? { wordExtensionIds } : {}),
1234
1341
  ...(sectionProperties ? { sectionProperties } : {}),
1235
1342
  ...(sectionPropertiesXml ? { sectionPropertiesXml } : {}),
@@ -2390,7 +2497,9 @@ function parseRun(
2390
2497
  break;
2391
2498
  }
2392
2499
  case "br":
2393
- if (isColumnBreak(child)) {
2500
+ if (isPageBreak(child)) {
2501
+ result.push({ type: "page_break" });
2502
+ } else if (isColumnBreak(child)) {
2394
2503
  result.push({ type: "column_break" });
2395
2504
  } else if (isSimpleLineBreak(child)) {
2396
2505
  result.push({ type: "hard_break" });
@@ -2577,6 +2686,11 @@ function parseRun(
2577
2686
  }
2578
2687
  case "lastRenderedPageBreak":
2579
2688
  case "proofErr":
2689
+ case "noBreakHyphen":
2690
+ if (shouldStripCosmeticMarker()) {
2691
+ noteStrippedCosmeticMarker(localName(child.name));
2692
+ break;
2693
+ }
2580
2694
  result.push({
2581
2695
  type: "opaque_inline",
2582
2696
  rawXml: sourceXml.slice(child.start, child.end),
@@ -2650,12 +2764,23 @@ function parseRevisionContainer(
2650
2764
  result.push(hyperlink);
2651
2765
  break;
2652
2766
  }
2767
+ case "proofErr":
2768
+ case "lastRenderedPageBreak":
2769
+ case "noBreakHyphen":
2770
+ if (shouldStripCosmeticMarker()) {
2771
+ noteStrippedCosmeticMarker(localName(child.name));
2772
+ break;
2773
+ }
2774
+ return [
2775
+ {
2776
+ type: "opaque_inline",
2777
+ rawXml: sourceXml.slice(node.start, node.end),
2778
+ },
2779
+ ];
2653
2780
  case "commentRangeStart":
2654
2781
  case "commentRangeEnd":
2655
2782
  case "bookmarkStart":
2656
2783
  case "bookmarkEnd":
2657
- case "proofErr":
2658
- case "lastRenderedPageBreak":
2659
2784
  return [
2660
2785
  {
2661
2786
  type: "opaque_inline",
@@ -2714,7 +2839,7 @@ function parseHyperlink(
2714
2839
  };
2715
2840
  }
2716
2841
 
2717
- const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
2842
+ const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
2718
2843
 
2719
2844
  for (const child of node.children) {
2720
2845
  if (child.type !== "element") {
@@ -2764,7 +2889,7 @@ function parseRunContentOnly(
2764
2889
  }
2765
2890
 
2766
2891
  const marks = marksResult.marks;
2767
- const nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
2892
+ const nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
2768
2893
 
2769
2894
  for (const child of node.children) {
2770
2895
  if (child.type !== "element") {
@@ -2812,6 +2937,10 @@ function parseRunContentOnly(
2812
2937
  break;
2813
2938
  }
2814
2939
  case "br":
2940
+ if (isPageBreak(child)) {
2941
+ nodes.push({ type: "page_break" });
2942
+ break;
2943
+ }
2815
2944
  if (isColumnBreak(child)) {
2816
2945
  nodes.push({ type: "column_break" });
2817
2946
  break;
@@ -2824,10 +2953,17 @@ function parseRunContentOnly(
2824
2953
  case "commentReference":
2825
2954
  case "lastRenderedPageBreak":
2826
2955
  case "proofErr":
2956
+ case "noBreakHyphen": {
2957
+ const tag = localName(child.name);
2958
+ if (shouldStripCosmeticMarker() && tag !== "commentReference") {
2959
+ noteStrippedCosmeticMarker(tag);
2960
+ break;
2961
+ }
2827
2962
  if (options.preserveUnsupportedReviewMarkup) {
2828
2963
  return { nodes: [], supported: false };
2829
2964
  }
2830
2965
  break;
2966
+ }
2831
2967
  default:
2832
2968
  return { nodes: [], supported: false };
2833
2969
  }
@@ -3149,6 +3285,11 @@ function isColumnBreak(node: XmlElementNode): boolean {
3149
3285
  return value === "column";
3150
3286
  }
3151
3287
 
3288
+ function isPageBreak(node: XmlElementNode): boolean {
3289
+ const value = (node.attributes["w:type"] ?? node.attributes.type ?? "").toLowerCase();
3290
+ return value === "page";
3291
+ }
3292
+
3152
3293
  function findChildElement(node: XmlElementNode, childLocalName: string): XmlElementNode {
3153
3294
  const child = node.children.find(
3154
3295
  (entry): entry is XmlElementNode =>