@beyondwork/docx-react-component 1.0.71 → 1.0.73
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +964 -75
- package/package.json +1 -1
- package/src/api/public-types.ts +280 -1
- package/src/api/v3/_create.ts +16 -1
- package/src/api/v3/_runtime-handle.ts +2 -0
- package/src/api/v3/ai/evaluate.ts +113 -0
- package/src/api/v3/ai/outline.ts +140 -0
- package/src/api/v3/ai/policy.ts +31 -0
- package/src/api/v3/ai/replacement.ts +8 -0
- package/src/api/v3/ai/review.ts +342 -0
- package/src/api/v3/ai/stats.ts +62 -0
- package/src/api/v3/runtime/viewport.ts +181 -0
- package/src/api/v3/runtime/workflow.ts +114 -1
- package/src/api/v3/ui/_types.ts +35 -0
- package/src/api/v3/ui/chrome-preset-model.ts +6 -0
- package/src/api/v3/ui/index.ts +1 -0
- package/src/api/v3/ui/viewport.ts +112 -0
- package/src/compare/diff-engine.ts +2 -0
- package/src/core/commands/formatting-commands.ts +1 -0
- package/src/core/commands/table-structure-commands.ts +1 -0
- package/src/core/state/editor-state.ts +49 -6
- package/src/io/export/serialize-footnotes.ts +6 -0
- package/src/io/export/serialize-headers-footers.ts +7 -0
- package/src/io/export/serialize-main-document.ts +20 -0
- package/src/io/export/serialize-paragraph-formatting.ts +34 -0
- package/src/io/export/split-review-boundaries.ts +1 -0
- package/src/io/normalize/normalize-text.ts +49 -2
- package/src/io/ooxml/parse-headers-footers.ts +31 -0
- package/src/io/ooxml/parse-main-document.ts +148 -7
- package/src/io/ooxml/parse-paragraph-formatting.ts +105 -0
- package/src/model/canonical-document.ts +401 -1
- package/src/runtime/formatting/formatting-context.ts +2 -1
- package/src/runtime/geometry/overlay-rects.ts +7 -10
- package/src/runtime/layout/layout-engine-version.ts +278 -1
- package/src/runtime/layout/paginated-layout-engine.ts +181 -8
- package/src/runtime/layout/resolved-formatting-state.ts +108 -13
- package/src/runtime/markdown-sanitizer.ts +21 -4
- package/src/runtime/render/render-kernel.ts +21 -1
- package/src/runtime/scopes/action-validation.ts +30 -4
- package/src/runtime/scopes/audit-bundle.ts +8 -0
- package/src/runtime/scopes/compiler-service.ts +1 -0
- package/src/runtime/scopes/enumerate-scopes.ts +61 -3
- package/src/runtime/scopes/replacement/apply.ts +50 -3
- package/src/runtime/scopes/scope-kinds/paragraph.ts +170 -7
- package/src/runtime/scopes/semantic-scope-types.ts +27 -0
- package/src/runtime/surface-projection.ts +77 -0
- package/src/runtime/workflow/coordinator.ts +3 -0
- package/src/runtime/workflow/scope-writer.ts +34 -0
- package/src/session/export/embedded-reconstitute.ts +37 -3
- package/src/session/import/embedded-offload.ts +26 -1
- package/src/session/import/loader-types.ts +18 -0
- package/src/session/import/loader.ts +2 -0
- package/src/shell/media-previews.ts +8 -6
- package/src/ui/WordReviewEditor.tsx +1 -0
- package/src/ui/editor-surface-controller.tsx +11 -0
- package/src/ui/headless/selection-helpers.ts +2 -2
- package/src/ui/runtime-shortcut-dispatch.ts +4 -4
- package/src/ui-tailwind/chrome/tw-runtime-repl-dialog.tsx +22 -4
- package/src/ui-tailwind/chrome/tw-table-context-toolbar.tsx +11 -11
- package/src/ui-tailwind/chrome/tw-table-grip-layer.tsx +1 -1
- package/src/ui-tailwind/chrome-overlay/tw-chrome-overlay.tsx +5 -0
- package/src/ui-tailwind/chrome-overlay/tw-comment-balloon-layer.tsx +18 -1
- package/src/ui-tailwind/chrome-overlay/tw-page-stack-overlay-layer.tsx +22 -6
- package/src/ui-tailwind/chrome-overlay/tw-revision-margin-bar-layer.tsx +18 -1
- package/src/ui-tailwind/editor-surface/pm-page-break-decorations.ts +98 -3
- package/src/ui-tailwind/editor-surface/pm-schema.ts +50 -4
- package/src/ui-tailwind/editor-surface/pm-state-from-snapshot.ts +6 -0
- package/src/ui-tailwind/editor-surface/scroll-anchor.ts +8 -1
- package/src/ui-tailwind/editor-surface/search-plugin.ts +2 -4
- package/src/ui-tailwind/editor-surface/tw-page-block-view.helpers.ts +114 -0
- package/src/ui-tailwind/editor-surface/tw-page-block-view.tsx +12 -4
- package/src/ui-tailwind/editor-surface/tw-prosemirror-surface.tsx +29 -4
- package/src/ui-tailwind/index.ts +4 -2
- package/src/ui-tailwind/page-chrome-model.ts +5 -7
- package/src/ui-tailwind/page-stack/floating-image-overlay-model.ts +54 -34
- package/src/ui-tailwind/page-stack/tw-endnote-area.tsx +4 -1
- package/src/ui-tailwind/page-stack/tw-footnote-area.tsx +4 -1
- package/src/ui-tailwind/page-stack/tw-page-chrome-entry.tsx +10 -1
- package/src/ui-tailwind/page-stack/tw-page-footer-band.tsx +8 -1
- package/src/ui-tailwind/page-stack/tw-page-header-band.tsx +11 -1
- package/src/ui-tailwind/page-stack/tw-page-stack-chrome-layer.tsx +7 -1
- package/src/ui-tailwind/page-stack/tw-region-block-renderer.tsx +139 -10
- package/src/ui-tailwind/review/comment-markdown-renderer.tsx +1 -1
- package/src/ui-tailwind/review-workspace/page-chrome.ts +4 -4
- package/src/ui-tailwind/review-workspace/use-workspace-side-effects.ts +1 -1
- package/src/ui-tailwind/theme/editor-theme.css +15 -16
- package/src/ui-tailwind/tw-review-workspace.tsx +22 -14
|
@@ -18,6 +18,7 @@ import {
|
|
|
18
18
|
} from "./table-properties-xml.ts";
|
|
19
19
|
import { twip } from "./twip.ts";
|
|
20
20
|
import { escapeXmlAttribute } from "./escape-xml-attribute.ts";
|
|
21
|
+
import { buildFrameXml } from "./serialize-paragraph-formatting.ts";
|
|
21
22
|
|
|
22
23
|
export const WORD_FOOTNOTES_CONTENT_TYPE =
|
|
23
24
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml";
|
|
@@ -222,6 +223,11 @@ function buildParagraphPropertiesXml(paragraph: ParagraphNode): string {
|
|
|
222
223
|
if (paragraph.styleId) {
|
|
223
224
|
parts.push(`<w:pStyle w:val="${escapeXmlAttribute(paragraph.styleId)}"/>`);
|
|
224
225
|
}
|
|
226
|
+
// Coord-04 §1.19.d — direct-paragraph framePr (footnotes path).
|
|
227
|
+
{
|
|
228
|
+
const frameXml = buildFrameXml(paragraph.frameProperties);
|
|
229
|
+
if (frameXml) parts.push(frameXml);
|
|
230
|
+
}
|
|
225
231
|
if (paragraph.alignment) {
|
|
226
232
|
parts.push(`<w:jc w:val="${escapeXmlAttribute(paragraph.alignment)}"/>`);
|
|
227
233
|
}
|
|
@@ -18,6 +18,7 @@ import {
|
|
|
18
18
|
} from "./table-properties-xml.ts";
|
|
19
19
|
import { twip } from "./twip.ts";
|
|
20
20
|
import { escapeXmlAttribute } from "./escape-xml-attribute.ts";
|
|
21
|
+
import { buildFrameXml } from "./serialize-paragraph-formatting.ts";
|
|
21
22
|
|
|
22
23
|
export const WORD_HEADER_CONTENT_TYPE =
|
|
23
24
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
|
|
@@ -186,6 +187,11 @@ function buildParagraphPropertiesXml(paragraph: ParagraphNode): string {
|
|
|
186
187
|
if (paragraph.styleId) {
|
|
187
188
|
parts.push(`<w:pStyle w:val="${escapeXmlAttribute(paragraph.styleId)}"/>`);
|
|
188
189
|
}
|
|
190
|
+
// Coord-04 §1.19.d — direct-paragraph framePr (headers/footers path).
|
|
191
|
+
{
|
|
192
|
+
const frameXml = buildFrameXml(paragraph.frameProperties);
|
|
193
|
+
if (frameXml) parts.push(frameXml);
|
|
194
|
+
}
|
|
189
195
|
if (paragraph.spacing) {
|
|
190
196
|
const s = paragraph.spacing;
|
|
191
197
|
const attrs: string[] = [];
|
|
@@ -284,6 +290,7 @@ function serializeInlineNode(node: InlineNode): string {
|
|
|
284
290
|
throw new Error(`Cannot safely serialize ${node.type} content in header/footer sub-parts.`);
|
|
285
291
|
case "image":
|
|
286
292
|
case "column_break":
|
|
293
|
+
case "page_break":
|
|
287
294
|
case "symbol":
|
|
288
295
|
default:
|
|
289
296
|
throw new Error(`Cannot safely serialize ${node.type} content in header/footer sub-parts.`);
|
|
@@ -22,6 +22,7 @@ import { SCOPE_MARKER_BOOKMARK_PREFIX } from "../ooxml/parse-scope-markers.ts";
|
|
|
22
22
|
import { getOpaqueFragment } from "../../preservation/store.ts";
|
|
23
23
|
import { retainRelationshipsForFragment } from "../../preservation/relationship-retention.ts";
|
|
24
24
|
import { serializeParagraphNumberingProperties } from "./serialize-numbering.ts";
|
|
25
|
+
import { buildFrameXml } from "./serialize-paragraph-formatting.ts";
|
|
25
26
|
import {
|
|
26
27
|
serializeTableCellPropertiesXml,
|
|
27
28
|
serializeTablePropertiesXml,
|
|
@@ -581,6 +582,8 @@ function serializeTableInlineNode(
|
|
|
581
582
|
return "<w:r><w:tab/></w:r>";
|
|
582
583
|
case "column_break":
|
|
583
584
|
return "<w:r><w:br w:type=\"column\"/></w:r>";
|
|
585
|
+
case "page_break":
|
|
586
|
+
return "<w:r><w:br w:type=\"page\"/></w:r>";
|
|
584
587
|
case "hard_break":
|
|
585
588
|
return "<w:r><w:br/></w:r>";
|
|
586
589
|
case "symbol": {
|
|
@@ -714,6 +717,12 @@ function buildParagraphPropertiesXml(paragraph: ParagraphNode): string {
|
|
|
714
717
|
pushOnOffParagraphProperty(children, "keepNext", paragraph.keepNext);
|
|
715
718
|
pushOnOffParagraphProperty(children, "keepLines", paragraph.keepLines);
|
|
716
719
|
pushOnOffParagraphProperty(children, "pageBreakBefore", paragraph.pageBreakBefore);
|
|
720
|
+
// ECMA-376 §17.3.1 canonical slot for framePr: between pageBreakBefore
|
|
721
|
+
// and pBdr. Coord-04 §1.19.d — direct-paragraph path.
|
|
722
|
+
{
|
|
723
|
+
const frameXml = buildFrameXml(paragraph.frameProperties);
|
|
724
|
+
if (frameXml) children.push(frameXml);
|
|
725
|
+
}
|
|
717
726
|
pushOnOffParagraphProperty(children, "widowControl", paragraph.widowControl);
|
|
718
727
|
if (paragraph.outlineLevel !== undefined) {
|
|
719
728
|
children.push(`<w:outlineLvl w:val="${paragraph.outlineLevel}"/>`);
|
|
@@ -1010,6 +1019,17 @@ function serializeInlineNode(
|
|
|
1010
1019
|
boundaries,
|
|
1011
1020
|
};
|
|
1012
1021
|
}
|
|
1022
|
+
case "page_break": {
|
|
1023
|
+
const xml = `<w:r><w:br w:type="page"/></w:r>`;
|
|
1024
|
+
const boundaries = new Map<number, number>();
|
|
1025
|
+
boundaries.set(cursor, xmlOffset);
|
|
1026
|
+
boundaries.set(cursor + 1, xmlOffset + xml.length);
|
|
1027
|
+
return {
|
|
1028
|
+
xml,
|
|
1029
|
+
cursor: cursor + 1,
|
|
1030
|
+
boundaries,
|
|
1031
|
+
};
|
|
1032
|
+
}
|
|
1013
1033
|
case "hard_break": {
|
|
1014
1034
|
const xml = serializeRun({ kind: "hard_break" });
|
|
1015
1035
|
const boundaries = new Map<number, number>();
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
import type {
|
|
8
8
|
CanonicalParagraphFormatting,
|
|
9
|
+
FrameProperties,
|
|
9
10
|
ParagraphBorders,
|
|
10
11
|
ParagraphIndentation,
|
|
11
12
|
ParagraphShading,
|
|
@@ -92,6 +93,34 @@ function buildSpacingXml(s: ParagraphSpacing | undefined): string {
|
|
|
92
93
|
return attrs.length > 0 ? `<w:spacing ${attrs.join(" ")}/>` : "";
|
|
93
94
|
}
|
|
94
95
|
|
|
96
|
+
/**
 * Serializes paragraph frame properties as a `<w:framePr …/>` element.
 *
 * When the parsed source string (`rawXml`) is present it is returned verbatim,
 * which keeps extension attributes (`w14:*`, `w15:*`, `mc:Ignorable`) that the
 * typed field set does not cover. Without `rawXml` the element is rebuilt from
 * the typed fields only, so extension attributes are lost on that path.
 *
 * @param f Frame properties to serialize, or `undefined` when there are none.
 * @returns The `<w:framePr>` element, or `""` when `f` is missing or no
 *   attribute would be emitted.
 */
export function buildFrameXml(f: FrameProperties | undefined): string {
  if (!f) {
    return "";
  }
  if (f.rawXml) {
    return f.rawXml;
  }

  const rendered: string[] = [];

  // Numeric attributes are written whenever the field is defined (0 included).
  const addNumeric = (attr: string, value: number | undefined): void => {
    if (value !== undefined) {
      rendered.push(`w:${attr}="${value}"`);
    }
  };

  // String attributes are written only when non-empty, and are XML-escaped.
  const addEscaped = (attr: string, value: string | undefined): void => {
    if (value) {
      rendered.push(`w:${attr}="${escXml(value)}"`);
    }
  };

  // Emission order is kept exactly as the attributes were originally listed.
  addNumeric("w", f.widthTwips);
  addNumeric("h", f.heightTwips);
  addEscaped("hRule", f.hRule);
  addNumeric("x", f.xTwips);
  addNumeric("y", f.yTwips);
  addEscaped("xAlign", f.xAlign);
  addEscaped("yAlign", f.yAlign);
  addEscaped("hAnchor", f.hAnchor);
  addEscaped("vAnchor", f.vAnchor);
  addEscaped("wrap", f.wrap);
  addNumeric("hSpace", f.hSpaceTwips);
  addNumeric("vSpace", f.vSpaceTwips);
  addEscaped("dropCap", f.dropCap);
  addNumeric("lines", f.lines);
  if (f.anchorLock !== undefined) {
    rendered.push(`w:anchorLock="${f.anchorLock ? "1" : "0"}"`);
  }

  if (rendered.length === 0) {
    return "";
  }
  return `<w:framePr ${rendered.join(" ")}/>`;
}
|
|
123
|
+
|
|
95
124
|
function buildIndentXml(i: ParagraphIndentation | undefined): string {
|
|
96
125
|
if (!i) return "";
|
|
97
126
|
const attrs: string[] = [];
|
|
@@ -114,6 +143,11 @@ export function buildParagraphPropertiesXml(
|
|
|
114
143
|
parts.push(toggleEl("keepLines", pPr.keepLines));
|
|
115
144
|
parts.push(toggleEl("pageBreakBefore", pPr.pageBreakBefore));
|
|
116
145
|
|
|
146
|
+
// 2. framePr (ECMA-376 §17.3.1 canonical order slot, between pageBreakBefore
|
|
147
|
+
// and pBdr). Emit before pBdr so the OpenXML SDK validator accepts a framed
|
|
148
|
+
// paragraph that also carries borders (coord-04 §1.18.d).
|
|
149
|
+
parts.push(buildFrameXml(pPr.frameProperties));
|
|
150
|
+
|
|
117
151
|
// 4. pBdr
|
|
118
152
|
parts.push(buildBordersXml(pPr.borders));
|
|
119
153
|
|
|
@@ -264,6 +264,7 @@ function normalizeParagraph(
|
|
|
264
264
|
...(paragraph.suppressLineNumbers !== undefined
|
|
265
265
|
? { suppressLineNumbers: paragraph.suppressLineNumbers }
|
|
266
266
|
: {}),
|
|
267
|
+
...(paragraph.frameProperties ? { frameProperties: paragraph.frameProperties } : {}),
|
|
267
268
|
// A.7: preserve w14:paraId / w14:textId across import → export so
|
|
268
269
|
// downstream tools that diff documents by paragraph id stay stable.
|
|
269
270
|
...(paragraph.wordExtensionIds
|
|
@@ -481,6 +482,17 @@ function normalizeInlineChildren(
|
|
|
481
482
|
normalized.push({ type: "column_break" });
|
|
482
483
|
state.cursor += 1;
|
|
483
484
|
break;
|
|
485
|
+
case "page_break":
|
|
486
|
+
// coord-04 §1.18.5 follow-up: the fde93da3 cross-layer page_break
|
|
487
|
+
// ship added parse + surface-projection + pagination but missed
|
|
488
|
+
// this normalize-text switch. Without this case, every
|
|
489
|
+
// `<w:br w:type="page"/>` run parsed by L01 falls through and gets
|
|
490
|
+
// silently dropped during canonical assembly — so L04's
|
|
491
|
+
// `hasPageBreak` never fires on real documents. Mirrors the
|
|
492
|
+
// `column_break` branch.
|
|
493
|
+
normalized.push({ type: "page_break" });
|
|
494
|
+
state.cursor += 1;
|
|
495
|
+
break;
|
|
484
496
|
case "chart_preview":
|
|
485
497
|
registerComplexPreviewMedia(state, node);
|
|
486
498
|
normalized.push({
|
|
@@ -704,9 +716,30 @@ function registerComplexPreviewMedia(
|
|
|
704
716
|
function normalizeHyperlink(node: ParsedHyperlinkNode): {
|
|
705
717
|
type: "hyperlink";
|
|
706
718
|
href: string;
|
|
707
|
-
children: Array<
|
|
719
|
+
children: Array<
|
|
720
|
+
| TextNode
|
|
721
|
+
| { type: "hard_break" }
|
|
722
|
+
| { type: "column_break" }
|
|
723
|
+
| { type: "page_break" }
|
|
724
|
+
| { type: "tab" }
|
|
725
|
+
| { type: "symbol"; char: string; font?: string; marks?: TextMark[] }
|
|
726
|
+
>;
|
|
708
727
|
} {
|
|
709
|
-
|
|
728
|
+
// Canonical `HyperlinkNode.children` accepts the full inline-leaf set
|
|
729
|
+
// (TextNode | HardBreakNode | ColumnBreakNode | PageBreakNode | TabNode |
|
|
730
|
+
// SymbolNode). Matching the canonical shape here keeps rare
|
|
731
|
+
// hyperlink-inside-break patterns (a link spanning a column or page
|
|
732
|
+
// break in Word's output) from silently dropping at the normalize step —
|
|
733
|
+
// same class of drop that `coord-04 §1.19.b` fixed one level up in
|
|
734
|
+
// `normalizeInlineChildren`.
|
|
735
|
+
const children: Array<
|
|
736
|
+
| TextNode
|
|
737
|
+
| { type: "hard_break" }
|
|
738
|
+
| { type: "column_break" }
|
|
739
|
+
| { type: "page_break" }
|
|
740
|
+
| { type: "tab" }
|
|
741
|
+
| { type: "symbol"; char: string; font?: string; marks?: TextMark[] }
|
|
742
|
+
> = [];
|
|
710
743
|
|
|
711
744
|
for (const child of node.children) {
|
|
712
745
|
switch (child.type) {
|
|
@@ -732,6 +765,20 @@ function normalizeHyperlink(node: ParsedHyperlinkNode): {
|
|
|
732
765
|
case "hard_break":
|
|
733
766
|
children.push({ type: "hard_break" });
|
|
734
767
|
break;
|
|
768
|
+
case "column_break":
|
|
769
|
+
children.push({ type: "column_break" });
|
|
770
|
+
break;
|
|
771
|
+
case "page_break":
|
|
772
|
+
children.push({ type: "page_break" });
|
|
773
|
+
break;
|
|
774
|
+
case "symbol":
|
|
775
|
+
children.push({
|
|
776
|
+
type: "symbol",
|
|
777
|
+
char: child.char,
|
|
778
|
+
...(child.font ? { font: child.font } : {}),
|
|
779
|
+
...(child.marks && child.marks.length > 0 ? { marks: child.marks } : {}),
|
|
780
|
+
});
|
|
781
|
+
break;
|
|
735
782
|
}
|
|
736
783
|
}
|
|
737
784
|
|
|
@@ -328,6 +328,37 @@ function parseParagraphElement(
|
|
|
328
328
|
activeComplexField = null;
|
|
329
329
|
}
|
|
330
330
|
pushFieldNode(children, child, "simple");
|
|
331
|
+
} else if (name === "sdt") {
|
|
332
|
+
// coord-11 §22 — structured-document-tag wrapping run-level content
|
|
333
|
+
// inside a header/footer paragraph. Word commonly uses these to
|
|
334
|
+
// bundle the page-number field + decorative drawings (e.g. CCEP's
|
|
335
|
+
// footer "Copyright CCEP STRICTLY CONFIDENTIAL" red rectangle +
|
|
336
|
+
// "Page N" label both sit inside one `<w:sdt>` in footer1.xml).
|
|
337
|
+
// Without this case the sdt was silently dropped at the paragraph
|
|
338
|
+
// walker and every run it carried — including WPS shapes bearing
|
|
339
|
+
// the brand-strip text — never reached the canonical tree.
|
|
340
|
+
// Treat `<w:sdtContent>` as a transparent wrapper and re-process
|
|
341
|
+
// its `<w:r>` / `<w:hyperlink>` / bookmark / `<w:fldSimple>` children as if they
|
|
342
|
+
// were direct paragraph children.
|
|
343
|
+
const sdtContent = findChildElementOptional(child, "sdtContent");
|
|
344
|
+
if (sdtContent) {
|
|
345
|
+
for (const grandchild of sdtContent.children) {
|
|
346
|
+
if (grandchild.type !== "element") continue;
|
|
347
|
+
const gname = localName(grandchild.name);
|
|
348
|
+
if (gname === "r") {
|
|
349
|
+
activeComplexField = appendRunNodes(grandchild, children, activeComplexField, sourceXml, opts);
|
|
350
|
+
} else if (gname === "hyperlink") {
|
|
351
|
+
children.push(parseHyperlinkElement(grandchild, opts));
|
|
352
|
+
} else if (gname === "bookmarkStart" || gname === "bookmarkEnd") {
|
|
353
|
+
children.push(parseBookmarkElement(grandchild));
|
|
354
|
+
} else if (gname === "fldSimple") {
|
|
355
|
+
pushFieldNode(children, grandchild, "simple");
|
|
356
|
+
}
|
|
357
|
+
// Nested sdt / other elements ignored — deeper nesting is rare
|
|
358
|
+
// enough that opaque round-trip via the block-level sdt parser
|
|
359
|
+
// handles it if it matters.
|
|
360
|
+
}
|
|
361
|
+
}
|
|
331
362
|
}
|
|
332
363
|
}
|
|
333
364
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type {
|
|
2
2
|
BorderSpec,
|
|
3
3
|
CellShading,
|
|
4
|
+
FrameProperties,
|
|
4
5
|
TextMark,
|
|
5
6
|
ParagraphBorders,
|
|
6
7
|
ParagraphShading,
|
|
@@ -39,6 +40,7 @@ import { parseComplexContentXml, type ChartPartLookup } from "./parse-complex-co
|
|
|
39
40
|
import { parseShapeXml, parseVmlXml } from "./parse-shapes.ts";
|
|
40
41
|
import { parseObject } from "./parse-object.ts";
|
|
41
42
|
import { parseDrawingFrame } from "./parse-drawing.ts";
|
|
43
|
+
import { readFrameProperties } from "./parse-paragraph-formatting.ts";
|
|
42
44
|
import { classifyFieldInstruction } from "./parse-fields.ts";
|
|
43
45
|
import { parseFFDataFromFldChar } from "./parse-ffdata.ts";
|
|
44
46
|
import { resolveHighlightColor } from "./highlight-colors.ts";
|
|
@@ -217,6 +219,41 @@ function captureGrabBagFromContainer(
|
|
|
217
219
|
/** Value returned by `parseMainDocumentXml`. */
export interface ParsedMainDocument {
  blocks: ParsedBlockNode[];
  finalSectionProperties?: SectionProperties;
  /**
   * How many cosmetic markers were stripped during the parse, keyed by local
   * element name (for example `lastRenderedPageBreak`). See
   * {@link ParseMainDocumentOptions.stripCosmeticMarkers}.
   *
   * The property is absent when nothing was stripped.
   */
  skippedCosmeticMarkerCounts?: Readonly<Record<string, number>>;
}
|
|
230
|
+
|
|
231
|
+
/**
 * Local element names of the cosmetic markers: elements Word re-inserts on
 * reopen and that carry no contract semantics. Dropping them at parse time
 * unblocks `replaceText` on ranges that would otherwise cross them as
 * `opaque_inline` boundaries. See `docs/architecture/cosmetic-marker-strip.md`.
 *
 * This is the Phase 1 set; bookmark-pair stripping (with reference scan) is
 * Phase 2.
 */
export const COSMETIC_MARKER_ELEMENT_NAMES: ReadonlySet<string> = new Set<string>(
  ["lastRenderedPageBreak", "proofErr", "noBreakHyphen"],
);
|
|
245
|
+
|
|
246
|
+
/** Optional switches accepted by `parseMainDocumentXml`. */
export interface ParseMainDocumentOptions {
  /**
   * Controls cosmetic-marker stripping. Defaults to `true`.
   *
   * - `true` / omitted: `<w:lastRenderedPageBreak/>`, `<w:proofErr/>` and
   *   `<w:noBreakHyphen/>` are dropped during the parse walk rather than
   *   emitted as `opaque_inline` nodes, and the per-element counts are
   *   reported on {@link ParsedMainDocument.skippedCosmeticMarkerCounts}.
   * - `false`: the pre-strip behavior is preserved exactly, i.e. every
   *   cosmetic marker becomes an `opaque_inline` carrying its source XML.
   */
  stripCosmeticMarkers?: boolean;
}
|
|
221
258
|
|
|
222
259
|
export type ParsedBlockNode =
|
|
@@ -256,6 +293,15 @@ export interface ParsedParagraphNode {
|
|
|
256
293
|
bidi?: boolean;
|
|
257
294
|
suppressLineNumbers?: boolean;
|
|
258
295
|
cnfStyle?: string;
|
|
296
|
+
/**
|
|
297
|
+
* `<w:framePr>` declared directly on the paragraph's own `<w:pPr>`.
|
|
298
|
+
* Coord-04 §1.19.d step 2 (inline path). The style-cascade path
|
|
299
|
+
* flows through `CanonicalParagraphFormatting.frameProperties` on
|
|
300
|
+
* the style side; this slot captures the direct-override path so
|
|
301
|
+
* L02 `ParagraphNode.frameProperties` (added 2026-04-24 `4b3ea0b2`)
|
|
302
|
+
* can reach its canonical shape.
|
|
303
|
+
*/
|
|
304
|
+
frameProperties?: FrameProperties;
|
|
259
305
|
/** A.7: preserved w14 extension ids (paraId/textId). */
|
|
260
306
|
wordExtensionIds?: {
|
|
261
307
|
paraId?: string;
|
|
@@ -271,6 +317,7 @@ export type ParsedInlineNode =
|
|
|
271
317
|
| ParsedTextNode
|
|
272
318
|
| ParsedBreakNode
|
|
273
319
|
| ParsedColumnBreakNode
|
|
320
|
+
| ParsedPageBreakNode
|
|
274
321
|
| ParsedTabNode
|
|
275
322
|
| ParsedSymbolNode
|
|
276
323
|
| ParsedImageNode
|
|
@@ -306,6 +353,10 @@ export interface ParsedColumnBreakNode {
|
|
|
306
353
|
type: "column_break";
|
|
307
354
|
}
|
|
308
355
|
|
|
356
|
+
/** Parsed inline node emitted for a `<w:br>` whose type is `page`. */
export interface ParsedPageBreakNode {
  type: "page_break";
}
|
|
359
|
+
|
|
309
360
|
export interface ParsedTabNode {
|
|
310
361
|
type: "tab";
|
|
311
362
|
}
|
|
@@ -350,7 +401,7 @@ export interface ParsedImageNode {
|
|
|
350
401
|
export interface ParsedHyperlinkNode {
|
|
351
402
|
type: "hyperlink";
|
|
352
403
|
href: string;
|
|
353
|
-
children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode>;
|
|
404
|
+
children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode>;
|
|
354
405
|
rawXml: string;
|
|
355
406
|
}
|
|
356
407
|
|
|
@@ -606,7 +657,7 @@ interface XmlTextNode {
|
|
|
606
657
|
type XmlNode = XmlElementNode | XmlTextNode;
|
|
607
658
|
|
|
608
659
|
interface RunParseResult {
|
|
609
|
-
nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode>;
|
|
660
|
+
nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode>;
|
|
610
661
|
supported: boolean;
|
|
611
662
|
}
|
|
612
663
|
|
|
@@ -651,24 +702,61 @@ export function setActiveParseTelemetryBus(bus: ParseTelemetryBus | undefined):
|
|
|
651
702
|
activeParseTelemetryBus = bus;
|
|
652
703
|
}
|
|
653
704
|
|
|
705
|
+
/**
 * Per-parse state for cosmetic-marker stripping.
 *
 * `parseMainDocumentXml` installs one of these in
 * `activeCosmeticStripContext` for the duration of a single parse, and the
 * four emission sites (`parseBodyChild`, `parseRun`, `parseRunContentOnly`,
 * `parseRevisionContainer`) read it from there. A module variable is used so
 * the flag does not have to be threaded through roughly 15 intermediate
 * function signatures; the entry point's try/finally keeps it from leaking
 * across calls.
 *
 * The re-entrancy invariant is the same as for `activeChartPartLookup`.
 */
interface CosmeticStripContext {
  /** Whether cosmetic markers are dropped instead of emitted. */
  readonly strip: boolean;
  /** Number of stripped markers, keyed by local element name. */
  readonly counts: Record<string, number>;
}

/** Context of the parse currently in progress; `null` outside a parse. */
let activeCosmeticStripContext: CosmeticStripContext | null = null;
|
|
721
|
+
|
|
722
|
+
/**
 * Adds one to the stripped-marker tally for `tag` in the active strip context.
 * Does nothing when no strip context is active.
 *
 * @param tag Local element name of the marker that was stripped.
 */
function noteStrippedCosmeticMarker(tag: string): void {
  const context = activeCosmeticStripContext;
  if (!context) {
    return;
  }
  const seenSoFar = context.counts[tag] ?? 0;
  context.counts[tag] = seenSoFar + 1;
}
|
|
727
|
+
|
|
728
|
+
/**
 * Reports whether cosmetic markers should be dropped right now.
 *
 * @returns `true` only when a strip context is active and its `strip` flag is
 *   `true`; `false` otherwise.
 */
function shouldStripCosmeticMarker(): boolean {
  const context = activeCosmeticStripContext;
  return context != null && context.strip === true;
}
|
|
731
|
+
|
|
654
732
|
/**
 * Parses a main-document XML part.
 *
 * For the duration of the call this installs two pieces of module-level
 * state read by the parse walk: the chart part lookup and the
 * cosmetic-marker strip context. Both are installed inside the `try` and the
 * values that were active on entry are put back in `finally`, so nothing
 * leaks into later calls even if the telemetry probe or the parse throws, and
 * a nested call cannot clobber an enclosing call's state.
 *
 * @param xml Source XML of the part.
 * @param relationships Relationships passed through to the inner parser.
 * @param mediaParts Media parts passed through to the inner parser.
 * @param sourcePartPath Path of the part being parsed; also reported in the
 *   parse telemetry summary.
 * @param chartPartLookup Chart part lookup made active for this parse.
 * @param parseOptions Parse switches; cosmetic-marker stripping is on unless
 *   `stripCosmeticMarkers` is exactly `false`.
 * @returns The parsed document. `skippedCosmeticMarkerCounts` is set (as a
 *   frozen copy) only when at least one marker was stripped.
 */
export function parseMainDocumentXml(
  xml: string,
  relationships: readonly OpcRelationship[] = [],
  mediaParts: ReadonlyMap<string, InlineMediaPart> = new Map(),
  sourcePartPath = "/word/document.xml",
  chartPartLookup?: ChartPartLookup,
  parseOptions: ParseMainDocumentOptions = {},
): ParsedMainDocument {
  // Remember what was active on entry so `finally` can put it back.
  const previousChartPartLookup = activeChartPartLookup;
  const previousStripContext = activeCosmeticStripContext;

  const stripContext: CosmeticStripContext = {
    strip: parseOptions.stripCosmeticMarkers !== false,
    // Null-prototype object: marker names can never collide with inherited keys.
    counts: Object.create(null) as Record<string, number>,
  };

  try {
    activeChartPartLookup = chartPartLookup;
    activeCosmeticStripContext = stripContext;

    const bus = activeParseTelemetryBus;
    const started = bus?.isEnabled("parse") ? performanceNow() : 0;

    const result = parseMainDocumentXmlInner(xml, relationships, mediaParts, sourcePartPath);
    if (Object.keys(stripContext.counts).length > 0) {
      result.skippedCosmeticMarkerCounts = Object.freeze({ ...stripContext.counts });
    }
    if (bus?.isEnabled("parse")) {
      emitParseSummary(bus, result, sourcePartPath, performanceNow() - started);
    }
    return result;
  } finally {
    activeChartPartLookup = previousChartPartLookup;
    activeCosmeticStripContext = previousStripContext;
  }
}
|
|
674
762
|
|
|
@@ -699,6 +787,13 @@ function emitParseSummary(
|
|
|
699
787
|
blockCount: result.blocks.length,
|
|
700
788
|
blockKindCounts: counts,
|
|
701
789
|
ms,
|
|
790
|
+
// Strip counts are surfaced here (telemetry-only) rather than as a
|
|
791
|
+
// warning on `diagnostics.warnings` — the markers carry no
|
|
792
|
+
// contract semantics and surfacing them in the user-visible
|
|
793
|
+
// warnings feed would be noise. Available to debug UX / tests via
|
|
794
|
+
// the `parse` channel; absent when the feature is disabled or no
|
|
795
|
+
// markers were stripped.
|
|
796
|
+
skippedCosmeticMarkerCounts: result.skippedCosmeticMarkerCounts,
|
|
702
797
|
},
|
|
703
798
|
});
|
|
704
799
|
}
|
|
@@ -999,6 +1094,7 @@ function parseBodyChild(
|
|
|
999
1094
|
let bidi: ParsedParagraphNode["bidi"];
|
|
1000
1095
|
let suppressLineNumbers: ParsedParagraphNode["suppressLineNumbers"];
|
|
1001
1096
|
let cnfStyle: ParsedParagraphNode["cnfStyle"];
|
|
1097
|
+
let frameProperties: ParsedParagraphNode["frameProperties"];
|
|
1002
1098
|
let sectionProperties: SectionProperties | undefined;
|
|
1003
1099
|
let sectionPropertiesXml: string | undefined;
|
|
1004
1100
|
let paragraphSupported = true;
|
|
@@ -1045,6 +1141,12 @@ function parseBodyChild(
|
|
|
1045
1141
|
bidi = readOnOffParagraphProperty(child, "bidi");
|
|
1046
1142
|
suppressLineNumbers = readOnOffParagraphProperty(child, "suppressLineNumbers");
|
|
1047
1143
|
cnfStyle = readParagraphCnfStyle(child);
|
|
1144
|
+
{
|
|
1145
|
+
const framePrNode = child.children.find(
|
|
1146
|
+
(c): c is XmlElementNode => c.type === "element" && localName(c.name) === "framePr",
|
|
1147
|
+
);
|
|
1148
|
+
if (framePrNode) frameProperties = readFrameProperties(framePrNode);
|
|
1149
|
+
}
|
|
1048
1150
|
sectionProperties = readSectionPropertiesFromPPr(child);
|
|
1049
1151
|
sectionPropertiesXml = readSectionPropertiesXmlFromPPr(child, sourceXml);
|
|
1050
1152
|
paragraphSupported = paragraphSupported && supportsParagraphProperties(child);
|
|
@@ -1143,6 +1245,10 @@ function parseBodyChild(
|
|
|
1143
1245
|
flushActiveComplexField(children, () => {
|
|
1144
1246
|
activeComplexField = null;
|
|
1145
1247
|
}, activeComplexField);
|
|
1248
|
+
if (shouldStripCosmeticMarker()) {
|
|
1249
|
+
noteStrippedCosmeticMarker("proofErr");
|
|
1250
|
+
break;
|
|
1251
|
+
}
|
|
1146
1252
|
children.push({
|
|
1147
1253
|
type: "opaque_inline",
|
|
1148
1254
|
rawXml: sourceXml.slice(child.start, child.end),
|
|
@@ -1230,6 +1336,7 @@ function parseBodyChild(
|
|
|
1230
1336
|
...(bidi !== undefined ? { bidi } : {}),
|
|
1231
1337
|
...(suppressLineNumbers !== undefined ? { suppressLineNumbers } : {}),
|
|
1232
1338
|
...(cnfStyle ? { cnfStyle } : {}),
|
|
1339
|
+
...(frameProperties ? { frameProperties } : {}),
|
|
1233
1340
|
...(wordExtensionIds ? { wordExtensionIds } : {}),
|
|
1234
1341
|
...(sectionProperties ? { sectionProperties } : {}),
|
|
1235
1342
|
...(sectionPropertiesXml ? { sectionPropertiesXml } : {}),
|
|
@@ -2390,7 +2497,9 @@ function parseRun(
|
|
|
2390
2497
|
break;
|
|
2391
2498
|
}
|
|
2392
2499
|
case "br":
|
|
2393
|
-
if (
|
|
2500
|
+
if (isPageBreak(child)) {
|
|
2501
|
+
result.push({ type: "page_break" });
|
|
2502
|
+
} else if (isColumnBreak(child)) {
|
|
2394
2503
|
result.push({ type: "column_break" });
|
|
2395
2504
|
} else if (isSimpleLineBreak(child)) {
|
|
2396
2505
|
result.push({ type: "hard_break" });
|
|
@@ -2577,6 +2686,11 @@ function parseRun(
|
|
|
2577
2686
|
}
|
|
2578
2687
|
case "lastRenderedPageBreak":
|
|
2579
2688
|
case "proofErr":
|
|
2689
|
+
case "noBreakHyphen":
|
|
2690
|
+
if (shouldStripCosmeticMarker()) {
|
|
2691
|
+
noteStrippedCosmeticMarker(localName(child.name));
|
|
2692
|
+
break;
|
|
2693
|
+
}
|
|
2580
2694
|
result.push({
|
|
2581
2695
|
type: "opaque_inline",
|
|
2582
2696
|
rawXml: sourceXml.slice(child.start, child.end),
|
|
@@ -2650,12 +2764,23 @@ function parseRevisionContainer(
|
|
|
2650
2764
|
result.push(hyperlink);
|
|
2651
2765
|
break;
|
|
2652
2766
|
}
|
|
2767
|
+
case "proofErr":
|
|
2768
|
+
case "lastRenderedPageBreak":
|
|
2769
|
+
case "noBreakHyphen":
|
|
2770
|
+
if (shouldStripCosmeticMarker()) {
|
|
2771
|
+
noteStrippedCosmeticMarker(localName(child.name));
|
|
2772
|
+
break;
|
|
2773
|
+
}
|
|
2774
|
+
return [
|
|
2775
|
+
{
|
|
2776
|
+
type: "opaque_inline",
|
|
2777
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
2778
|
+
},
|
|
2779
|
+
];
|
|
2653
2780
|
case "commentRangeStart":
|
|
2654
2781
|
case "commentRangeEnd":
|
|
2655
2782
|
case "bookmarkStart":
|
|
2656
2783
|
case "bookmarkEnd":
|
|
2657
|
-
case "proofErr":
|
|
2658
|
-
case "lastRenderedPageBreak":
|
|
2659
2784
|
return [
|
|
2660
2785
|
{
|
|
2661
2786
|
type: "opaque_inline",
|
|
@@ -2714,7 +2839,7 @@ function parseHyperlink(
|
|
|
2714
2839
|
};
|
|
2715
2840
|
}
|
|
2716
2841
|
|
|
2717
|
-
const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
|
|
2842
|
+
const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
|
|
2718
2843
|
|
|
2719
2844
|
for (const child of node.children) {
|
|
2720
2845
|
if (child.type !== "element") {
|
|
@@ -2764,7 +2889,7 @@ function parseRunContentOnly(
|
|
|
2764
2889
|
}
|
|
2765
2890
|
|
|
2766
2891
|
const marks = marksResult.marks;
|
|
2767
|
-
const nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
|
|
2892
|
+
const nodes: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
|
|
2768
2893
|
|
|
2769
2894
|
for (const child of node.children) {
|
|
2770
2895
|
if (child.type !== "element") {
|
|
@@ -2812,6 +2937,10 @@ function parseRunContentOnly(
|
|
|
2812
2937
|
break;
|
|
2813
2938
|
}
|
|
2814
2939
|
case "br":
|
|
2940
|
+
if (isPageBreak(child)) {
|
|
2941
|
+
nodes.push({ type: "page_break" });
|
|
2942
|
+
break;
|
|
2943
|
+
}
|
|
2815
2944
|
if (isColumnBreak(child)) {
|
|
2816
2945
|
nodes.push({ type: "column_break" });
|
|
2817
2946
|
break;
|
|
@@ -2824,10 +2953,17 @@ function parseRunContentOnly(
|
|
|
2824
2953
|
case "commentReference":
|
|
2825
2954
|
case "lastRenderedPageBreak":
|
|
2826
2955
|
case "proofErr":
|
|
2956
|
+
case "noBreakHyphen": {
|
|
2957
|
+
const tag = localName(child.name);
|
|
2958
|
+
if (shouldStripCosmeticMarker() && tag !== "commentReference") {
|
|
2959
|
+
noteStrippedCosmeticMarker(tag);
|
|
2960
|
+
break;
|
|
2961
|
+
}
|
|
2827
2962
|
if (options.preserveUnsupportedReviewMarkup) {
|
|
2828
2963
|
return { nodes: [], supported: false };
|
|
2829
2964
|
}
|
|
2830
2965
|
break;
|
|
2966
|
+
}
|
|
2831
2967
|
default:
|
|
2832
2968
|
return { nodes: [], supported: false };
|
|
2833
2969
|
}
|
|
@@ -3149,6 +3285,11 @@ function isColumnBreak(node: XmlElementNode): boolean {
|
|
|
3149
3285
|
return value === "column";
|
|
3150
3286
|
}
|
|
3151
3287
|
|
|
3288
|
+
/**
 * Tells whether an element's break type is `page`.
 *
 * The type is read from the `w:type` attribute, falling back to `type`, and
 * compared case-insensitively. A missing attribute is treated as not a page
 * break.
 *
 * @param node Element to inspect.
 * @returns `true` when the break type equals `page`.
 */
function isPageBreak(node: XmlElementNode): boolean {
  const breakType = node.attributes["w:type"] ?? node.attributes.type ?? "";
  return breakType.toLowerCase() === "page";
}
|
|
3292
|
+
|
|
3152
3293
|
function findChildElement(node: XmlElementNode, childLocalName: string): XmlElementNode {
|
|
3153
3294
|
const child = node.children.find(
|
|
3154
3295
|
(entry): entry is XmlElementNode =>
|