@beyondwork/docx-react-component 1.0.76 → 1.0.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/api/v3/ai/resolve.ts +104 -4
- package/src/io/ooxml/parse-bookmark-references.ts +123 -0
- package/src/io/ooxml/parse-footnotes.ts +26 -3
- package/src/io/ooxml/parse-headers-footers.ts +96 -1
- package/src/io/ooxml/parse-main-document.ts +256 -4
- package/src/io/ooxml/parse-shapes.ts +58 -4
- package/src/io/ooxml/table-opaque-preservation.ts +70 -5
- package/src/model/canonical-document.ts +45 -8
- package/src/runtime/scopes/action-validation.ts +39 -12
- package/src/runtime/scopes/index.ts +3 -0
- package/src/runtime/scopes/resolve-reference.ts +99 -43
- package/src/session/import/loader-types.ts +26 -0
- package/src/session/import/loader.ts +12 -2
- package/src/ui-tailwind/chrome-overlay/tw-chrome-overlay.tsx +12 -0
- package/src/ui-tailwind/editor-surface/perf-probe.ts +3 -0
- package/src/ui-tailwind/editor-surface/pm-decorations.ts +44 -0
- package/src/ui-tailwind/editor-surface/preserve-position.ts +28 -9
- package/src/ui-tailwind/editor-surface/tw-prosemirror-surface.tsx +13 -13
- package/src/ui-tailwind/page-stack/tw-active-band-ribbon.tsx +229 -0
- package/src/ui-tailwind/page-stack/tw-page-chrome-entry.tsx +15 -1
- package/src/ui-tailwind/page-stack/tw-page-footer-band.tsx +18 -0
- package/src/ui-tailwind/page-stack/tw-page-header-band.tsx +20 -0
- package/src/ui-tailwind/page-stack/tw-page-stack-chrome-layer.tsx +10 -0
- package/src/ui-tailwind/tw-review-workspace.tsx +56 -6
|
@@ -31,6 +31,11 @@ import type {
|
|
|
31
31
|
} from "../../model/canonical-document.ts";
|
|
32
32
|
import type { OpcRelationship } from "./part-manifest.ts";
|
|
33
33
|
import { SCOPE_MARKER_BOOKMARK_PREFIX } from "./parse-scope-markers.ts";
|
|
34
|
+
import {
|
|
35
|
+
scanBookmarkReferences,
|
|
36
|
+
isRetainedBookmarkName,
|
|
37
|
+
type BookmarkReferenceScan,
|
|
38
|
+
} from "./parse-bookmark-references.ts";
|
|
34
39
|
import {
|
|
35
40
|
parseInlineMediaXml,
|
|
36
41
|
type InlineMediaPart,
|
|
@@ -253,8 +258,39 @@ export interface ParseMainDocumentOptions {
|
|
|
253
258
|
*
|
|
254
259
|
* Set to `false` to preserve the pre-strip behavior exactly — every
|
|
255
260
|
* cosmetic marker becomes an `opaque_inline` with its source XML.
|
|
261
|
+
*
|
|
262
|
+
* **Phase 2 (Slice B):** the same flag also gates orphan-bookmark
|
|
263
|
+
* stripping. When enabled, a pre-pass scans the source XML for
|
|
264
|
+
* bookmark references (`<w:hyperlink w:anchor>` /
|
|
265
|
+
* `<w:instrText>REF/PAGEREF/NOTEREF/TOC</w:instrText>`); bookmarks
|
|
266
|
+
* whose name is NOT referenced AND not in
|
|
267
|
+
* {@link retainedBookmarkNames} drop at the four emission sites.
|
|
268
|
+
* Aggregate counts surface alongside the cosmetic-marker counts on
|
|
269
|
+
* `skippedCosmeticMarkerCounts.bookmarkStart` /
|
|
270
|
+
* `skippedCosmeticMarkerCounts.bookmarkEnd`.
|
|
256
271
|
*/
|
|
257
272
|
stripCosmeticMarkers?: boolean;
|
|
273
|
+
/**
|
|
274
|
+
* Phase 2 bookmark-strip allowlist. When `stripCosmeticMarkers` is
|
|
275
|
+
* `true`, the parser's reference scan retains bookmarks whose name
|
|
276
|
+
* is referenced by a `<w:hyperlink>` / `<w:instrText>` AND any name
|
|
277
|
+
* listed here. Use this when the host depends on a stable host-
|
|
278
|
+
* authored bookmark name (e.g. `placeholder_party_name`,
|
|
279
|
+
* `signature_block_2`) that the automatic scan can't infer is
|
|
280
|
+
* load-bearing.
|
|
281
|
+
*
|
|
282
|
+
* Default: `[]`. Always-retained regardless of this list:
|
|
283
|
+
* - `_Toc*` (when any TOC field exists)
|
|
284
|
+
* - any name explicitly cited by a `<w:hyperlink w:anchor>` or
|
|
285
|
+
* `<w:instrText>` field instruction
|
|
286
|
+
* - `bw:scope:*` (workflow scope markers — converted to
|
|
287
|
+
* `scope_marker_*` by `rewriteScopeMarkerBookmarks` BEFORE the
|
|
288
|
+
* strip runs; listed here as defense-in-depth)
|
|
289
|
+
* - everything (defensive blanket-retain) when the document
|
|
290
|
+
* contains a `<w:dataBinding>` element whose xpath could
|
|
291
|
+
* reference bookmarks via paths we cannot statically analyze
|
|
292
|
+
*/
|
|
293
|
+
retainedBookmarkNames?: ReadonlyArray<string>;
|
|
258
294
|
}
|
|
259
295
|
|
|
260
296
|
export type ParsedBlockNode =
|
|
@@ -717,6 +753,16 @@ export function setActiveParseTelemetryBus(bus: ParseTelemetryBus | undefined):
|
|
|
717
753
|
interface CosmeticStripContext {
|
|
718
754
|
readonly strip: boolean;
|
|
719
755
|
readonly counts: Record<string, number>;
|
|
756
|
+
/** Phase 2 — bookmark-reference scan. `null` when strip is off OR
|
|
757
|
+
* when the entry point did not run the scan (e.g. an internal call
|
|
758
|
+
* with no XML available). */
|
|
759
|
+
readonly bookmarkScan: BookmarkReferenceScan | null;
|
|
760
|
+
/** Phase 2 — per-parse set of bookmark IDs that the bookmarkStart
|
|
761
|
+
* decision marked as stripped. The bookmarkEnd site keys off this
|
|
762
|
+
* set (since `<w:bookmarkEnd>` carries only the id, not the name).
|
|
763
|
+
* Bookmarks can span paragraphs, so this MUST be request-scoped, not
|
|
764
|
+
* per-paragraph. */
|
|
765
|
+
readonly strippedBookmarkIds: Set<string>;
|
|
720
766
|
}
|
|
721
767
|
let activeCosmeticStripContext: CosmeticStripContext | null = null;
|
|
722
768
|
|
|
@@ -730,6 +776,47 @@ function shouldStripCosmeticMarker(): boolean {
|
|
|
730
776
|
return activeCosmeticStripContext?.strip === true;
|
|
731
777
|
}
|
|
732
778
|
|
|
779
|
+
/**
|
|
780
|
+
* Phase 2 — true when the bookmark with this `name` should be RETAINED
|
|
781
|
+
* (its name is referenced by a hyperlink/field in the doc, on the
|
|
782
|
+
* caller-supplied allowlist, a `bw:scope:*` prefix, or covered by the
|
|
783
|
+
* `_Toc*` blanket-retain when a TOC field exists). When the bookmark
|
|
784
|
+
* has no name (`""`), retain — only the body-walker's `bkId` path
|
|
785
|
+
* reaches this helper, and unnamed bookmarks have no consumer that
|
|
786
|
+
* could reference them by name so retention is a no-op.
|
|
787
|
+
*/
|
|
788
|
+
function shouldRetainBookmark(name: string): boolean {
|
|
789
|
+
const ctx = activeCosmeticStripContext;
|
|
790
|
+
if (!ctx || !ctx.strip) return true; // strip disabled — always retain
|
|
791
|
+
if (!ctx.bookmarkScan) return true; // scan absent — defensive retain
|
|
792
|
+
if (name === "") return true; // no name → no consumer can reference
|
|
793
|
+
return isRetainedBookmarkName(name, ctx.bookmarkScan);
|
|
794
|
+
}
|
|
795
|
+
|
|
796
|
+
/**
|
|
797
|
+
* Phase 2 — record that the bookmark with this `bookmarkId` was
|
|
798
|
+
* stripped at its `bookmarkStart` site so the matching `bookmarkEnd`
|
|
799
|
+
* (which carries only the id, not the name) can find the decision.
|
|
800
|
+
*/
|
|
801
|
+
function noteStrippedBookmarkId(bookmarkId: string): void {
|
|
802
|
+
if (!activeCosmeticStripContext) return;
|
|
803
|
+
activeCosmeticStripContext.strippedBookmarkIds.add(bookmarkId);
|
|
804
|
+
}
|
|
805
|
+
|
|
806
|
+
/**
|
|
807
|
+
* Phase 2 — true when this `bookmarkEnd` corresponds to a `bookmarkStart`
|
|
808
|
+
* that was previously stripped. When the id is unknown (cross-fragment
|
|
809
|
+
* end without a matching start in the same parse) the answer is `false`
|
|
810
|
+
* — we keep the end conservatively rather than risk dropping a load-
|
|
811
|
+
* bearing pair we didn't see start.
|
|
812
|
+
*/
|
|
813
|
+
function shouldStripBookmarkEndId(bookmarkId: string): boolean {
|
|
814
|
+
if (!activeCosmeticStripContext) return false;
|
|
815
|
+
if (!activeCosmeticStripContext.strip) return false;
|
|
816
|
+
if (bookmarkId === "") return false;
|
|
817
|
+
return activeCosmeticStripContext.strippedBookmarkIds.has(bookmarkId);
|
|
818
|
+
}
|
|
819
|
+
|
|
733
820
|
export function parseMainDocumentXml(
|
|
734
821
|
xml: string,
|
|
735
822
|
relationships: readonly OpcRelationship[] = [],
|
|
@@ -739,9 +826,14 @@ export function parseMainDocumentXml(
|
|
|
739
826
|
parseOptions: ParseMainDocumentOptions = {},
|
|
740
827
|
): ParsedMainDocument {
|
|
741
828
|
activeChartPartLookup = chartPartLookup;
|
|
829
|
+
const stripEnabled = parseOptions.stripCosmeticMarkers !== false;
|
|
742
830
|
const stripContext: CosmeticStripContext = {
|
|
743
|
-
strip:
|
|
831
|
+
strip: stripEnabled,
|
|
744
832
|
counts: Object.create(null) as Record<string, number>,
|
|
833
|
+
bookmarkScan: stripEnabled
|
|
834
|
+
? scanBookmarkReferences(xml, parseOptions.retainedBookmarkNames ?? [])
|
|
835
|
+
: null,
|
|
836
|
+
strippedBookmarkIds: new Set<string>(),
|
|
745
837
|
};
|
|
746
838
|
activeCosmeticStripContext = stripContext;
|
|
747
839
|
const bus = activeParseTelemetryBus;
|
|
@@ -848,7 +940,13 @@ function parseMainDocumentXmlInner(
|
|
|
848
940
|
|
|
849
941
|
const allBlocks = bodyElement.children
|
|
850
942
|
.filter((node): node is XmlElementNode => node.type === "element")
|
|
851
|
-
.map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath))
|
|
943
|
+
.map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath))
|
|
944
|
+
// Phase 2 (Slice B) — body-level bookmarkStart/End that the strip
|
|
945
|
+
// dropped come back from `parseBodyChild` as `opaque_block` with
|
|
946
|
+
// empty `rawXml` (sentinel — see `parseBodyChild` body-level
|
|
947
|
+
// bookmark cases). Filter them out here so downstream consumers
|
|
948
|
+
// don't see empty blocks.
|
|
949
|
+
.filter((block) => !(block.type === "opaque_block" && block.rawXml === ""));
|
|
852
950
|
|
|
853
951
|
// The last body-level sectPr is the final section properties (not an intermediate section break).
|
|
854
952
|
// Extract it from the blocks list and store it separately.
|
|
@@ -1071,6 +1169,48 @@ function parseBodyChild(
|
|
|
1071
1169
|
return parseSectionBreakElement(node, sourceXml);
|
|
1072
1170
|
}
|
|
1073
1171
|
|
|
1172
|
+
// Body-level <w:bookmarkStart> / <w:bookmarkEnd> — bookmarks that
|
|
1173
|
+
// span across paragraph boundaries land directly under <w:body>
|
|
1174
|
+
// rather than inside a <w:p>. Pre-Phase-2 these flowed through the
|
|
1175
|
+
// `nodeType !== "p"` default below and became opaque_block (the
|
|
1176
|
+
// dominant 184-of-185 opaque source on EU IT Services Agreement
|
|
1177
|
+
// per `enumerate-opaque-fragments`). Phase 2 strip applies the same
|
|
1178
|
+
// RETAIN-vs-STRIP decision here as at sites 1+2 inside paragraphs.
|
|
1179
|
+
if (nodeType === "bookmarkStart") {
|
|
1180
|
+
const bkId = node.attributes["w:id"] ?? node.attributes.id ?? "";
|
|
1181
|
+
const bkName = node.attributes["w:name"] ?? node.attributes.name ?? "";
|
|
1182
|
+
if (
|
|
1183
|
+
shouldStripCosmeticMarker() &&
|
|
1184
|
+
bkId &&
|
|
1185
|
+
bkName &&
|
|
1186
|
+
!shouldRetainBookmark(bkName)
|
|
1187
|
+
) {
|
|
1188
|
+
noteStrippedCosmeticMarker("bookmarkStart");
|
|
1189
|
+
noteStrippedBookmarkId(bkId);
|
|
1190
|
+
// Returning an empty paragraph block flushes cleanly through the
|
|
1191
|
+
// body-walker's collected blocks. The block is a no-op zero-content
|
|
1192
|
+
// paragraph that downstream consumers ignore. (Returning `null`
|
|
1193
|
+
// would change the body-walker's signature; the block-shaped
|
|
1194
|
+
// no-op preserves the existing iteration contract.)
|
|
1195
|
+
return { type: "opaque_block", rawXml: "" };
|
|
1196
|
+
}
|
|
1197
|
+
return {
|
|
1198
|
+
type: "opaque_block",
|
|
1199
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
1200
|
+
};
|
|
1201
|
+
}
|
|
1202
|
+
if (nodeType === "bookmarkEnd") {
|
|
1203
|
+
const bkEndId = node.attributes["w:id"] ?? node.attributes.id ?? "";
|
|
1204
|
+
if (shouldStripBookmarkEndId(bkEndId)) {
|
|
1205
|
+
noteStrippedCosmeticMarker("bookmarkEnd");
|
|
1206
|
+
return { type: "opaque_block", rawXml: "" };
|
|
1207
|
+
}
|
|
1208
|
+
return {
|
|
1209
|
+
type: "opaque_block",
|
|
1210
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
1211
|
+
};
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1074
1214
|
if (nodeType !== "p") {
|
|
1075
1215
|
return {
|
|
1076
1216
|
type: "opaque_block",
|
|
@@ -1186,6 +1326,18 @@ function parseBodyChild(
|
|
|
1186
1326
|
const bkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
1187
1327
|
const bkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
|
|
1188
1328
|
if (bkId) {
|
|
1329
|
+
// Phase 2 (Slice B) — orphan-bookmark strip. When the strip
|
|
1330
|
+
// is on AND the name is not load-bearing per the reference
|
|
1331
|
+
// scan, drop the bookmark. The bookmarkEnd site finds the
|
|
1332
|
+
// matching id via `shouldStripBookmarkEndId`.
|
|
1333
|
+
if (bkName && !shouldRetainBookmark(bkName)) {
|
|
1334
|
+
noteStrippedCosmeticMarker("bookmarkStart");
|
|
1335
|
+
noteStrippedBookmarkId(bkId);
|
|
1336
|
+
flushActiveComplexField(children, () => {
|
|
1337
|
+
activeComplexField = null;
|
|
1338
|
+
}, activeComplexField);
|
|
1339
|
+
break;
|
|
1340
|
+
}
|
|
1189
1341
|
const bookmarkNode = {
|
|
1190
1342
|
type: "bookmark_start",
|
|
1191
1343
|
bookmarkId: bkId,
|
|
@@ -1207,6 +1359,15 @@ function parseBodyChild(
|
|
|
1207
1359
|
case "bookmarkEnd": {
|
|
1208
1360
|
const bkEndId = child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
1209
1361
|
if (bkEndId) {
|
|
1362
|
+
// Phase 2 — strip the matching bookmarkEnd if its bookmarkStart
|
|
1363
|
+
// was previously stripped (id-keyed pairing across the parse).
|
|
1364
|
+
if (shouldStripBookmarkEndId(bkEndId)) {
|
|
1365
|
+
noteStrippedCosmeticMarker("bookmarkEnd");
|
|
1366
|
+
flushActiveComplexField(children, () => {
|
|
1367
|
+
activeComplexField = null;
|
|
1368
|
+
}, activeComplexField);
|
|
1369
|
+
break;
|
|
1370
|
+
}
|
|
1210
1371
|
const bookmarkNode = {
|
|
1211
1372
|
type: "bookmark_end",
|
|
1212
1373
|
bookmarkId: bkEndId,
|
|
@@ -2755,14 +2916,53 @@ function parseRevisionContainer(
|
|
|
2755
2916
|
];
|
|
2756
2917
|
case "commentRangeStart":
|
|
2757
2918
|
case "commentRangeEnd":
|
|
2758
|
-
case "bookmarkStart":
|
|
2759
|
-
case "bookmarkEnd":
|
|
2760
2919
|
return [
|
|
2761
2920
|
{
|
|
2762
2921
|
type: "opaque_inline",
|
|
2763
2922
|
rawXml: sourceXml.slice(node.start, node.end),
|
|
2764
2923
|
},
|
|
2765
2924
|
];
|
|
2925
|
+
case "bookmarkStart": {
|
|
2926
|
+
// Site 3 (Slice B) — nested-context bookmarkStart. Pre-Slice-B
|
|
2927
|
+
// ALL nested-context bookmarks fell through to opaque_inline
|
|
2928
|
+
// (the 216-opaque source on CCEP). When the strip is on AND the
|
|
2929
|
+
// name isn't load-bearing, drop the start + record the id so
|
|
2930
|
+
// the matching end (Site 4) drops too.
|
|
2931
|
+
const nestedBkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
2932
|
+
const nestedBkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
|
|
2933
|
+
if (
|
|
2934
|
+
shouldStripCosmeticMarker() &&
|
|
2935
|
+
nestedBkId &&
|
|
2936
|
+
nestedBkName &&
|
|
2937
|
+
!shouldRetainBookmark(nestedBkName)
|
|
2938
|
+
) {
|
|
2939
|
+
noteStrippedCosmeticMarker("bookmarkStart");
|
|
2940
|
+
noteStrippedBookmarkId(nestedBkId);
|
|
2941
|
+
break;
|
|
2942
|
+
}
|
|
2943
|
+
return [
|
|
2944
|
+
{
|
|
2945
|
+
type: "opaque_inline",
|
|
2946
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
2947
|
+
},
|
|
2948
|
+
];
|
|
2949
|
+
}
|
|
2950
|
+
case "bookmarkEnd": {
|
|
2951
|
+
// Site 4 (Slice B) — nested-context bookmarkEnd. Strip iff its
|
|
2952
|
+
// matching start (any site) was previously stripped.
|
|
2953
|
+
const nestedBkEndId =
|
|
2954
|
+
child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
2955
|
+
if (shouldStripBookmarkEndId(nestedBkEndId)) {
|
|
2956
|
+
noteStrippedCosmeticMarker("bookmarkEnd");
|
|
2957
|
+
break;
|
|
2958
|
+
}
|
|
2959
|
+
return [
|
|
2960
|
+
{
|
|
2961
|
+
type: "opaque_inline",
|
|
2962
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
2963
|
+
},
|
|
2964
|
+
];
|
|
2965
|
+
}
|
|
2766
2966
|
case "permStart":
|
|
2767
2967
|
result.push(parsePermStartNode(child, sourceXml));
|
|
2768
2968
|
break;
|
|
@@ -2817,6 +3017,18 @@ function parseHyperlink(
|
|
|
2817
3017
|
|
|
2818
3018
|
const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
|
|
2819
3019
|
|
|
3020
|
+
// Slice A — local field-bracket state machine. CCEP TOC hyperlinks
|
|
3021
|
+
// wrap their entries with a `PAGEREF` field whose runs carry
|
|
3022
|
+
// `<w:fldChar w:fldCharType="begin|separate|end"/>` markers and a
|
|
3023
|
+
// `<w:instrText>` instruction. Pre-strip behavior bailed the whole
|
|
3024
|
+
// hyperlink to opaque_inline because `parseRunContentOnly` returned
|
|
3025
|
+
// `supported: false` on those markers (~48 CCEP opaques). The local
|
|
3026
|
+
// bracketMode is independent of the body-walker's `activeComplexField`:
|
|
3027
|
+
// hyperlink children are a self-contained sub-walk that doesn't
|
|
3028
|
+
// escape the hyperlink scope.
|
|
3029
|
+
type FieldBracketMode = "outside" | "instruction" | "result";
|
|
3030
|
+
let bracketMode: FieldBracketMode = "outside";
|
|
3031
|
+
|
|
2820
3032
|
for (const child of node.children) {
|
|
2821
3033
|
if (child.type !== "element") {
|
|
2822
3034
|
continue;
|
|
@@ -2829,9 +3041,29 @@ function parseHyperlink(
|
|
|
2829
3041
|
};
|
|
2830
3042
|
}
|
|
2831
3043
|
|
|
3044
|
+
// Pre-scan for fldChar bracket transitions. Per OOXML grammar, a
|
|
3045
|
+
// run carrying a fldChar carries no other displayable content;
|
|
3046
|
+
// skip the whole run after updating bracketMode.
|
|
3047
|
+
const fldChar = child.children.find(
|
|
3048
|
+
(c): c is XmlElementNode =>
|
|
3049
|
+
c.type === "element" && localName(c.name) === "fldChar",
|
|
3050
|
+
);
|
|
3051
|
+
if (fldChar) {
|
|
3052
|
+
const fldType =
|
|
3053
|
+
fldChar.attributes["w:fldCharType"] ?? fldChar.attributes.fldCharType;
|
|
3054
|
+
if (fldType === "begin") bracketMode = "instruction";
|
|
3055
|
+
else if (fldType === "separate") bracketMode = "result";
|
|
3056
|
+
else if (fldType === "end") bracketMode = "outside";
|
|
3057
|
+
continue;
|
|
3058
|
+
}
|
|
3059
|
+
|
|
2832
3060
|
const run = parseRunContentOnly(child, sourceXml, {
|
|
2833
3061
|
allowDeletedText: options.allowDeletedText,
|
|
2834
3062
|
preserveUnsupportedReviewMarkup: options.preserveUnsupportedReviewMarkup,
|
|
3063
|
+
// Tolerate `<w:instrText>` siblings inside hyperlink runs — the
|
|
3064
|
+
// bracket-state machine above takes care of dropping them via
|
|
3065
|
+
// `bracketMode === "instruction"` below.
|
|
3066
|
+
allowFieldMarkers: true,
|
|
2835
3067
|
});
|
|
2836
3068
|
if (!run.supported) {
|
|
2837
3069
|
return {
|
|
@@ -2840,6 +3072,8 @@ function parseHyperlink(
|
|
|
2840
3072
|
};
|
|
2841
3073
|
}
|
|
2842
3074
|
|
|
3075
|
+
// Drop nodes during the field-instruction segment; keep result + outside.
|
|
3076
|
+
if (bracketMode === "instruction") continue;
|
|
2843
3077
|
children.push(...run.nodes);
|
|
2844
3078
|
}
|
|
2845
3079
|
|
|
@@ -2857,6 +3091,16 @@ function parseRunContentOnly(
|
|
|
2857
3091
|
options: {
|
|
2858
3092
|
allowDeletedText?: boolean;
|
|
2859
3093
|
preserveUnsupportedReviewMarkup?: boolean;
|
|
3094
|
+
/**
|
|
3095
|
+
* Slice A — gracefully skip `<w:fldChar>` / `<w:instrText>` children
|
|
3096
|
+
* instead of bailing to `supported: false`. The hyperlink path
|
|
3097
|
+
* (`parseHyperlink`) opts in so TOC `PAGEREF` field markers inside
|
|
3098
|
+
* hyperlink runs no longer trip exit-B. The body-walker callers do
|
|
3099
|
+
* NOT opt in — they have their own `activeComplexField` state machine
|
|
3100
|
+
* that handles these markers semantically and bailing here is
|
|
3101
|
+
* load-bearing for that machine to see the markers in `parseRun`.
|
|
3102
|
+
*/
|
|
3103
|
+
allowFieldMarkers?: boolean;
|
|
2860
3104
|
} = {},
|
|
2861
3105
|
): RunParseResult {
|
|
2862
3106
|
const marksResult = readRunMarks(node, _sourceXml);
|
|
@@ -2940,6 +3184,14 @@ function parseRunContentOnly(
|
|
|
2940
3184
|
}
|
|
2941
3185
|
break;
|
|
2942
3186
|
}
|
|
3187
|
+
case "fldChar":
|
|
3188
|
+
case "instrText":
|
|
3189
|
+
// Slice A — graceful skip when caller opts in (hyperlink path).
|
|
3190
|
+
// Otherwise fall through to default and bail (body-walker path,
|
|
3191
|
+
// which uses its own activeComplexField state machine in
|
|
3192
|
+
// parseRun to handle these markers).
|
|
3193
|
+
if (options.allowFieldMarkers) break;
|
|
3194
|
+
return { nodes: [], supported: false };
|
|
2943
3195
|
default:
|
|
2944
3196
|
return { nodes: [], supported: false };
|
|
2945
3197
|
}
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* preserved in the canonical node's rawXml field for lossless round-trip export.
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
|
-
import type { ShapeContent } from "../../model/canonical-document.ts";
|
|
13
|
+
import type { BlockNode, ShapeContent } from "../../model/canonical-document.ts";
|
|
14
14
|
import { parseFill } from "./parse-fill.ts";
|
|
15
15
|
import {
|
|
16
16
|
type XmlElementNode,
|
|
@@ -32,6 +32,15 @@ export interface ParsedWpsShape {
|
|
|
32
32
|
text?: string;
|
|
33
33
|
/** Raw txbxContent XML for structured re-rendering. */
|
|
34
34
|
txbxContentXml?: string;
|
|
35
|
+
/**
|
|
36
|
+
* Parsed block-level structure from `w:txbxContent`, populated when a
|
|
37
|
+
* `blockParser` callback is supplied (coord-02 §14 / coord-11 §22 —
|
|
38
|
+
* headers/footers need access to shape-inside text like the CCEP
|
|
39
|
+
* "Copyright CCEP STRICTLY CONFIDENTIAL" red band, which lives in
|
|
40
|
+
* shape-textbox paragraphs). Same shape + semantics as
|
|
41
|
+
* `ShapeContent.txbxBlocks` on the drawing-frame path.
|
|
42
|
+
*/
|
|
43
|
+
txbxBlocks?: ReadonlyArray<BlockNode>;
|
|
35
44
|
/** DrawML geometry preset, e.g. "rect", "roundRect". */
|
|
36
45
|
geometry?: string;
|
|
37
46
|
/** Original drawing XML for lossless round-trip export. */
|
|
@@ -65,7 +74,10 @@ export type ParsedShape = ParsedWpsShape | ParsedWordArt | ParsedVmlShape;
|
|
|
65
74
|
*
|
|
66
75
|
* Returns null if the drawing does not contain a WPS shape.
|
|
67
76
|
*/
|
|
68
|
-
export function parseShapeXml(
|
|
77
|
+
export function parseShapeXml(
|
|
78
|
+
drawingXml: string,
|
|
79
|
+
blockParser?: TxbxBlockParser,
|
|
80
|
+
): ParsedWpsShape | ParsedWordArt | null {
|
|
69
81
|
const root = parseXml(drawingXml);
|
|
70
82
|
const graphicData = findFirstDescendant(root, "graphicData");
|
|
71
83
|
if (!graphicData) return null;
|
|
@@ -104,11 +116,37 @@ export function parseShapeXml(drawingXml: string): ParsedWpsShape | ParsedWordAr
|
|
|
104
116
|
// Extract raw txbxContent XML for structured re-rendering of text boxes
|
|
105
117
|
const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
|
|
106
118
|
|
|
119
|
+
// Coord-02 §14 / coord-11 §22 follow-up (2026-04-24): when a
|
|
120
|
+
// blockParser is supplied, recurse into the txbxContent to produce a
|
|
121
|
+
// structured block representation. Without this, shape-textbox
|
|
122
|
+
// content (CCEP "Copyright CCEP STRICTLY CONFIDENTIAL" footer band)
|
|
123
|
+
// is reachable only via the `.text` summary string — L03 cascade +
|
|
124
|
+
// L11 render can't walk runs/marks.
|
|
125
|
+
let txbxBlocks: ReadonlyArray<BlockNode> | undefined;
|
|
126
|
+
if (txbxContentXml && blockParser) {
|
|
127
|
+
try {
|
|
128
|
+
// The `blockParser` callback is supplied by parse-main-document.ts
|
|
129
|
+
// as a thin wrapper over `parseBlockStreamFromXml`. That function
|
|
130
|
+
// returns `ParsedBlockNode[]` — structurally identical to canonical
|
|
131
|
+
// `BlockNode[]` at runtime for shape-textbox content (verified on
|
|
132
|
+
// CCEP SOW footer fixture 2026-04-24: paragraph + text + TextMark
|
|
133
|
+
// shapes land end-to-end with zero `ParsedBlockNode`-only fields
|
|
134
|
+
// surfaced). The cast is safe here because the runtime output IS
|
|
135
|
+
// canonical; a structural `as unknown as BlockNode[]` preserves
|
|
136
|
+
// type safety at every consumer site (L03 cascade, L11 render,
|
|
137
|
+
// validator walk).
|
|
138
|
+
txbxBlocks = blockParser(txbxContentXml) as unknown as ReadonlyArray<BlockNode>;
|
|
139
|
+
} catch {
|
|
140
|
+
txbxBlocks = undefined;
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
107
144
|
return {
|
|
108
145
|
type: "shape",
|
|
109
146
|
...(isTextBox ? { isTextBox: true } : {}),
|
|
110
147
|
...(text ? { text } : {}),
|
|
111
148
|
...(txbxContentXml ? { txbxContentXml } : {}),
|
|
149
|
+
...(txbxBlocks && txbxBlocks.length > 0 ? { txbxBlocks } : {}),
|
|
112
150
|
...(prst ? { geometry: prst } : {}),
|
|
113
151
|
rawXml: drawingXml,
|
|
114
152
|
};
|
|
@@ -186,6 +224,17 @@ function extractAllText(node: XmlElementNode): string {
|
|
|
186
224
|
// txbxContentXml, optional recursive txbxBlocks).
|
|
187
225
|
// ───────────────────────────────────────────────────────────────────────────
|
|
188
226
|
|
|
227
|
+
/**
|
|
228
|
+
* Callback signature for the txbx-content block parser supplied by
|
|
229
|
+
* parse-main-document.ts / parse-headers-footers.ts. The actual
|
|
230
|
+
* implementation wraps `parseBlockStreamFromXml` which returns
|
|
231
|
+
* `ParsedBlockNode[]`; its runtime output is canonical `BlockNode[]`
|
|
232
|
+
* for shape-textbox content (no `ParsedBlockNode`-only fields surface
|
|
233
|
+
* at the shape boundary — verified on CCEP SOW footer fixture
|
|
234
|
+
* 2026-04-24). The structural `unknown` return keeps the parse layer
|
|
235
|
+
* layer-pure; `parseShapeContent` + `parseShapeXml` cast to canonical
|
|
236
|
+
* `BlockNode[]` at the assembly seam.
|
|
237
|
+
*/
|
|
189
238
|
export type TxbxBlockParser = (xml: string) => ReadonlyArray<{ type: string; [key: string]: unknown }>;
|
|
190
239
|
|
|
191
240
|
export function parseShapeContent(
|
|
@@ -212,10 +261,15 @@ export function parseShapeContent(
|
|
|
212
261
|
const txbxContent = txbx ? findFirstDescendant(txbx, "txbxContent") : undefined;
|
|
213
262
|
const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
|
|
214
263
|
|
|
215
|
-
let txbxBlocks: ReadonlyArray<
|
|
264
|
+
let txbxBlocks: ReadonlyArray<BlockNode> | undefined;
|
|
216
265
|
if (txbxContentXml && blockParser) {
|
|
217
266
|
try {
|
|
218
|
-
|
|
267
|
+
// See `TxbxBlockParser` doc above: runtime output is canonical
|
|
268
|
+
// `BlockNode[]` for shape-textbox content (verified on CCEP SOW
|
|
269
|
+
// footer fixture 2026-04-24). Cast at the assembly seam so
|
|
270
|
+
// downstream consumers (L03, L11, validator) get canonical types
|
|
271
|
+
// without local `as unknown` ceremony.
|
|
272
|
+
txbxBlocks = blockParser(txbxContentXml) as unknown as ReadonlyArray<BlockNode>;
|
|
219
273
|
} catch {
|
|
220
274
|
// Preserve-only fallback: keep txbxContentXml for serialization; leave
|
|
221
275
|
// txbxBlocks undefined so consumers know recursion did not succeed.
|
|
@@ -55,8 +55,21 @@ import { classifyFieldInstruction } from "./parse-fields.ts";
|
|
|
55
55
|
|
|
56
56
|
/**
|
|
57
57
|
* Field families safe enough to leave a `<w:tbl>` in structured
|
|
58
|
-
* canonical form.
|
|
59
|
-
*
|
|
58
|
+
* canonical form.
|
|
59
|
+
*
|
|
60
|
+
* The principle: the body-direct paragraph parser accepts every
|
|
61
|
+
* field family and emits a typed `FieldInlineNode` — classified
|
|
62
|
+
* families get a refresh slot, preserve-only families round-trip
|
|
63
|
+
* via the shared `FieldInlineNode` shape. Cells inside a `<w:tbl>`
|
|
64
|
+
* run through the same parser + serializer. Flattening the whole
|
|
65
|
+
* table to `opaque_block` because one cell carries a field is
|
|
66
|
+
* over-conservative: preserve-only fields round-trip identically
|
|
67
|
+
* whether they sit in a body paragraph or a table cell.
|
|
68
|
+
*
|
|
69
|
+
* This set is consulted AFTER `isWellFormedFieldInstruction` — for
|
|
70
|
+
* the rare case where a field instruction doesn't start with a
|
|
71
|
+
* recognizable OOXML family identifier, we still accept it iff the
|
|
72
|
+
* classifier happened to recognize it.
|
|
60
73
|
*/
|
|
61
74
|
export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
|
|
62
75
|
"REF",
|
|
@@ -65,6 +78,18 @@ export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
|
|
|
65
78
|
"TOC",
|
|
66
79
|
"PAGE",
|
|
67
80
|
"NUMPAGES",
|
|
81
|
+
"STYLEREF",
|
|
82
|
+
"SECTIONPAGES",
|
|
83
|
+
"DATE",
|
|
84
|
+
"TIME",
|
|
85
|
+
"AUTHOR",
|
|
86
|
+
"FILENAME",
|
|
87
|
+
"MERGEFIELD",
|
|
88
|
+
"IF",
|
|
89
|
+
"SEQ",
|
|
90
|
+
"INDEX",
|
|
91
|
+
"TC",
|
|
92
|
+
"FORMULA",
|
|
68
93
|
]);
|
|
69
94
|
|
|
70
95
|
/**
|
|
@@ -127,23 +152,63 @@ export function extractComplexFieldInstructionsFromRaw(rawXml: string): string[]
|
|
|
127
152
|
* `FORMDROPDOWN`. These are fully supported by the body-direct
|
|
128
153
|
* paragraph parser via `parseFFDataFromFldChar` but classify as
|
|
129
154
|
* `UNKNOWN` under `FIELD_FAMILY_PATTERN` (which targets data-field
|
|
130
|
-
* families like REF / TOC / MERGEFIELD).
|
|
131
|
-
*
|
|
132
|
-
*
|
|
155
|
+
* families like REF / TOC / MERGEFIELD). Kept as a named helper for
|
|
156
|
+
* readability; `isWellFormedFieldInstruction` would also accept them
|
|
157
|
+
* via the generic identifier pattern, but the named check documents
|
|
158
|
+
* the carve-out's origin (coord-01 §11, 2026-04-24).
|
|
133
159
|
*/
|
|
134
160
|
export function isLegacyFormFieldInstruction(instruction: string): boolean {
|
|
135
161
|
return /^\s*(FORMTEXT|FORMCHECKBOX|FORMDROPDOWN)\b/i.test(instruction);
|
|
136
162
|
}
|
|
137
163
|
|
|
164
|
+
/**
|
|
165
|
+
* Matches any well-formed OOXML field instruction. OOXML field
|
|
166
|
+
* instructions (ECMA-376 §17.16) begin with an ALL-CAPS family name
|
|
167
|
+
* — `REF`, `PAGE`, `TOC`, `MERGEFIELD`, `DOCPROPERTY`,
|
|
168
|
+
* `LISTNUM`, and so on through the full §17.16 catalog (60+
|
|
169
|
+
* families). Pattern-matching the family-name prefix lets us
|
|
170
|
+
* recognize every standard field shape WITHOUT adding each one to
|
|
171
|
+
* the L02 `PreserveOnlyFieldFamily` union (which would require a
|
|
172
|
+
* cross-lane slice) OR to `FIELD_FAMILY_PATTERN` (which expands
|
|
173
|
+
* classification-time behavior).
|
|
174
|
+
*
|
|
175
|
+
* The narrow `SAFE_TABLE_FIELD_FAMILIES` above is consulted as a
|
|
176
|
+
* fallback for the rare case of a field instruction that doesn't
|
|
177
|
+
* start with a family-name prefix but DOES classify to a known
|
|
178
|
+
* family (e.g. leading whitespace quirks we haven't seen in the
|
|
179
|
+
* wild).
|
|
180
|
+
*
|
|
181
|
+
* Rejection remains for:
|
|
182
|
+
* - Instructions that don't start with a family-name shape — these
|
|
183
|
+
* could be garbled / truncated / mid-field text; flattening the
|
|
184
|
+
* table is the safe preservation path.
|
|
185
|
+
* - Revision markup anywhere in the table (tracked changes —
|
|
186
|
+
* unaffected by this check; handled by `RISKY_TABLE_MARKUP_RE`
|
|
187
|
+
* below).
|
|
188
|
+
*/
|
|
189
|
+
const WELL_FORMED_FIELD_INSTRUCTION_RE = /^\s*[A-Z][A-Z0-9_]*\b/;
|
|
190
|
+
|
|
191
|
+
export function isWellFormedFieldInstruction(instruction: string): boolean {
|
|
192
|
+
return WELL_FORMED_FIELD_INSTRUCTION_RE.test(instruction);
|
|
193
|
+
}
|
|
194
|
+
|
|
138
195
|
/**
|
|
139
196
|
* Decides whether a single field instruction (either `w:instr`
|
|
140
197
|
* attribute value or concatenated `instrText` run) is safe for
|
|
141
198
|
* structured-table parsing. Used by the shared predicate below;
|
|
142
199
|
* exposed for direct callers (the debug diagnostics script runs
|
|
143
200
|
* this to classify source instructions alongside the canonical).
|
|
201
|
+
*
|
|
202
|
+
* Any one check passing is enough: the well-formed prefix check covers every standard
|
|
203
|
+
* OOXML field family in one pass; the classifier fallback catches
|
|
204
|
+
* edge cases where `FIELD_FAMILY_PATTERN` happens to match but the
|
|
205
|
+
* prefix shape doesn't (unlikely, but the fallback keeps behavior
|
|
206
|
+
* consistent with classification for any instruction the classifier
|
|
207
|
+
* recognizes).
|
|
144
208
|
*/
|
|
145
209
|
export function isSafeTableFieldInstruction(instruction: string): boolean {
|
|
146
210
|
if (isLegacyFormFieldInstruction(instruction)) return true;
|
|
211
|
+
if (isWellFormedFieldInstruction(instruction)) return true;
|
|
147
212
|
const family = classifyFieldInstruction(instruction).family;
|
|
148
213
|
return SAFE_TABLE_FIELD_FAMILIES.has(family);
|
|
149
214
|
}
|
|
@@ -1786,12 +1786,29 @@ export interface SmartArtPreviewNode {
|
|
|
1786
1786
|
/**
|
|
1787
1787
|
* Read-only rendering of a wps:wsp WordprocessingShape. Text content is
|
|
1788
1788
|
* extracted for display. The original drawing XML is preserved in rawXml.
|
|
1789
|
+
*
|
|
1790
|
+
* When the shape is a text-box (`isTextBox: true`), the raw textbox XML
|
|
1791
|
+
* is preserved in `txbxContentXml` for lossless round-trip, and the
|
|
1792
|
+
* parsed block structure lands in `txbxBlocks` — canonical `BlockNode[]`
|
|
1793
|
+
* with styles already resolved (coord-02 §14 / coord-11 §22 closed L01
|
|
1794
|
+
* side 2026-04-24 in `7d87f1189`; L02 type-promoted 2026-04-24 once the
|
|
1795
|
+
* runtime contract was confirmed canonical).
|
|
1789
1796
|
*/
|
|
1790
1797
|
export interface ShapeNode {
|
|
1791
1798
|
type: "shape";
|
|
1792
1799
|
text?: string;
|
|
1793
1800
|
geometry?: string;
|
|
1794
1801
|
isTextBox?: boolean;
|
|
1802
|
+
/** Raw `<w:txbxContent>` XML, preserved for serialization + round-trip. */
|
|
1803
|
+
txbxContentXml?: string;
|
|
1804
|
+
/**
|
|
1805
|
+
* Parsed canonical block-level structure from `<w:txbxContent>`,
|
|
1806
|
+
* populated when the parse path supplies a `blockParser` callback
|
|
1807
|
+
* (headers/footers via `src/io/ooxml/parse-headers-footers.ts`;
|
|
1808
|
+
* body via `src/io/ooxml/parse-main-document.ts`). Shape + semantics
|
|
1809
|
+
* identical to `ShapeContent.txbxBlocks` on the drawing-frame path.
|
|
1810
|
+
*/
|
|
1811
|
+
txbxBlocks?: ReadonlyArray<BlockNode>;
|
|
1795
1812
|
rawXml: string;
|
|
1796
1813
|
}
|
|
1797
1814
|
|
|
@@ -1971,14 +1988,16 @@ export interface ShapeContent {
|
|
|
1971
1988
|
* Parsed block-level structure from `w:txbxContent`, populated when a
|
|
1972
1989
|
* `blockParser` callback is supplied during parse (CO4 F3.3).
|
|
1973
1990
|
*
|
|
1974
|
-
*
|
|
1975
|
-
*
|
|
1976
|
-
*
|
|
1977
|
-
*
|
|
1978
|
-
*
|
|
1979
|
-
*
|
|
1991
|
+
* Canonical `BlockNode[]` — the parse path produces fully-normalized
|
|
1992
|
+
* blocks (styles resolved, marks attached, no `ParsedBlockNode`-only
|
|
1993
|
+
* fields at runtime). Verified on the CCEP SOW footer fixture 2026-04-24:
|
|
1994
|
+
* paragraph + text + `TextMark` shapes land end-to-end. Type promoted
|
|
1995
|
+
* 2026-04-24 from the earlier weakly-typed escape hatch once the L01
|
|
1996
|
+
* shape-textbox parse (commit `7d87f1189`) confirmed the runtime
|
|
1997
|
+
* contract — unblocks L03 cascade + L11 render walking `txbxBlocks`
|
|
1998
|
+
* without `as unknown as BlockNode[]` casts at the consumer site.
|
|
1980
1999
|
*/
|
|
1981
|
-
txbxBlocks?: ReadonlyArray<
|
|
2000
|
+
txbxBlocks?: ReadonlyArray<BlockNode>;
|
|
1982
2001
|
rawXml: string;
|
|
1983
2002
|
}
|
|
1984
2003
|
|
|
@@ -2860,11 +2879,29 @@ function validateDocumentNode(
|
|
|
2860
2879
|
return;
|
|
2861
2880
|
case "chart_preview":
|
|
2862
2881
|
case "smartart_preview":
|
|
2863
|
-
case "shape":
|
|
2864
2882
|
case "wordart":
|
|
2865
2883
|
case "vml_shape":
|
|
2866
2884
|
expectString(record.rawXml, `${path}.rawXml`, issues);
|
|
2867
2885
|
return;
|
|
2886
|
+
case "shape":
|
|
2887
|
+
expectString(record.rawXml, `${path}.rawXml`, issues);
|
|
2888
|
+
if (record.txbxBlocks !== undefined) {
|
|
2889
|
+
if (!Array.isArray(record.txbxBlocks)) {
|
|
2890
|
+
issues.push({
|
|
2891
|
+
path: `${path}.txbxBlocks`,
|
|
2892
|
+
message: "shape.txbxBlocks must be an array when present.",
|
|
2893
|
+
});
|
|
2894
|
+
} else {
|
|
2895
|
+
// coord-02 §14 follow-up (2026-04-24): `ShapeNode.txbxBlocks`
|
|
2896
|
+
// is canonical `BlockNode[]`. Walk it with the same validator
|
|
2897
|
+
// used for top-level document content so run marks / paragraph
|
|
2898
|
+
// structure / nested shapes are all held to the normal rules.
|
|
2899
|
+
record.txbxBlocks.forEach((child, index) => {
|
|
2900
|
+
validateDocumentNode(child, `${path}.txbxBlocks[${index}]`, issues);
|
|
2901
|
+
});
|
|
2902
|
+
}
|
|
2903
|
+
}
|
|
2904
|
+
return;
|
|
2868
2905
|
case "drawing_frame": {
|
|
2869
2906
|
const anchor = asPlainObject(record.anchor, `${path}.anchor`, issues);
|
|
2870
2907
|
const content = asPlainObject(record.content, `${path}.content`, issues);
|