@beyondwork/docx-react-component 1.0.76 → 1.0.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/api/v3/ai/resolve.ts +104 -4
- package/src/io/ooxml/parse-bookmark-references.ts +123 -0
- package/src/io/ooxml/parse-footnotes.ts +26 -3
- package/src/io/ooxml/parse-headers-footers.ts +96 -1
- package/src/io/ooxml/parse-main-document.ts +256 -4
- package/src/io/ooxml/parse-shapes.ts +29 -1
- package/src/io/ooxml/table-opaque-preservation.ts +70 -5
- package/src/runtime/scopes/action-validation.ts +39 -12
- package/src/runtime/scopes/index.ts +3 -0
- package/src/runtime/scopes/resolve-reference.ts +99 -43
- package/src/session/import/loader-types.ts +26 -0
- package/src/session/import/loader.ts +12 -2
- package/src/ui-tailwind/editor-surface/perf-probe.ts +3 -0
- package/src/ui-tailwind/editor-surface/pm-decorations.ts +44 -0
- package/src/ui-tailwind/editor-surface/preserve-position.ts +28 -9
|
@@ -31,6 +31,11 @@ import type {
|
|
|
31
31
|
} from "../../model/canonical-document.ts";
|
|
32
32
|
import type { OpcRelationship } from "./part-manifest.ts";
|
|
33
33
|
import { SCOPE_MARKER_BOOKMARK_PREFIX } from "./parse-scope-markers.ts";
|
|
34
|
+
import {
|
|
35
|
+
scanBookmarkReferences,
|
|
36
|
+
isRetainedBookmarkName,
|
|
37
|
+
type BookmarkReferenceScan,
|
|
38
|
+
} from "./parse-bookmark-references.ts";
|
|
34
39
|
import {
|
|
35
40
|
parseInlineMediaXml,
|
|
36
41
|
type InlineMediaPart,
|
|
@@ -253,8 +258,39 @@ export interface ParseMainDocumentOptions {
|
|
|
253
258
|
*
|
|
254
259
|
* Set to `false` to preserve the pre-strip behavior exactly — every
|
|
255
260
|
* cosmetic marker becomes an `opaque_inline` with its source XML.
|
|
261
|
+
*
|
|
262
|
+
* **Phase 2 (Slice B):** the same flag also gates orphan-bookmark
|
|
263
|
+
* stripping. When enabled, a pre-pass scans the source XML for
|
|
264
|
+
* bookmark references (`<w:hyperlink w:anchor>` /
|
|
265
|
+
* `<w:instrText>REF/PAGEREF/NOTEREF/TOC</w:instrText>`); bookmarks
|
|
266
|
+
* whose name is NOT referenced AND not in
|
|
267
|
+
* {@link retainedBookmarkNames} drop at the four emission sites.
|
|
268
|
+
* Aggregate counts surface alongside the cosmetic-marker counts on
|
|
269
|
+
* `skippedCosmeticMarkerCounts.bookmarkStart` /
|
|
270
|
+
* `skippedCosmeticMarkerCounts.bookmarkEnd`.
|
|
256
271
|
*/
|
|
257
272
|
stripCosmeticMarkers?: boolean;
|
|
273
|
+
/**
|
|
274
|
+
* Phase 2 bookmark-strip allowlist. When `stripCosmeticMarkers` is
|
|
275
|
+
* `true`, the parser's reference scan retains bookmarks whose name
|
|
276
|
+
* is referenced by a `<w:hyperlink>` / `<w:instrText>`, as well as any name
|
|
277
|
+
* listed here. Use this when the host depends on a stable host-
|
|
278
|
+
* authored bookmark name (e.g. `placeholder_party_name`,
|
|
279
|
+
* `signature_block_2`) that the automatic scan can't infer is
|
|
280
|
+
* load-bearing.
|
|
281
|
+
*
|
|
282
|
+
* Default: `[]`. Always-retained regardless of this list:
|
|
283
|
+
* - `_Toc*` (when any TOC field exists)
|
|
284
|
+
* - any name explicitly cited by a `<w:hyperlink w:anchor>` or
|
|
285
|
+
* `<w:instrText>` field instruction
|
|
286
|
+
* - `bw:scope:*` (workflow scope markers — converted to
|
|
287
|
+
* `scope_marker_*` by `rewriteScopeMarkerBookmarks` BEFORE the
|
|
288
|
+
* strip runs; listed here as defense-in-depth)
|
|
289
|
+
* - everything (defensive blanket-retain) when the document
|
|
290
|
+
* contains a `<w:dataBinding>` element whose xpath could
|
|
291
|
+
* reference bookmarks via paths we cannot statically analyze
|
|
292
|
+
*/
|
|
293
|
+
retainedBookmarkNames?: ReadonlyArray<string>;
|
|
258
294
|
}
|
|
259
295
|
|
|
260
296
|
export type ParsedBlockNode =
|
|
@@ -717,6 +753,16 @@ export function setActiveParseTelemetryBus(bus: ParseTelemetryBus | undefined):
|
|
|
717
753
|
/**
 * Per-parse state shared by the cosmetic-marker / orphan-bookmark strip.
 * One instance is built by `parseMainDocumentXml` and published through
 * `activeCosmeticStripContext` for the duration of the parse.
 */
interface CosmeticStripContext {
  /** Master switch: `true` when cosmetic markers should be stripped. */
  readonly strip: boolean;
  /** Running tally of stripped markers, keyed by marker kind. */
  readonly counts: Record<string, number>;
  /**
   * Phase 2 — result of the bookmark-reference pre-scan. `null` when the
   * strip is disabled, or when the entry point never ran the scan (for
   * example an internal call that has no XML to scan).
   */
  readonly bookmarkScan: BookmarkReferenceScan | null;
  /**
   * Phase 2 — ids of bookmarks whose `bookmarkStart` was stripped during
   * this parse. `<w:bookmarkEnd>` carries only the id (never the name),
   * so the end sites consult this set to mirror the start decision.
   * Bookmarks may span paragraphs, hence the set lives for the whole
   * parse rather than being reset per paragraph.
   */
  readonly strippedBookmarkIds: Set<string>;
}
|
|
721
767
|
let activeCosmeticStripContext: CosmeticStripContext | null = null;
|
|
722
768
|
|
|
@@ -730,6 +776,47 @@ function shouldStripCosmeticMarker(): boolean {
|
|
|
730
776
|
return activeCosmeticStripContext?.strip === true;
|
|
731
777
|
}
|
|
732
778
|
|
|
779
|
+
/**
 * Phase 2 — decides whether the bookmark called `name` must be RETAINED
 * by the orphan-bookmark strip.
 *
 * The answer is `true` (retain) whenever the strip cannot or should not
 * apply:
 *   - no strip context is active, or its `strip` flag is off;
 *   - the context carries no bookmark-reference scan (defensive retain);
 *   - `name` is the empty string (nothing could reference it by name).
 *
 * Otherwise the decision is delegated to `isRetainedBookmarkName`, which
 * evaluates the name against the reference scan collected for this parse.
 *
 * @param name The `w:name` attribute of the bookmark start element.
 * @returns `true` to keep the bookmark, `false` to strip it.
 */
function shouldRetainBookmark(name: string): boolean {
  const context = activeCosmeticStripContext;
  // The scan is only consulted while the strip is switched on.
  const scan = context?.strip ? context.bookmarkScan : null;
  if (!scan || name === "") {
    return true;
  }
  return isRetainedBookmarkName(name, scan);
}
|
|
795
|
+
|
|
796
|
+
/**
 * Phase 2 — remembers that the bookmark identified by `bookmarkId` had
 * its `bookmarkStart` stripped. The matching `bookmarkEnd` only carries
 * the id (not the name), so it looks the decision up in the same set.
 * Does nothing when no strip context is active.
 *
 * @param bookmarkId The `w:id` attribute of the stripped bookmark start.
 */
function noteStrippedBookmarkId(bookmarkId: string): void {
  activeCosmeticStripContext?.strippedBookmarkIds.add(bookmarkId);
}
|
|
805
|
+
|
|
806
|
+
/**
 * Phase 2 — reports whether a `bookmarkEnd` with this id pairs with a
 * `bookmarkStart` that was stripped earlier in the same parse.
 *
 * Returns `false` (keep the end marker) when no strip context is active,
 * when the strip is off, when the id is empty, or when the id was never
 * recorded as stripped. The last case covers an end marker whose start
 * was not seen in this parse; keeping it is the conservative choice.
 *
 * @param bookmarkId The `w:id` attribute of the bookmark end element.
 * @returns `true` only when the matching start was recorded as stripped.
 */
function shouldStripBookmarkEndId(bookmarkId: string): boolean {
  const context = activeCosmeticStripContext;
  if (context?.strip && bookmarkId !== "") {
    return context.strippedBookmarkIds.has(bookmarkId);
  }
  return false;
}
|
|
819
|
+
|
|
733
820
|
export function parseMainDocumentXml(
|
|
734
821
|
xml: string,
|
|
735
822
|
relationships: readonly OpcRelationship[] = [],
|
|
@@ -739,9 +826,14 @@ export function parseMainDocumentXml(
|
|
|
739
826
|
parseOptions: ParseMainDocumentOptions = {},
|
|
740
827
|
): ParsedMainDocument {
|
|
741
828
|
activeChartPartLookup = chartPartLookup;
|
|
829
|
+
const stripEnabled = parseOptions.stripCosmeticMarkers !== false;
|
|
742
830
|
const stripContext: CosmeticStripContext = {
|
|
743
|
-
strip:
|
|
831
|
+
strip: stripEnabled,
|
|
744
832
|
counts: Object.create(null) as Record<string, number>,
|
|
833
|
+
bookmarkScan: stripEnabled
|
|
834
|
+
? scanBookmarkReferences(xml, parseOptions.retainedBookmarkNames ?? [])
|
|
835
|
+
: null,
|
|
836
|
+
strippedBookmarkIds: new Set<string>(),
|
|
745
837
|
};
|
|
746
838
|
activeCosmeticStripContext = stripContext;
|
|
747
839
|
const bus = activeParseTelemetryBus;
|
|
@@ -848,7 +940,13 @@ function parseMainDocumentXmlInner(
|
|
|
848
940
|
|
|
849
941
|
const allBlocks = bodyElement.children
|
|
850
942
|
.filter((node): node is XmlElementNode => node.type === "element")
|
|
851
|
-
.map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath))
|
|
943
|
+
.map((node) => parseBodyChild(node, xml, relationshipMap, relationships, mediaParts, sourcePartPath))
|
|
944
|
+
// Phase 2 (Slice B) — body-level bookmarkStart/End that the strip
|
|
945
|
+
// dropped come back from `parseBodyChild` as `opaque_block` with
|
|
946
|
+
// empty `rawXml` (sentinel — see `parseBodyChild` body-level
|
|
947
|
+
// bookmark cases). Filter them out here so downstream consumers
|
|
948
|
+
// don't see empty blocks.
|
|
949
|
+
.filter((block) => !(block.type === "opaque_block" && block.rawXml === ""));
|
|
852
950
|
|
|
853
951
|
// The last body-level sectPr is the final section properties (not an intermediate section break).
|
|
854
952
|
// Extract it from the blocks list and store it separately.
|
|
@@ -1071,6 +1169,48 @@ function parseBodyChild(
|
|
|
1071
1169
|
return parseSectionBreakElement(node, sourceXml);
|
|
1072
1170
|
}
|
|
1073
1171
|
|
|
1172
|
+
// Body-level <w:bookmarkStart> / <w:bookmarkEnd> — bookmarks that
|
|
1173
|
+
// span across paragraph boundaries land directly under <w:body>
|
|
1174
|
+
// rather than inside a <w:p>. Pre-Phase-2 these flowed through the
|
|
1175
|
+
// `nodeType !== "p"` default below and became opaque_block (the
|
|
1176
|
+
// dominant 184-of-185 opaque source on EU IT Services Agreement
|
|
1177
|
+
// per `enumerate-opaque-fragments`). Phase 2 strip applies the same
|
|
1178
|
+
// RETAIN-vs-STRIP decision here as at sites 1+2 inside paragraphs.
|
|
1179
|
+
if (nodeType === "bookmarkStart") {
|
|
1180
|
+
const bkId = node.attributes["w:id"] ?? node.attributes.id ?? "";
|
|
1181
|
+
const bkName = node.attributes["w:name"] ?? node.attributes.name ?? "";
|
|
1182
|
+
if (
|
|
1183
|
+
shouldStripCosmeticMarker() &&
|
|
1184
|
+
bkId &&
|
|
1185
|
+
bkName &&
|
|
1186
|
+
!shouldRetainBookmark(bkName)
|
|
1187
|
+
) {
|
|
1188
|
+
noteStrippedCosmeticMarker("bookmarkStart");
|
|
1189
|
+
noteStrippedBookmarkId(bkId);
|
|
1190
|
+
// Returning an `opaque_block` with empty `rawXml` acts as a sentinel:
|
|
1191
|
+
// the body-walker's `.filter(...)` over the collected blocks removes
|
|
1192
|
+
// it, so downstream consumers never see it. (Returning `null`
|
|
1193
|
+
// would change the body-walker's signature; the block-shaped
|
|
1194
|
+
// no-op preserves the existing iteration contract.)
|
|
1195
|
+
return { type: "opaque_block", rawXml: "" };
|
|
1196
|
+
}
|
|
1197
|
+
return {
|
|
1198
|
+
type: "opaque_block",
|
|
1199
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
1200
|
+
};
|
|
1201
|
+
}
|
|
1202
|
+
if (nodeType === "bookmarkEnd") {
|
|
1203
|
+
const bkEndId = node.attributes["w:id"] ?? node.attributes.id ?? "";
|
|
1204
|
+
if (shouldStripBookmarkEndId(bkEndId)) {
|
|
1205
|
+
noteStrippedCosmeticMarker("bookmarkEnd");
|
|
1206
|
+
return { type: "opaque_block", rawXml: "" };
|
|
1207
|
+
}
|
|
1208
|
+
return {
|
|
1209
|
+
type: "opaque_block",
|
|
1210
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
1211
|
+
};
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1074
1214
|
if (nodeType !== "p") {
|
|
1075
1215
|
return {
|
|
1076
1216
|
type: "opaque_block",
|
|
@@ -1186,6 +1326,18 @@ function parseBodyChild(
|
|
|
1186
1326
|
const bkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
1187
1327
|
const bkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
|
|
1188
1328
|
if (bkId) {
|
|
1329
|
+
// Phase 2 (Slice B) — orphan-bookmark strip. When the strip
|
|
1330
|
+
// is on AND the name is not load-bearing per the reference
|
|
1331
|
+
// scan, drop the bookmark. The bookmarkEnd site finds the
|
|
1332
|
+
// matching id via `shouldStripBookmarkEndId`.
|
|
1333
|
+
if (bkName && !shouldRetainBookmark(bkName)) {
|
|
1334
|
+
noteStrippedCosmeticMarker("bookmarkStart");
|
|
1335
|
+
noteStrippedBookmarkId(bkId);
|
|
1336
|
+
flushActiveComplexField(children, () => {
|
|
1337
|
+
activeComplexField = null;
|
|
1338
|
+
}, activeComplexField);
|
|
1339
|
+
break;
|
|
1340
|
+
}
|
|
1189
1341
|
const bookmarkNode = {
|
|
1190
1342
|
type: "bookmark_start",
|
|
1191
1343
|
bookmarkId: bkId,
|
|
@@ -1207,6 +1359,15 @@ function parseBodyChild(
|
|
|
1207
1359
|
case "bookmarkEnd": {
|
|
1208
1360
|
const bkEndId = child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
1209
1361
|
if (bkEndId) {
|
|
1362
|
+
// Phase 2 — strip the matching bookmarkEnd if its bookmarkStart
|
|
1363
|
+
// was previously stripped (id-keyed pairing across the parse).
|
|
1364
|
+
if (shouldStripBookmarkEndId(bkEndId)) {
|
|
1365
|
+
noteStrippedCosmeticMarker("bookmarkEnd");
|
|
1366
|
+
flushActiveComplexField(children, () => {
|
|
1367
|
+
activeComplexField = null;
|
|
1368
|
+
}, activeComplexField);
|
|
1369
|
+
break;
|
|
1370
|
+
}
|
|
1210
1371
|
const bookmarkNode = {
|
|
1211
1372
|
type: "bookmark_end",
|
|
1212
1373
|
bookmarkId: bkEndId,
|
|
@@ -2755,14 +2916,53 @@ function parseRevisionContainer(
|
|
|
2755
2916
|
];
|
|
2756
2917
|
case "commentRangeStart":
|
|
2757
2918
|
case "commentRangeEnd":
|
|
2758
|
-
case "bookmarkStart":
|
|
2759
|
-
case "bookmarkEnd":
|
|
2760
2919
|
return [
|
|
2761
2920
|
{
|
|
2762
2921
|
type: "opaque_inline",
|
|
2763
2922
|
rawXml: sourceXml.slice(node.start, node.end),
|
|
2764
2923
|
},
|
|
2765
2924
|
];
|
|
2925
|
+
case "bookmarkStart": {
|
|
2926
|
+
// Site 3 (Slice B) — nested-context bookmarkStart. Pre-Slice-B
|
|
2927
|
+
// ALL nested-context bookmarks fell through to opaque_inline
|
|
2928
|
+
// (the 216-opaque source on CCEP). When the strip is on AND the
|
|
2929
|
+
// name isn't load-bearing, drop the start + record the id so
|
|
2930
|
+
// the matching end (Site 4) drops too.
|
|
2931
|
+
const nestedBkId = child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
2932
|
+
const nestedBkName = child.attributes["w:name"] ?? child.attributes.name ?? "";
|
|
2933
|
+
if (
|
|
2934
|
+
shouldStripCosmeticMarker() &&
|
|
2935
|
+
nestedBkId &&
|
|
2936
|
+
nestedBkName &&
|
|
2937
|
+
!shouldRetainBookmark(nestedBkName)
|
|
2938
|
+
) {
|
|
2939
|
+
noteStrippedCosmeticMarker("bookmarkStart");
|
|
2940
|
+
noteStrippedBookmarkId(nestedBkId);
|
|
2941
|
+
break;
|
|
2942
|
+
}
|
|
2943
|
+
return [
|
|
2944
|
+
{
|
|
2945
|
+
type: "opaque_inline",
|
|
2946
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
2947
|
+
},
|
|
2948
|
+
];
|
|
2949
|
+
}
|
|
2950
|
+
case "bookmarkEnd": {
|
|
2951
|
+
// Site 4 (Slice B) — nested-context bookmarkEnd. Strip iff its
|
|
2952
|
+
// matching start (any site) was previously stripped.
|
|
2953
|
+
const nestedBkEndId =
|
|
2954
|
+
child.attributes["w:id"] ?? child.attributes.id ?? "";
|
|
2955
|
+
if (shouldStripBookmarkEndId(nestedBkEndId)) {
|
|
2956
|
+
noteStrippedCosmeticMarker("bookmarkEnd");
|
|
2957
|
+
break;
|
|
2958
|
+
}
|
|
2959
|
+
return [
|
|
2960
|
+
{
|
|
2961
|
+
type: "opaque_inline",
|
|
2962
|
+
rawXml: sourceXml.slice(node.start, node.end),
|
|
2963
|
+
},
|
|
2964
|
+
];
|
|
2965
|
+
}
|
|
2766
2966
|
case "permStart":
|
|
2767
2967
|
result.push(parsePermStartNode(child, sourceXml));
|
|
2768
2968
|
break;
|
|
@@ -2817,6 +3017,18 @@ function parseHyperlink(
|
|
|
2817
3017
|
|
|
2818
3018
|
const children: Array<ParsedTextNode | ParsedBreakNode | ParsedColumnBreakNode | ParsedPageBreakNode | ParsedTabNode | ParsedSymbolNode> = [];
|
|
2819
3019
|
|
|
3020
|
+
// Slice A — local field-bracket state machine. CCEP TOC hyperlinks
|
|
3021
|
+
// wrap their entries with a `PAGEREF` field whose runs carry
|
|
3022
|
+
// `<w:fldChar w:fldCharType="begin|separate|end"/>` markers and a
|
|
3023
|
+
// `<w:instrText>` instruction. Pre-strip behavior bailed the whole
|
|
3024
|
+
// hyperlink to opaque_inline because `parseRunContentOnly` returned
|
|
3025
|
+
// `supported: false` on those markers (~48 CCEP opaques). The local
|
|
3026
|
+
// bracketMode is independent of the body-walker's `activeComplexField`:
|
|
3027
|
+
// hyperlink children are a self-contained sub-walk that doesn't
|
|
3028
|
+
// escape the hyperlink scope.
|
|
3029
|
+
type FieldBracketMode = "outside" | "instruction" | "result";
|
|
3030
|
+
let bracketMode: FieldBracketMode = "outside";
|
|
3031
|
+
|
|
2820
3032
|
for (const child of node.children) {
|
|
2821
3033
|
if (child.type !== "element") {
|
|
2822
3034
|
continue;
|
|
@@ -2829,9 +3041,29 @@ function parseHyperlink(
|
|
|
2829
3041
|
};
|
|
2830
3042
|
}
|
|
2831
3043
|
|
|
3044
|
+
// Pre-scan for fldChar bracket transitions. Per OOXML grammar, a
|
|
3045
|
+
// run carrying a fldChar carries no other displayable content;
|
|
3046
|
+
// skip the whole run after updating bracketMode.
|
|
3047
|
+
const fldChar = child.children.find(
|
|
3048
|
+
(c): c is XmlElementNode =>
|
|
3049
|
+
c.type === "element" && localName(c.name) === "fldChar",
|
|
3050
|
+
);
|
|
3051
|
+
if (fldChar) {
|
|
3052
|
+
const fldType =
|
|
3053
|
+
fldChar.attributes["w:fldCharType"] ?? fldChar.attributes.fldCharType;
|
|
3054
|
+
if (fldType === "begin") bracketMode = "instruction";
|
|
3055
|
+
else if (fldType === "separate") bracketMode = "result";
|
|
3056
|
+
else if (fldType === "end") bracketMode = "outside";
|
|
3057
|
+
continue;
|
|
3058
|
+
}
|
|
3059
|
+
|
|
2832
3060
|
const run = parseRunContentOnly(child, sourceXml, {
|
|
2833
3061
|
allowDeletedText: options.allowDeletedText,
|
|
2834
3062
|
preserveUnsupportedReviewMarkup: options.preserveUnsupportedReviewMarkup,
|
|
3063
|
+
// Tolerate `<w:instrText>` siblings inside hyperlink runs — the
|
|
3064
|
+
// bracket-state machine above takes care of dropping them via
|
|
3065
|
+
// `bracketMode === "instruction"` below.
|
|
3066
|
+
allowFieldMarkers: true,
|
|
2835
3067
|
});
|
|
2836
3068
|
if (!run.supported) {
|
|
2837
3069
|
return {
|
|
@@ -2840,6 +3072,8 @@ function parseHyperlink(
|
|
|
2840
3072
|
};
|
|
2841
3073
|
}
|
|
2842
3074
|
|
|
3075
|
+
// Drop nodes during the field-instruction segment; keep result + outside.
|
|
3076
|
+
if (bracketMode === "instruction") continue;
|
|
2843
3077
|
children.push(...run.nodes);
|
|
2844
3078
|
}
|
|
2845
3079
|
|
|
@@ -2857,6 +3091,16 @@ function parseRunContentOnly(
|
|
|
2857
3091
|
options: {
|
|
2858
3092
|
allowDeletedText?: boolean;
|
|
2859
3093
|
preserveUnsupportedReviewMarkup?: boolean;
|
|
3094
|
+
/**
|
|
3095
|
+
* Slice A — gracefully skip `<w:fldChar>` / `<w:instrText>` children
|
|
3096
|
+
* instead of bailing to `supported: false`. The hyperlink path
|
|
3097
|
+
* (`parseHyperlink`) opts in so TOC `PAGEREF` field markers inside
|
|
3098
|
+
* hyperlink runs no longer trip exit-B. The body-walker callers do
|
|
3099
|
+
* NOT opt in — they have their own `activeComplexField` state machine
|
|
3100
|
+
* that handles these markers semantically and bailing here is
|
|
3101
|
+
* load-bearing for that machine to see the markers in `parseRun`.
|
|
3102
|
+
*/
|
|
3103
|
+
allowFieldMarkers?: boolean;
|
|
2860
3104
|
} = {},
|
|
2861
3105
|
): RunParseResult {
|
|
2862
3106
|
const marksResult = readRunMarks(node, _sourceXml);
|
|
@@ -2940,6 +3184,14 @@ function parseRunContentOnly(
|
|
|
2940
3184
|
}
|
|
2941
3185
|
break;
|
|
2942
3186
|
}
|
|
3187
|
+
case "fldChar":
|
|
3188
|
+
case "instrText":
|
|
3189
|
+
// Slice A — graceful skip when caller opts in (hyperlink path).
|
|
3190
|
+
// Otherwise fall through to default and bail (body-walker path,
|
|
3191
|
+
// which uses its own activeComplexField state machine in
|
|
3192
|
+
// parseRun to handle these markers).
|
|
3193
|
+
if (options.allowFieldMarkers) break;
|
|
3194
|
+
return { nodes: [], supported: false };
|
|
2943
3195
|
default:
|
|
2944
3196
|
return { nodes: [], supported: false };
|
|
2945
3197
|
}
|
|
@@ -32,6 +32,15 @@ export interface ParsedWpsShape {
|
|
|
32
32
|
text?: string;
|
|
33
33
|
/** Raw txbxContent XML for structured re-rendering. */
|
|
34
34
|
txbxContentXml?: string;
|
|
35
|
+
/**
|
|
36
|
+
* Parsed block-level structure from `w:txbxContent`, populated when a
|
|
37
|
+
* `blockParser` callback is supplied (coord-02 §14 / coord-11 §22 —
|
|
38
|
+
* headers/footers need access to shape-inside text like the CCEP
|
|
39
|
+
* "Copyright CCEP STRICTLY CONFIDENTIAL" red band, which lives in
|
|
40
|
+
* shape-textbox paragraphs). Same shape + semantics as
|
|
41
|
+
* `ShapeContent.txbxBlocks` on the drawing-frame path.
|
|
42
|
+
*/
|
|
43
|
+
txbxBlocks?: ReadonlyArray<{ type: string; [key: string]: unknown }>;
|
|
35
44
|
/** DrawML geometry preset, e.g. "rect", "roundRect". */
|
|
36
45
|
geometry?: string;
|
|
37
46
|
/** Original drawing XML for lossless round-trip export. */
|
|
@@ -65,7 +74,10 @@ export type ParsedShape = ParsedWpsShape | ParsedWordArt | ParsedVmlShape;
|
|
|
65
74
|
*
|
|
66
75
|
* Returns null if the drawing does not contain a WPS shape.
|
|
67
76
|
*/
|
|
68
|
-
export function parseShapeXml(
|
|
77
|
+
export function parseShapeXml(
|
|
78
|
+
drawingXml: string,
|
|
79
|
+
blockParser?: TxbxBlockParser,
|
|
80
|
+
): ParsedWpsShape | ParsedWordArt | null {
|
|
69
81
|
const root = parseXml(drawingXml);
|
|
70
82
|
const graphicData = findFirstDescendant(root, "graphicData");
|
|
71
83
|
if (!graphicData) return null;
|
|
@@ -104,11 +116,27 @@ export function parseShapeXml(drawingXml: string): ParsedWpsShape | ParsedWordAr
|
|
|
104
116
|
// Extract raw txbxContent XML for structured re-rendering of text boxes
|
|
105
117
|
const txbxContentXml = txbxContent ? extractRawXml(txbxContent) : undefined;
|
|
106
118
|
|
|
119
|
+
// Coord-02 §14 / coord-11 §22 follow-up (2026-04-24): when a
|
|
120
|
+
// blockParser is supplied, recurse into the txbxContent to produce a
|
|
121
|
+
// structured block representation. Without this, shape-textbox
|
|
122
|
+
// content (CCEP "Copyright CCEP STRICTLY CONFIDENTIAL" footer band)
|
|
123
|
+
// is reachable only via the `.text` summary string — L03 cascade +
|
|
124
|
+
// L11 render can't walk runs/marks.
|
|
125
|
+
let txbxBlocks: ReadonlyArray<{ type: string; [key: string]: unknown }> | undefined;
|
|
126
|
+
if (txbxContentXml && blockParser) {
|
|
127
|
+
try {
|
|
128
|
+
txbxBlocks = blockParser(txbxContentXml);
|
|
129
|
+
} catch {
|
|
130
|
+
txbxBlocks = undefined;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
107
134
|
return {
|
|
108
135
|
type: "shape",
|
|
109
136
|
...(isTextBox ? { isTextBox: true } : {}),
|
|
110
137
|
...(text ? { text } : {}),
|
|
111
138
|
...(txbxContentXml ? { txbxContentXml } : {}),
|
|
139
|
+
...(txbxBlocks && txbxBlocks.length > 0 ? { txbxBlocks } : {}),
|
|
112
140
|
...(prst ? { geometry: prst } : {}),
|
|
113
141
|
rawXml: drawingXml,
|
|
114
142
|
};
|
|
@@ -55,8 +55,21 @@ import { classifyFieldInstruction } from "./parse-fields.ts";
|
|
|
55
55
|
|
|
56
56
|
/**
|
|
57
57
|
* Field families safe enough to leave a `<w:tbl>` in structured
|
|
58
|
-
* canonical form.
|
|
59
|
-
*
|
|
58
|
+
* canonical form.
|
|
59
|
+
*
|
|
60
|
+
* The principle: the body-direct paragraph parser accepts every
|
|
61
|
+
* field family and emits a typed `FieldInlineNode` — classified
|
|
62
|
+
* families get a refresh slot, preserve-only families round-trip
|
|
63
|
+
* via the shared `FieldInlineNode` shape. Cells inside a `<w:tbl>`
|
|
64
|
+
* run through the same parser + serializer. Flattening the whole
|
|
65
|
+
* table to `opaque_block` because one cell carries a field is
|
|
66
|
+
* over-conservative: preserve-only fields round-trip identically
|
|
67
|
+
* whether they sit in a body paragraph or a table cell.
|
|
68
|
+
*
|
|
69
|
+
* This set is consulted AFTER `isWellFormedFieldInstruction` — for
|
|
70
|
+
* the rare case where a field instruction doesn't start with a
|
|
71
|
+
* recognizable OOXML family identifier, we still accept it iff the
|
|
72
|
+
* classifier happened to recognize it.
|
|
60
73
|
*/
|
|
61
74
|
export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
|
|
62
75
|
"REF",
|
|
@@ -65,6 +78,18 @@ export const SAFE_TABLE_FIELD_FAMILIES: ReadonlySet<string> = new Set([
|
|
|
65
78
|
"TOC",
|
|
66
79
|
"PAGE",
|
|
67
80
|
"NUMPAGES",
|
|
81
|
+
"STYLEREF",
|
|
82
|
+
"SECTIONPAGES",
|
|
83
|
+
"DATE",
|
|
84
|
+
"TIME",
|
|
85
|
+
"AUTHOR",
|
|
86
|
+
"FILENAME",
|
|
87
|
+
"MERGEFIELD",
|
|
88
|
+
"IF",
|
|
89
|
+
"SEQ",
|
|
90
|
+
"INDEX",
|
|
91
|
+
"TC",
|
|
92
|
+
"FORMULA",
|
|
68
93
|
]);
|
|
69
94
|
|
|
70
95
|
/**
|
|
@@ -127,23 +152,63 @@ export function extractComplexFieldInstructionsFromRaw(rawXml: string): string[]
|
|
|
127
152
|
* `FORMDROPDOWN`. These are fully supported by the body-direct
|
|
128
153
|
* paragraph parser via `parseFFDataFromFldChar` but classify as
|
|
129
154
|
* `UNKNOWN` under `FIELD_FAMILY_PATTERN` (which targets data-field
|
|
130
|
-
* families like REF / TOC / MERGEFIELD).
|
|
131
|
-
*
|
|
132
|
-
*
|
|
155
|
+
* families like REF / TOC / MERGEFIELD). Kept as a named helper for
|
|
156
|
+
* readability; `isWellFormedFieldInstruction` would also accept them
|
|
157
|
+
* via the generic identifier pattern, but the named check documents
|
|
158
|
+
* the carve-out's origin (coord-01 §11, 2026-04-24).
|
|
133
159
|
*/
|
|
134
160
|
export function isLegacyFormFieldInstruction(instruction: string): boolean {
  // Case-insensitive match on the three legacy form-field families,
  // allowing leading whitespace and requiring a word boundary after the name.
  const legacyFormFieldPrefix = /^\s*(FORMTEXT|FORMCHECKBOX|FORMDROPDOWN)\b/i;
  return legacyFormFieldPrefix.exec(instruction) !== null;
}
|
|
137
163
|
|
|
164
|
+
/**
 * Shape check for OOXML field instructions (ECMA-376 §17.16).
 *
 * An instruction passes when, after optional leading whitespace, it
 * starts with an upper-case ASCII letter followed by any run of
 * upper-case letters, digits or underscores, and that run ends on a
 * word boundary — i.e. it opens with an ALL-CAPS family name such as
 * `REF`, `PAGE`, `TOC`, `MERGEFIELD`, `DOCPROPERTY` or `LISTNUM`.
 * Matching on the name's shape recognizes the standard families without
 * adding each one to the L02 `PreserveOnlyFieldFamily` union or to
 * `FIELD_FAMILY_PATTERN`.
 *
 * Instructions that do not open with such a name (garbled, truncated or
 * mid-field text, lower-case or mixed-case prefixes) are rejected here.
 * `isSafeTableFieldInstruction` then falls back to the narrower
 * `SAFE_TABLE_FIELD_FAMILIES` set for anything the classifier still
 * recognizes. Revision markup is not examined by this check; it is
 * handled separately by `RISKY_TABLE_MARKUP_RE`.
 */
const WELL_FORMED_FIELD_INSTRUCTION_RE = /^\s*[A-Z][A-Z0-9_]*\b/;

/**
 * @param instruction Field instruction text to inspect.
 * @returns `true` when the instruction opens with an ALL-CAPS family name.
 */
export function isWellFormedFieldInstruction(instruction: string): boolean {
  const familyPrefix = WELL_FORMED_FIELD_INSTRUCTION_RE.exec(instruction);
  return familyPrefix !== null;
}
|
|
194
|
+
|
|
138
195
|
/**
 * Tells whether one field instruction — a `w:instr` attribute value or
 * the concatenated text of an `instrText` run — is safe to keep inside a
 * structurally parsed table. The shared table predicate relies on it,
 * and it is exported so direct callers (such as the debug diagnostics
 * script) can classify source instructions the same way.
 *
 * The checks run in a fixed order and stop at the first success:
 *   1. legacy form fields (`isLegacyFormFieldInstruction`);
 *   2. any instruction opening with a well-formed family-name prefix
 *      (`isWellFormedFieldInstruction`), which covers the standard
 *      OOXML families in a single pass;
 *   3. fallback — the family reported by `classifyFieldInstruction` is
 *      a member of `SAFE_TABLE_FIELD_FAMILIES`. This keeps the verdict
 *      consistent with classification for instructions the classifier
 *      recognizes even though the prefix shape did not match.
 *
 * @param instruction Field instruction text to evaluate.
 * @returns `true` when the instruction is considered table-safe.
 */
export function isSafeTableFieldInstruction(instruction: string): boolean {
  return (
    isLegacyFormFieldInstruction(instruction) ||
    isWellFormedFieldInstruction(instruction) ||
    SAFE_TABLE_FIELD_FAMILIES.has(classifyFieldInstruction(instruction).family)
  );
}
|
|
@@ -214,18 +214,45 @@ function collectGuardVerdict(
|
|
|
214
214
|
// Coord-06 §13e — promote the bare `guard:blocked` blocker to a typed
|
|
215
215
|
// `guard:block-<reason>` suffix so agents can route intelligently on
|
|
216
216
|
// boundary-paragraph / system-paragraph / read-only / protected-range
|
|
217
|
-
// situations. The specific sub-reason is the first
|
|
218
|
-
//
|
|
219
|
-
//
|
|
220
|
-
//
|
|
221
|
-
//
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
217
|
+
// situations. The specific sub-reason is the first NON-selection-
|
|
218
|
+
// scope-membership reason.
|
|
219
|
+
//
|
|
220
|
+
// Scope-targeted-write carve-out (coord-09, TemplateViewer repro
|
|
221
|
+
// 2026-04-24): `applyReplacementScope`, `attachExplanation`, and
|
|
222
|
+
// `createIssue` target a scopeId, not the current editor selection.
|
|
223
|
+
// The scope's own `workflow.effectiveMode` already drove the
|
|
224
|
+
// scope-level arm of `collectGuardVerdict` above (lines 159–197).
|
|
225
|
+
// The selection-scoped coordinator guard, in contrast, evaluates
|
|
226
|
+
// against the live `state.selection` — which, for scope-targeted
|
|
227
|
+
// writes, may sit anywhere in the document. Reasons that depend on
|
|
228
|
+
// selection-scope membership (`outside_workflow_scope`,
|
|
229
|
+
// `workflow_view_only`, `workflow_comment_only`) are therefore
|
|
230
|
+
// double-counting and must not block. Globally-scoped reasons
|
|
231
|
+
// (`document_read_only`, `document_viewing_mode`) still apply — a
|
|
232
|
+
// read-only doc rejects every write, scope-targeted or not.
|
|
233
|
+
const SELECTION_SCOPE_MEMBERSHIP_CODES = new Set([
|
|
234
|
+
"outside_workflow_scope",
|
|
235
|
+
"workflow_view_only",
|
|
236
|
+
"workflow_comment_only",
|
|
237
|
+
]);
|
|
238
|
+
const rawReasons = guard.blockedReasons ?? [];
|
|
239
|
+
const nonSelectionScoped = rawReasons.filter(
|
|
240
|
+
(r) => !SELECTION_SCOPE_MEMBERSHIP_CODES.has(r.code),
|
|
241
|
+
);
|
|
242
|
+
// If every reason was selection-scope-membership for a scope-
|
|
243
|
+
// targeted write, emit no blocker — the scope-level arm above is
|
|
244
|
+
// authoritative. The defensive empty-array fallback
|
|
245
|
+
// (guard:block-unspecified) still fires when the coordinator
|
|
246
|
+
// produced effectiveMode:"blocked" without any reasons at all.
|
|
247
|
+
if (nonSelectionScoped.length > 0 || rawReasons.length === 0) {
|
|
248
|
+
const primaryCode = nonSelectionScoped[0]?.code;
|
|
249
|
+
const suffix = typeof primaryCode === "string" && primaryCode.length > 0
|
|
250
|
+
? primaryCode
|
|
251
|
+
: "unspecified";
|
|
252
|
+
const typedBlocker = `guard:block-${suffix}`;
|
|
253
|
+
if (!blockedReasons.some((existing) => existing === typedBlocker)) {
|
|
254
|
+
blockedReasons.push(typedBlocker);
|
|
255
|
+
}
|
|
229
256
|
}
|
|
230
257
|
}
|
|
231
258
|
for (const reason of guard.blockedReasons ?? []) {
|