@beyondwork/docx-react-component 1.0.56 → 1.0.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +1 -1
  2. package/package.json +1 -1
  3. package/src/api/public-types.ts +330 -0
  4. package/src/compare/diff-engine.ts +3 -0
  5. package/src/core/commands/formatting-commands.ts +1 -0
  6. package/src/core/commands/index.ts +17 -11
  7. package/src/core/selection/mapping.ts +18 -1
  8. package/src/core/selection/review-anchors.ts +29 -18
  9. package/src/io/chart-preview-resolver.ts +175 -41
  10. package/src/io/docx-session.ts +57 -2
  11. package/src/io/export/serialize-main-document.ts +82 -0
  12. package/src/io/export/serialize-styles.ts +61 -3
  13. package/src/io/export/table-properties-xml.ts +19 -4
  14. package/src/io/normalize/normalize-text.ts +33 -0
  15. package/src/io/ooxml/parse-anchor.ts +182 -0
  16. package/src/io/ooxml/parse-drawing.ts +319 -0
  17. package/src/io/ooxml/parse-fields.ts +115 -2
  18. package/src/io/ooxml/parse-fill.ts +215 -0
  19. package/src/io/ooxml/parse-font-table.ts +190 -0
  20. package/src/io/ooxml/parse-footnotes.ts +52 -1
  21. package/src/io/ooxml/parse-main-document.ts +241 -1
  22. package/src/io/ooxml/parse-numbering.ts +96 -0
  23. package/src/io/ooxml/parse-picture.ts +158 -0
  24. package/src/io/ooxml/parse-settings.ts +34 -0
  25. package/src/io/ooxml/parse-shapes.ts +87 -0
  26. package/src/io/ooxml/parse-solid-fill.ts +11 -0
  27. package/src/io/ooxml/parse-styles.ts +74 -1
  28. package/src/io/ooxml/parse-theme.ts +60 -0
  29. package/src/io/paste/html-clipboard.ts +449 -0
  30. package/src/io/paste/word-clipboard.ts +5 -1
  31. package/src/legal/_document-root.ts +26 -0
  32. package/src/legal/bookmarks.ts +4 -3
  33. package/src/legal/cross-references.ts +3 -2
  34. package/src/legal/defined-terms.ts +2 -1
  35. package/src/legal/signature-blocks.ts +2 -1
  36. package/src/model/canonical-document.ts +421 -3
  37. package/src/runtime/chart/chart-model-store.ts +73 -10
  38. package/src/runtime/document-runtime.ts +760 -41
  39. package/src/runtime/document-search.ts +61 -0
  40. package/src/runtime/edit-ops/index.ts +129 -0
  41. package/src/runtime/event-refresh-hints.ts +7 -0
  42. package/src/runtime/field-resolver.ts +341 -0
  43. package/src/runtime/footnote-resolver.ts +55 -0
  44. package/src/runtime/hyperlink-color-resolver.ts +13 -10
  45. package/src/runtime/object-grab/index.ts +51 -0
  46. package/src/runtime/paragraph-style-resolver.ts +105 -0
  47. package/src/runtime/query-scopes.ts +186 -0
  48. package/src/runtime/resolved-numbering-geometry.ts +12 -0
  49. package/src/runtime/scope-resolver.ts +60 -0
  50. package/src/runtime/selection/cursor-ops.ts +186 -15
  51. package/src/runtime/selection/index.ts +17 -1
  52. package/src/runtime/structure-ops/index.ts +77 -0
  53. package/src/runtime/styles-cascade.ts +33 -0
  54. package/src/runtime/surface-projection.ts +192 -12
  55. package/src/runtime/theme-color-resolver.ts +189 -44
  56. package/src/runtime/units.ts +46 -0
  57. package/src/runtime/view-state.ts +13 -2
  58. package/src/ui/WordReviewEditor.tsx +239 -11
  59. package/src/ui/editor-runtime-boundary.ts +97 -1
  60. package/src/ui/editor-shell-view.tsx +1 -1
  61. package/src/ui/runtime-shortcut-dispatch.ts +17 -3
  62. package/src/ui-tailwind/chart/ChartSurface.tsx +36 -10
  63. package/src/ui-tailwind/chart/layout/plot-area.ts +120 -45
  64. package/src/ui-tailwind/chart/render/area.tsx +22 -4
  65. package/src/ui-tailwind/chart/render/bar-column.tsx +37 -11
  66. package/src/ui-tailwind/chart/render/bubble.tsx +6 -2
  67. package/src/ui-tailwind/chart/render/combo.tsx +37 -4
  68. package/src/ui-tailwind/chart/render/line.tsx +28 -5
  69. package/src/ui-tailwind/chart/render/pie.tsx +36 -16
  70. package/src/ui-tailwind/chart/render/progressive-render.ts +8 -1
  71. package/src/ui-tailwind/chart/render/scatter.tsx +9 -4
  72. package/src/ui-tailwind/chrome/avatar-initials.ts +15 -0
  73. package/src/ui-tailwind/chrome/tw-comment-preview.tsx +3 -1
  74. package/src/ui-tailwind/chrome/tw-context-menu.tsx +14 -0
  75. package/src/ui-tailwind/chrome/tw-selection-tool-host.tsx +3 -2
  76. package/src/ui-tailwind/chrome/tw-selection-toolbar.tsx +30 -11
  77. package/src/ui-tailwind/chrome/tw-shortcut-hint.tsx +15 -2
  78. package/src/ui-tailwind/chrome/tw-suggestion-card.tsx +1 -1
  79. package/src/ui-tailwind/chrome/tw-table-context-toolbar.tsx +24 -7
  80. package/src/ui-tailwind/chrome/tw-table-grip-layer.tsx +31 -12
  81. package/src/ui-tailwind/chrome-overlay/page-border-resolver.ts +211 -0
  82. package/src/ui-tailwind/chrome-overlay/tw-chrome-overlay.tsx +24 -0
  83. package/src/ui-tailwind/chrome-overlay/tw-comment-balloon-layer.tsx +74 -0
  84. package/src/ui-tailwind/chrome-overlay/tw-locked-block-layer.tsx +65 -0
  85. package/src/ui-tailwind/chrome-overlay/tw-object-selection-overlay.tsx +157 -0
  86. package/src/ui-tailwind/chrome-overlay/tw-page-border-overlay.tsx +233 -0
  87. package/src/ui-tailwind/chrome-overlay/tw-page-stack-overlay-layer.tsx +135 -13
  88. package/src/ui-tailwind/chrome-overlay/tw-revision-margin-bar-layer.tsx +51 -0
  89. package/src/ui-tailwind/chrome-overlay/tw-scope-card-layer.tsx +12 -4
  90. package/src/ui-tailwind/chrome-overlay/tw-scope-card.tsx +32 -12
  91. package/src/ui-tailwind/chrome-overlay/tw-toc-outline-sidebar.tsx +133 -0
  92. package/src/ui-tailwind/editor-surface/chart-node-view.tsx +49 -10
  93. package/src/ui-tailwind/editor-surface/float-wrap-resolver.ts +119 -0
  94. package/src/ui-tailwind/editor-surface/pm-command-bridge.ts +236 -9
  95. package/src/ui-tailwind/editor-surface/pm-schema.ts +214 -11
  96. package/src/ui-tailwind/editor-surface/pm-state-from-snapshot.ts +32 -2
  97. package/src/ui-tailwind/editor-surface/shape-renderer.ts +206 -0
  98. package/src/ui-tailwind/editor-surface/surface-layer.ts +66 -0
  99. package/src/ui-tailwind/editor-surface/tw-inline-token.tsx +29 -0
  100. package/src/ui-tailwind/editor-surface/tw-segment-view.tsx +7 -1
  101. package/src/ui-tailwind/page-stack/tw-page-stack-chrome-layer.tsx +22 -6
  102. package/src/ui-tailwind/review/tw-comment-sidebar.tsx +10 -16
  103. package/src/ui-tailwind/review/tw-health-panel.tsx +0 -25
  104. package/src/ui-tailwind/review/tw-rail-card.tsx +38 -17
  105. package/src/ui-tailwind/review/tw-review-rail.tsx +2 -2
  106. package/src/ui-tailwind/review/tw-revision-sidebar.tsx +5 -12
  107. package/src/ui-tailwind/review/tw-workflow-tab.tsx +2 -2
  108. package/src/ui-tailwind/theme/editor-theme.css +1 -0
  109. package/src/ui-tailwind/theme/tokens.css +6 -0
  110. package/src/ui-tailwind/theme/tokens.ts +10 -0
  111. package/src/ui-tailwind/tw-review-workspace.tsx +23 -0
  112. package/src/validation/compatibility-engine.ts +2 -0
  113. package/src/validation/docx-comment-proof.ts +12 -3
@@ -0,0 +1,449 @@
1
+ /**
2
+ * I2 Tier B Slice 3 — HTML paste parser.
3
+ *
4
+ * Purpose: when the browser clipboard holds `text/html` but no WordML MIME,
5
+ * route it through this parser to produce a `CanonicalDocumentFragment`.
6
+ * Timeboxed to the two fixtures that cover legal-review use cases —
7
+ * Google Docs and Word web. Outlook / Gmail / raw browser selection follow
8
+ * when user fixtures surface.
9
+ *
10
+ * Implementation: a small handwritten tokenizer + recursive parser. Keeping
11
+ * the dependency profile minimal (no jsdom / parse5 at runtime) — the HTML
12
+ * subset we accept is narrow enough that a 200-line tokenizer reads cleaner
13
+ * than pulling in a 50 KB DOM library.
14
+ *
15
+ * What this parser accepts:
16
+ * - Block structure: `<p>`, `<div>`, `<h1>`–`<h6>`, `<ul>`/`<ol>` + `<li>`
17
+ * - Inline structure: `<span>`, `<b>`/`<strong>`, `<i>`/`<em>`, `<u>`,
18
+ * `<s>`/`<del>`, `<a href>`, `<br>`
19
+ * - Meta: `<meta>`, `<html>`, `<head>`, `<body>` — passed through
20
+ *
21
+ * What this parser drops (sanitizer):
22
+ * - `<script>` / `<style>` / `<iframe>` / `<form>` / `<input>` / `<object>` / `<embed>` tags and their CONTENT
23
+ * - Event handler attributes (`on*`)
24
+ * - `data-*` attributes
25
+ * - `javascript:` and `data:` hrefs (rewritten to `#`)
26
+ */
27
+
28
+ import type { CanonicalDocumentFragment } from "../../api/public-types.ts";
29
+ import type {
30
+ BlockNode,
31
+ HyperlinkNode,
32
+ InlineNode,
33
+ ParagraphNode,
34
+ TextMark,
35
+ TextNode,
36
+ } from "../../model/canonical-document.ts";
37
+
38
/**
 * Outcome of an HTML paste parse: the parsed fragment on success, or a
 * human-readable reason on failure.
 */
export type ParseCanonicalFragmentFromHtmlResult =
  | { ok: true; fragment: CanonicalDocumentFragment }
  | { ok: false; reason: string };

/**
 * Convert a clipboard `text/html` payload into a canonical fragment.
 *
 * The payload is sanitized, tokenized and then folded into block nodes.
 * Nothing is thrown to the caller: any error raised by those stages is
 * reported through the `{ ok: false }` branch of the result.
 *
 * @param html Raw HTML string read from the clipboard.
 * @returns `{ ok: true, fragment }` (an empty fragment for an empty string),
 *   or `{ ok: false, reason }` when the input is not a string or parsing threw.
 */
export function parseCanonicalFragmentFromHtml(
  html: string,
): ParseCanonicalFragmentFromHtmlResult {
  // Runtime guard: callers outside TypeScript may hand us a non-string.
  if (typeof html !== "string") {
    return { ok: false, reason: "html must be a string" };
  }
  if (html === "") {
    return { ok: true, fragment: { blocks: [] } };
  }

  let blocks: BlockNode[];
  try {
    blocks = parseBlocks(tokenize(stripUnsafeRegions(html)));
  } catch (error) {
    const reason = error instanceof Error ? error.message : "unknown HTML parse error";
    return { ok: false, reason };
  }
  return { ok: true, fragment: { blocks } };
}
63
+
64
+ // ─── Sanitizer ────────────────────────────────────────────────────────────
65
+
66
/**
 * An unsafe element together with everything up to its end tag. The end tag
 * may carry trailing whitespace or junk before `>` (`</script >`), which HTML
 * parsers still treat as a valid end tag.
 */
const UNSAFE_REGIONS =
  /<(script|style|iframe|form|input|object|embed)\b[^>]*>[\s\S]*?<\/\1\b[^>]*>/gi;

/**
 * A raw-text element that is never closed. HTML parsers treat the remainder
 * of the input as that element's content, so everything to the end is dropped.
 */
const UNSAFE_UNTERMINATED_RAW_TEXT = /<(?:script|style|iframe)\b[^>]*>[\s\S]*$/i;

/** Any remaining opening (or self-closing) tag of an unsafe element. */
const UNSAFE_SELF_CLOSING = /<(script|style|iframe|form|input|object|embed)\b[^>]*\/?>/gi;

/**
 * Remove unsafe elements and their content from an HTML string before it is
 * tokenized.
 *
 * @param html Raw clipboard HTML.
 * @returns The HTML with unsafe regions and stray unsafe opening tags removed.
 */
function stripUnsafeRegions(html: string): string {
  // Repeat until stable: removing one region can splice the text on either
  // side of it into a new region (`<scr<script></script>ipt>…</script>`).
  // Each pass strictly shortens the string, so the loop terminates.
  let current = html;
  let previous: string;
  do {
    previous = current;
    current = current.replace(UNSAFE_REGIONS, "");
  } while (current !== previous);

  return current
    .replace(UNSAFE_UNTERMINATED_RAW_TEXT, "")
    .replace(UNSAFE_SELF_CLOSING, "");
}
72
+
73
+ // ─── Tokenizer ────────────────────────────────────────────────────────────
74
+
75
/** Lexical unit produced by `tokenize`. Tag names are always lower-case. */
type Token =
  | { type: "open"; tag: string; attrs: Record<string, string> }
  | { type: "close"; tag: string }
  | { type: "void"; tag: string; attrs: Record<string, string> }
  | { type: "text"; value: string };

/**
 * HTML void elements: they never have an end tag, so they must not be emitted
 * as "open" tokens (an "open" token makes the inline parser scan for a close
 * tag that will never arrive).
 */
const VOID_TAGS = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "source",
  "track",
  "wbr",
]);

/**
 * Split sanitized HTML into a flat token list.
 *
 * Comments (`<!-- … -->`, including unterminated ones), doctypes and
 * conditional-comment markers (`<![if …]>`, `<![endif]>`) produce no tokens.
 * Text runs are entity-decoded. A `<` that starts none of the recognised
 * constructs is skipped.
 *
 * @param html HTML string, normally the output of the sanitizer.
 * @returns Tokens in document order.
 */
function tokenize(html: string): Token[] {
  const tokens: Token[] = [];
  // Alternatives, in order: comment | markup declaration | tag | text run.
  // Only the tag and text alternatives populate capture groups.
  const re =
    /<!--(?:-?>|[\s\S]*?(?:-->|$))|<![^>]*>|<(\/?)([a-zA-Z][a-zA-Z0-9-]*)([^>]*)>|([^<]+)/g;
  let match: RegExpExecArray | null;
  while ((match = re.exec(html)) !== null) {
    const [, slash, rawTag, attrString, textRun] = match;
    if (textRun !== undefined) {
      const decoded = decodeEntities(textRun);
      if (decoded.length > 0) tokens.push({ type: "text", value: decoded });
      continue;
    }
    // Comment, doctype or conditional-comment marker: nothing to emit.
    if (rawTag === undefined) continue;

    const tag = rawTag.toLowerCase();
    if (slash === "/") {
      tokens.push({ type: "close", tag });
      continue;
    }
    const rawAttrs = attrString ?? "";
    const attrs = parseAttrs(rawAttrs);
    const isSelfClosing = rawAttrs.trim().endsWith("/");
    if (VOID_TAGS.has(tag) || isSelfClosing) {
      tokens.push({ type: "void", tag, attrs });
    } else {
      tokens.push({ type: "open", tag, attrs });
    }
  }
  return tokens;
}

/**
 * Parse the attribute portion of a tag into a name → value map.
 *
 * Names are lower-cased. Values are trimmed and entity-decoded, so
 * `href="?a=1&amp;b=2"` yields `?a=1&b=2`. Event-handler attributes (`on*`)
 * and `data-*` attributes are dropped. A valueless attribute maps to `""`.
 *
 * @param attrString Everything between the tag name and the closing `>`.
 */
function parseAttrs(attrString: string): Record<string, string> {
  const attrs: Record<string, string> = {};
  const re = /([a-zA-Z_:][a-zA-Z0-9_:.-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
  let match: RegExpExecArray | null;
  while ((match = re.exec(attrString)) !== null) {
    const [, name, dq, sq, bare] = match;
    if (name === undefined) continue;
    const lowerName = name.toLowerCase();
    // Sanitizer: drop event handlers + data-* attributes.
    if (lowerName.startsWith("on")) continue;
    if (lowerName.startsWith("data-")) continue;
    attrs[lowerName] = decodeEntities((dq ?? sq ?? bare ?? "").trim());
  }
  return attrs;
}

/** Named character references understood by `decodeEntities`. */
const ENTITY_MAP: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: "\u00a0",
};

/**
 * Decode the named references in `ENTITY_MAP` plus decimal / hexadecimal
 * numeric references. Unknown names and unparsable numbers are left as
 * written. Numeric references that do not denote a usable character (NUL,
 * surrogates, values above U+10FFFF) decode to U+FFFD instead of throwing.
 *
 * @param text Text that may contain `&…;` references.
 */
function decodeEntities(text: string): string {
  return text.replace(/&(#?[a-zA-Z0-9]+);/g, (whole: string, ent: string) => {
    if (ent.startsWith("#")) {
      const isHex = ent.startsWith("#x") || ent.startsWith("#X");
      const code = isHex ? parseInt(ent.slice(2), 16) : parseInt(ent.slice(1), 10);
      if (Number.isNaN(code)) return whole;
      const isSurrogate = code >= 0xd800 && code <= 0xdfff;
      if (code === 0 || code > 0x10ffff || isSurrogate) return "\ufffd";
      return String.fromCodePoint(code);
    }
    const key = ent.toLowerCase();
    // Own-property check: a bare index would resolve `constructor` through
    // Object.prototype and splice a function's source text into the output.
    return Object.prototype.hasOwnProperty.call(ENTITY_MAP, key) ? ENTITY_MAP[key]! : whole;
  });
}
146
+
147
+ // ─── Block / inline parser ────────────────────────────────────────────────
148
+
149
/** Formatting state inherited by inline content while descending nested tags. */
interface ParseContext {
  /** Marks applied to every text node produced in this context. */
  marks: TextMark[];
  /** When set, text nodes are wrapped in a hyperlink pointing here. */
  hyperlinkHref?: string;
}

/** Heading levels recognised as `<h1>` … `<h6>`. */
const HEADING_LEVELS = [1, 2, 3, 4, 5, 6] as const;

/** Tags that open a new paragraph in the block walker. */
const BLOCK_TAGS = new Set([
  "p",
  "div",
  ...HEADING_LEVELS.map((level) => `h${level}`),
  "li",
  "tr",
]);

/** Tags the block walker steps into without starting a paragraph. */
const CONTAINER_TAGS = new Set([
  "html",
  "body",
  "head",
  "ul",
  "ol",
  "b",
  "strong",
  "i",
  "em",
  "u",
  "s",
  "del",
  "span",
  "a",
]);

/** Semantic inline tags and the text mark each one applies. */
const MARK_TAGS: Record<string, TextMark["type"]> = {
  b: "bold",
  strong: "bold",
  i: "italic",
  em: "italic",
  u: "underline",
  s: "strikethrough",
  del: "strikethrough",
};

/** Paragraph style id assigned to each heading tag (`h1` → `Heading1`, …). */
const HEADING_STYLES: Record<string, string> = Object.fromEntries(
  HEADING_LEVELS.map((level) => [`h${level}`, `Heading${level}`]),
);
187
+
188
/**
 * Report whether an inline `style` string sets `font-weight` to a non-bold
 * value (`normal`, or a number below 600 — the same threshold the span style
 * parser uses for bold). The last `font-weight` declaration wins.
 */
function declaresNonBoldFontWeight(style: string): boolean {
  let nonBold = false;
  for (const declaration of style.split(";")) {
    const colonIdx = declaration.indexOf(":");
    if (colonIdx < 0) continue;
    if (declaration.slice(0, colonIdx).trim().toLowerCase() !== "font-weight") continue;
    const value = declaration.slice(colonIdx + 1).trim().toLowerCase();
    const numeric = Number.parseInt(value, 10);
    nonBold = value.startsWith("normal") || (Number.isFinite(numeric) && numeric < 600);
  }
  return nonBold;
}

/**
 * Fold a token stream into paragraph blocks.
 *
 * - A `BLOCK_TAGS` opener starts a paragraph whose inline content is
 *   everything up to the matching close tag. Heading tags also set `styleId`.
 *   Block tags nested inside that span are walked by `parseInlineUntil` as
 *   transparent wrappers, so their text joins the outer paragraph.
 * - Text and `<br>` found outside any block tag accumulate as "loose" inline
 *   content and are flushed into a paragraph when the next block starts or
 *   the input ends.
 * - Mark tags (`<b>`, `<i>`, …) met at block level push a mark context that
 *   applies to loose text and seeds the paragraphs they enclose.
 *
 * @param tokens Output of `tokenize`.
 * @returns Paragraph blocks in document order.
 */
function parseBlocks(tokens: Token[]): BlockNode[] {
  const blocks: BlockNode[] = [];
  let cursor = 0;

  // Inline nodes seen outside any block tag, waiting to become a paragraph.
  let pendingInline: InlineNode[] = [];

  // One entry per open block-level mark tag. An entry may be empty: the tag
  // is still tracked so its close tag pops the right frame, but it
  // contributes no mark.
  const blockMarkStack: TextMark[][] = [];
  const currentBlockMarks = (): TextMark[] => blockMarkStack.flat();

  const flushPendingInline = (): void => {
    if (pendingInline.length > 0) {
      blocks.push({
        type: "paragraph",
        children: pendingInline,
      });
      pendingInline = [];
    }
  };

  while (cursor < tokens.length) {
    const token = tokens[cursor]!;

    if (token.type === "text") {
      const marks = currentBlockMarks();
      const inline: TextNode = {
        type: "text",
        text: token.value,
        // Copy each mark so nodes never share mark objects.
        ...(marks.length > 0 ? { marks: marks.map((m) => ({ ...m })) } : {}),
      };
      pendingInline.push(inline);
      cursor += 1;
      continue;
    }

    if (token.type === "void") {
      if (token.tag === "br") {
        pendingInline.push({ type: "hard_break" });
      }
      // meta / link / etc. — ignore
      cursor += 1;
      continue;
    }

    if (token.type === "close") {
      // Pop a block-level mark context if this close matches an open mark tag.
      if (MARK_TAGS[token.tag] && blockMarkStack.length > 0) {
        blockMarkStack.pop();
      }
      cursor += 1;
      continue;
    }

    // open
    const openTag = token.tag;

    if (BLOCK_TAGS.has(openTag)) {
      flushPendingInline();
      // Consume inner content up to the matching close, seeded with the
      // active block-level marks so `<b><p>hello</p></b>` yields bold text.
      const { inline, end } = parseInlineUntil(tokens, cursor + 1, openTag, {
        marks: currentBlockMarks(),
      });
      const paragraph: ParagraphNode = {
        type: "paragraph",
        children: inline,
        ...(HEADING_STYLES[openTag] ? { styleId: HEADING_STYLES[openTag] } : {}),
      };
      blocks.push(paragraph);
      cursor = end;
      continue;
    }

    const markType = MARK_TAGS[openTag];
    if (markType) {
      // Google Docs wraps its whole payload in
      // `<b style="font-weight:normal" id="docs-internal-guid-…">`. That
      // wrapper is a container, not emphasis: honouring it would turn every
      // pasted paragraph bold. A bold tag whose own style resets the weight
      // therefore pushes an empty context.
      const cancelled =
        markType === "bold" && declaresNonBoldFontWeight(token.attrs.style ?? "");
      blockMarkStack.push(cancelled ? [] : [{ type: markType } as TextMark]);
      cursor += 1;
      continue;
    }

    // `ul` / `ol` (their `li` children are BLOCK_TAGS), other CONTAINER_TAGS
    // and unknown tags are all transparent at block level: step past the
    // opener and keep walking.
    cursor += 1;
  }

  flushPendingInline();
  return blocks;
}
302
+
303
/**
 * Collect inline nodes from `tokens[start]` up to the close tag matching
 * `untilTag`, recursing into nested open tags.
 *
 * Nested tags adjust the context handed to their children:
 * - mark tags (`<b>`, `<i>`, …) append their mark;
 * - `<a>` sets the hyperlink target (after `sanitizeHref`);
 * - `<span>` appends any marks encoded in its inline `style` (Google Docs
 *   expresses bold / italic / underline / strikethrough this way);
 * - every other tag is a transparent wrapper.
 *
 * Close tags that do not match `untilTag` are skipped.
 *
 * @param tokens Full token list.
 * @param start Index of the first token to consume.
 * @param untilTag Tag name whose close token ends this run.
 * @param context Marks / hyperlink inherited from enclosing tags.
 * @returns The collected nodes and the index just past the consumed close
 *   tag, or `tokens.length` if the close tag never appears.
 */
function parseInlineUntil(
  tokens: Token[],
  start: number,
  untilTag: string,
  context: ParseContext,
): { inline: InlineNode[]; end: number } {
  const inline: InlineNode[] = [];
  let cursor = start;

  while (cursor < tokens.length) {
    const token = tokens[cursor]!;

    switch (token.type) {
      case "close": {
        if (token.tag === untilTag) {
          return { inline, end: cursor + 1 };
        }
        cursor += 1;
        break;
      }

      case "text": {
        const inherited = context.marks;
        const textNode: TextNode = {
          type: "text",
          text: token.value,
          // Copy each mark so nodes never share mark objects.
          ...(inherited.length > 0 ? { marks: inherited.map((m) => ({ ...m })) } : {}),
        };
        if (context.hyperlinkHref) {
          const link: HyperlinkNode = {
            type: "hyperlink",
            href: context.hyperlinkHref,
            children: [textNode],
          };
          inline.push(link);
        } else {
          inline.push(textNode);
        }
        cursor += 1;
        break;
      }

      case "void": {
        if (token.tag === "br") inline.push({ type: "hard_break" });
        cursor += 1;
        break;
      }

      case "open": {
        const childTag = token.tag;
        const markType = MARK_TAGS[childTag];

        // Work out what the children inherit, then recurse exactly once.
        let childContext: ParseContext = context;
        if (markType) {
          childContext = {
            ...context,
            marks: [...context.marks, { type: markType } as TextMark],
          };
        } else if (childTag === "a") {
          childContext = {
            ...context,
            hyperlinkHref: sanitizeHref(token.attrs.href ?? ""),
          };
        } else if (childTag === "span") {
          const styleMarks = marksFromInlineStyle(token.attrs.style ?? "");
          if (styleMarks.length > 0) {
            childContext = { ...context, marks: [...context.marks, ...styleMarks] };
          }
        }

        const nested = parseInlineUntil(tokens, cursor + 1, childTag, childContext);
        inline.push(...nested.inline);
        cursor = nested.end;
        break;
      }
    }
  }

  return { inline, end: cursor };
}
394
+
395
/**
 * Extract the text marks encoded in a CSS `style` attribute value.
 *
 * Recognised declarations:
 * - `font-weight`: numeric 600 or more, `bold` or `bolder` → bold
 * - `font-style`: `italic` or `oblique` → italic
 * - `text-decoration` / `text-decoration-line`: contains `underline` →
 *   underline; contains `line-through` → strikethrough
 *
 * As in CSS, a later declaration of the same property replaces an earlier
 * one, so `font-weight:700;font-weight:400` yields no bold mark and repeated
 * declarations never yield duplicate marks. A trailing `!important` is
 * ignored. This is deliberately not a full CSS parser.
 *
 * @param style Raw `style` attribute value; may be empty.
 * @returns Marks in the order their (last) enabling declaration appears.
 */
function marksFromInlineStyle(style: string): TextMark[] {
  if (!style) return [];

  // Insertion-ordered, keyed by mark type so each mark appears at most once.
  const active = new Map<TextMark["type"], TextMark>();
  const applyMark = (mark: TextMark, enabled: boolean): void => {
    active.delete(mark.type);
    if (enabled) active.set(mark.type, mark);
  };

  for (const declaration of style.split(";")) {
    const colonIdx = declaration.indexOf(":");
    if (colonIdx < 0) continue;
    const name = declaration.slice(0, colonIdx).trim().toLowerCase();
    const value = declaration
      .slice(colonIdx + 1)
      .trim()
      .toLowerCase()
      .replace(/\s*!important$/, "");

    switch (name) {
      case "font-weight": {
        const numeric = Number.parseInt(value, 10);
        const isBold =
          (Number.isFinite(numeric) && numeric >= 600) ||
          value === "bold" ||
          value === "bolder";
        applyMark({ type: "bold" }, isBold);
        break;
      }
      case "font-style": {
        applyMark({ type: "italic" }, value === "italic" || value === "oblique");
        break;
      }
      case "text-decoration":
      case "text-decoration-line": {
        applyMark({ type: "underline" }, value.includes("underline"));
        applyMark({ type: "strikethrough" }, value.includes("line-through"));
        break;
      }
    }
  }
  return [...active.values()];
}
442
+
443
/**
 * Neutralise link targets that could execute script or embed a payload.
 *
 * URL parsers strip leading control characters / spaces and ignore tabs and
 * newlines anywhere in the URL, so `java\tscript:…` and `\u0001javascript:…`
 * still resolve to the `javascript:` scheme. The scheme test therefore runs
 * on a copy with every character at or below U+0020 (and DEL) removed.
 *
 * @param href Raw `href` attribute value.
 * @returns `"#"` for `javascript:`, `data:` and `vbscript:` targets;
 *   otherwise `href` unchanged.
 */
function sanitizeHref(href: string): string {
  const BLOCKED_SCHEMES = ["javascript:", "data:", "vbscript:"];
  const normalized = href.replace(/[\u0000-\u0020\u007f]+/g, "").toLowerCase();
  if (BLOCKED_SCHEMES.some((scheme) => normalized.startsWith(scheme))) {
    return "#";
  }
  return href;
}
@@ -63,7 +63,11 @@ export function parseCanonicalFragmentFromWordML(xml: string): ParseCanonicalFra
63
63
  * outer element survives, it's added to the outer `<w:document>` wrapper.
64
64
  */
65
65
  function ensureDocumentShell(xml: string): string {
66
- const trimmed = xml.trim();
66
+ // v5 A5: strip UTF-8 BOM before wrapper detection. Modern browsers /
67
+ // Word clipboards don't emit BOM, but some legacy tools do; a leading
68
+ // U+FEFF breaks the regex-based wrapper check and causes us to
69
+ // double-wrap a valid `<w:document>` payload.
70
+ const trimmed = xml.replace(/^\uFEFF/, "").trim();
67
71
  const withoutDecl = trimmed.replace(/^<\?xml[^?]*\?>/, "").trim();
68
72
 
69
73
  const hasDocumentWrapper = /^<w:document[\s>]/i.test(withoutDecl);
@@ -0,0 +1,26 @@
1
+ import type {
2
+ CanonicalDocument,
3
+ DocumentNode,
4
+ DocumentRootNode,
5
+ } from "../model/canonical-document.ts";
6
+
7
/**
 * Normalise the two accepted inputs — a `CanonicalDocument` (or a `Pick` of it
 * carrying `content`) or a bare `DocumentNode` — to the node that should be
 * walked.
 *
 * The input is treated as a document wrapper only when it has a `content`
 * property and no top-level `type` property. Checking `content` alone is not
 * enough because `DrawingFrameNode` also carries a `content` property; the
 * absence of `type` is what tells the wrapper apart from a node.
 *
 * @param document Either a document wrapper or a node.
 * @returns `document.content` for a wrapper; otherwise `document` itself.
 */
export function resolveWalkableRoot(
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
): DocumentNode {
  const isDocumentWrapper = !("type" in document) && "content" in document;
  return isDocumentWrapper
    ? (document as { content: DocumentRootNode }).content
    : (document as DocumentNode);
}
@@ -7,6 +7,7 @@ import type {
7
7
  FieldRegistry,
8
8
  FieldRegistryEntry,
9
9
  } from "../model/canonical-document.ts";
10
+ import { resolveWalkableRoot } from "./_document-root.ts";
10
11
 
11
12
  export interface LegalBookmark {
12
13
  bookmarkId: string;
@@ -93,7 +94,7 @@ export function parseBookmarksFromDocumentXml(xml: string): LegalBookmark[] {
93
94
  export function collectBookmarksFromCanonicalDocument(
94
95
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
95
96
  ): LegalBookmark[] {
96
- const root = "content" in document ? document.content : document;
97
+ const root = resolveWalkableRoot(document);
97
98
  const sequence: Array<BookmarkStartNode | BookmarkEndNode> = [];
98
99
 
99
100
  walkDocument(root, (node) => {
@@ -219,7 +220,7 @@ function compareBookmarks(left: LegalBookmark, right: LegalBookmark): number {
219
220
  export function buildBookmarkNameMap(
220
221
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
221
222
  ): Map<string, { bookmarkId: string; paragraphIndex: number }> {
222
- const root = "content" in document ? document.content : document;
223
+ const root = resolveWalkableRoot(document);
223
224
  const map = new Map<string, { bookmarkId: string; paragraphIndex: number }>();
224
225
  let paragraphIndex = -1;
225
226
 
@@ -289,7 +290,7 @@ export interface BookmarkRekeyPlan {
289
290
  export function detectDuplicateBookmarkIds(
290
291
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
291
292
  ): BookmarkRekeyPlan {
292
- const root = "content" in document ? document.content : document;
293
+ const root = resolveWalkableRoot(document);
293
294
  const seenStarts = new Set<string>();
294
295
  const duplicatedIds = new Set<string>();
295
296
  let maxNumericId = 0;
@@ -12,6 +12,7 @@ import type {
12
12
  ParagraphNode,
13
13
  TocEntry,
14
14
  } from "../model/canonical-document.ts";
15
+ import { resolveWalkableRoot } from "./_document-root.ts";
15
16
 
16
17
  export interface CrossReferencePattern {
17
18
  kind: "section" | "clause" | "article" | "schedule" | "exhibit" | "appendix";
@@ -42,7 +43,7 @@ interface FieldReference {
42
43
  export function collectFieldReferencesFromCanonicalDocument(
43
44
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
44
45
  ): Array<{ family: string; target: string; instruction: string; paragraphIndex: number; displayText: string }> {
45
- const root = "content" in document ? document.content : document;
46
+ const root = resolveWalkableRoot(document);
46
47
  const results: Array<{ family: string; target: string; instruction: string; paragraphIndex: number; displayText: string }> = [];
47
48
  let paragraphIndex = -1;
48
49
 
@@ -175,7 +176,7 @@ export function parseCrossReferencesFromDocumentXml(xml: string): CrossReference
175
176
  export function collectCrossReferencesFromCanonicalDocument(
176
177
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
177
178
  ): CrossReference[] {
178
- const root = "content" in document ? document.content : document;
179
+ const root = resolveWalkableRoot(document);
179
180
  const results: CrossReference[] = [];
180
181
  let paragraphIndex = -1;
181
182
 
@@ -4,6 +4,7 @@ import type {
4
4
  DocumentNode,
5
5
  ParagraphNode,
6
6
  } from "../model/canonical-document.ts";
7
+ import { resolveWalkableRoot } from "./_document-root.ts";
7
8
 
8
9
  export interface DefinedTermOccurrence {
9
10
  paragraphIndex: number;
@@ -43,7 +44,7 @@ export function collectDefinedTermsFromDocumentXml(xml: string): DefinedTerm[] {
43
44
  export function collectDefinedTermsFromCanonicalDocument(
44
45
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
45
46
  ): DefinedTerm[] {
46
- const root = "content" in document ? document.content : document;
47
+ const root = resolveWalkableRoot(document);
47
48
  const paragraphs: string[] = [];
48
49
 
49
50
  walkDocument(root, (node) => {
@@ -4,6 +4,7 @@ import type {
4
4
  DocumentNode,
5
5
  ParagraphNode,
6
6
  } from "../model/canonical-document.ts";
7
+ import { resolveWalkableRoot } from "./_document-root.ts";
7
8
 
8
9
  export interface SignatureBlockCandidate {
9
10
  startIndex: number;
@@ -39,7 +40,7 @@ const UNDERLINE_PLACEHOLDER_PATTERN = /_{4,}|\.{4,}/;
39
40
  export function detectSignatureBlocksFromCanonicalDocument(
40
41
  document: Pick<CanonicalDocument, "content" | "preservation"> | DocumentNode,
41
42
  ): SignatureBlockReport {
42
- const root = "content" in document ? document.content : document;
43
+ const root = resolveWalkableRoot(document);
43
44
  const paragraphs: Array<{ text: string; node: ParagraphNode }> = [];
44
45
  const warnings: string[] = [];
45
46