@beyondwork/docx-react-component 1.0.56 → 1.0.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/api/public-types.ts +157 -0
- package/src/compare/diff-engine.ts +3 -0
- package/src/core/commands/formatting-commands.ts +1 -0
- package/src/core/commands/index.ts +17 -11
- package/src/core/selection/mapping.ts +18 -1
- package/src/core/selection/review-anchors.ts +29 -18
- package/src/io/chart-preview-resolver.ts +175 -41
- package/src/io/docx-session.ts +57 -2
- package/src/io/export/serialize-main-document.ts +82 -0
- package/src/io/export/serialize-styles.ts +61 -3
- package/src/io/export/table-properties-xml.ts +19 -4
- package/src/io/normalize/normalize-text.ts +33 -0
- package/src/io/ooxml/parse-anchor.ts +182 -0
- package/src/io/ooxml/parse-drawing.ts +319 -0
- package/src/io/ooxml/parse-fields.ts +115 -2
- package/src/io/ooxml/parse-fill.ts +215 -0
- package/src/io/ooxml/parse-font-table.ts +190 -0
- package/src/io/ooxml/parse-footnotes.ts +52 -1
- package/src/io/ooxml/parse-main-document.ts +241 -1
- package/src/io/ooxml/parse-numbering.ts +96 -0
- package/src/io/ooxml/parse-picture.ts +107 -0
- package/src/io/ooxml/parse-settings.ts +34 -0
- package/src/io/ooxml/parse-shapes.ts +87 -0
- package/src/io/ooxml/parse-solid-fill.ts +11 -0
- package/src/io/ooxml/parse-styles.ts +74 -1
- package/src/io/ooxml/parse-theme.ts +60 -0
- package/src/io/paste/html-clipboard.ts +449 -0
- package/src/io/paste/word-clipboard.ts +5 -1
- package/src/legal/_document-root.ts +26 -0
- package/src/legal/bookmarks.ts +4 -3
- package/src/legal/cross-references.ts +3 -2
- package/src/legal/defined-terms.ts +2 -1
- package/src/legal/signature-blocks.ts +2 -1
- package/src/model/canonical-document.ts +415 -3
- package/src/runtime/chart/chart-model-store.ts +73 -10
- package/src/runtime/document-runtime.ts +693 -41
- package/src/runtime/edit-ops/index.ts +129 -0
- package/src/runtime/event-refresh-hints.ts +7 -0
- package/src/runtime/field-resolver.ts +341 -0
- package/src/runtime/footnote-resolver.ts +55 -0
- package/src/runtime/hyperlink-color-resolver.ts +13 -10
- package/src/runtime/object-grab/index.ts +51 -0
- package/src/runtime/paragraph-style-resolver.ts +105 -0
- package/src/runtime/resolved-numbering-geometry.ts +12 -0
- package/src/runtime/selection/cursor-ops.ts +186 -15
- package/src/runtime/selection/index.ts +17 -1
- package/src/runtime/structure-ops/index.ts +77 -0
- package/src/runtime/styles-cascade.ts +33 -0
- package/src/runtime/surface-projection.ts +186 -12
- package/src/runtime/theme-color-resolver.ts +189 -44
- package/src/runtime/units.ts +46 -0
- package/src/runtime/view-state.ts +13 -2
- package/src/ui/WordReviewEditor.tsx +168 -10
- package/src/ui/editor-runtime-boundary.ts +94 -1
- package/src/ui/editor-shell-view.tsx +1 -1
- package/src/ui/runtime-shortcut-dispatch.ts +17 -3
- package/src/ui-tailwind/chart/ChartSurface.tsx +36 -10
- package/src/ui-tailwind/chart/layout/plot-area.ts +120 -45
- package/src/ui-tailwind/chart/render/area.tsx +22 -4
- package/src/ui-tailwind/chart/render/bar-column.tsx +37 -11
- package/src/ui-tailwind/chart/render/bubble.tsx +6 -2
- package/src/ui-tailwind/chart/render/combo.tsx +37 -4
- package/src/ui-tailwind/chart/render/line.tsx +28 -5
- package/src/ui-tailwind/chart/render/pie.tsx +36 -16
- package/src/ui-tailwind/chart/render/progressive-render.ts +8 -1
- package/src/ui-tailwind/chart/render/scatter.tsx +9 -4
- package/src/ui-tailwind/chrome/avatar-initials.ts +15 -0
- package/src/ui-tailwind/chrome/tw-comment-preview.tsx +3 -1
- package/src/ui-tailwind/chrome/tw-context-menu.tsx +14 -0
- package/src/ui-tailwind/chrome/tw-selection-tool-host.tsx +3 -2
- package/src/ui-tailwind/chrome/tw-selection-toolbar.tsx +30 -11
- package/src/ui-tailwind/chrome/tw-shortcut-hint.tsx +15 -2
- package/src/ui-tailwind/chrome/tw-suggestion-card.tsx +1 -1
- package/src/ui-tailwind/chrome/tw-table-context-toolbar.tsx +24 -7
- package/src/ui-tailwind/chrome/tw-table-grip-layer.tsx +31 -12
- package/src/ui-tailwind/chrome-overlay/page-border-resolver.ts +211 -0
- package/src/ui-tailwind/chrome-overlay/tw-chrome-overlay.tsx +1 -0
- package/src/ui-tailwind/chrome-overlay/tw-comment-balloon-layer.tsx +74 -0
- package/src/ui-tailwind/chrome-overlay/tw-locked-block-layer.tsx +65 -0
- package/src/ui-tailwind/chrome-overlay/tw-page-border-overlay.tsx +233 -0
- package/src/ui-tailwind/chrome-overlay/tw-page-stack-overlay-layer.tsx +135 -13
- package/src/ui-tailwind/chrome-overlay/tw-revision-margin-bar-layer.tsx +51 -0
- package/src/ui-tailwind/chrome-overlay/tw-scope-card-layer.tsx +12 -4
- package/src/ui-tailwind/chrome-overlay/tw-scope-card.tsx +32 -12
- package/src/ui-tailwind/chrome-overlay/tw-toc-outline-sidebar.tsx +133 -0
- package/src/ui-tailwind/editor-surface/chart-node-view.tsx +49 -10
- package/src/ui-tailwind/editor-surface/float-wrap-resolver.ts +119 -0
- package/src/ui-tailwind/editor-surface/pm-command-bridge.ts +236 -9
- package/src/ui-tailwind/editor-surface/pm-schema.ts +188 -11
- package/src/ui-tailwind/editor-surface/pm-state-from-snapshot.ts +28 -2
- package/src/ui-tailwind/editor-surface/shape-renderer.ts +206 -0
- package/src/ui-tailwind/editor-surface/surface-layer.ts +66 -0
- package/src/ui-tailwind/editor-surface/tw-inline-token.tsx +29 -0
- package/src/ui-tailwind/editor-surface/tw-segment-view.tsx +7 -1
- package/src/ui-tailwind/page-stack/tw-page-stack-chrome-layer.tsx +22 -6
- package/src/ui-tailwind/review/tw-comment-sidebar.tsx +10 -16
- package/src/ui-tailwind/review/tw-health-panel.tsx +0 -25
- package/src/ui-tailwind/review/tw-rail-card.tsx +38 -17
- package/src/ui-tailwind/review/tw-review-rail.tsx +2 -2
- package/src/ui-tailwind/review/tw-revision-sidebar.tsx +5 -12
- package/src/ui-tailwind/review/tw-workflow-tab.tsx +2 -2
- package/src/ui-tailwind/theme/editor-theme.css +1 -0
- package/src/ui-tailwind/theme/tokens.css +6 -0
- package/src/ui-tailwind/theme/tokens.ts +10 -0
- package/src/validation/compatibility-engine.ts +2 -0
- package/src/validation/docx-comment-proof.ts +12 -3
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* I2 Tier B Slice 3 — HTML paste parser.
|
|
3
|
+
*
|
|
4
|
+
* Purpose: when the browser clipboard holds `text/html` but no WordML MIME,
|
|
5
|
+
* route it through this parser to produce a `CanonicalDocumentFragment`.
|
|
6
|
+
* Timeboxed to the two fixtures that cover legal-review use cases —
|
|
7
|
+
* Google Docs and Word web. Outlook / Gmail / raw browser selection follow
|
|
8
|
+
* when user fixtures surface.
|
|
9
|
+
*
|
|
10
|
+
* Implementation: a small handwritten tokenizer + recursive parser. Keeping
|
|
11
|
+
* the dependency profile minimal (no jsdom / parse5 at runtime) — the HTML
|
|
12
|
+
* subset we accept is narrow enough that a 200-line tokenizer reads cleaner
|
|
13
|
+
* than pulling in a 50 KB DOM library.
|
|
14
|
+
*
|
|
15
|
+
* What this parser accepts:
|
|
16
|
+
* - Block structure: `<p>`, `<div>`, `<h1>`–`<h6>`, `<ul>`/`<ol>` + `<li>`
|
|
17
|
+
* - Inline structure: `<span>`, `<b>`/`<strong>`, `<i>`/`<em>`, `<u>`,
|
|
18
|
+
* `<s>`/`<del>`, `<a href>`, `<br>`
|
|
19
|
+
* - Meta: `<meta>`, `<html>`, `<head>`, `<body>` — passed through
|
|
20
|
+
*
|
|
21
|
+
* What this parser drops (sanitizer):
|
|
22
|
+
* - `<script>` / `<style>` / `<iframe>` / `<form>` / `<input>` / `<object>` / `<embed>` tags and their CONTENT
|
|
23
|
+
* - Event handler attributes (`on*`)
|
|
24
|
+
* - `data-*` attributes
|
|
25
|
+
* - `javascript:` and `data:` hrefs (rewritten to `#`)
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import type { CanonicalDocumentFragment } from "../../api/public-types.ts";
|
|
29
|
+
import type {
|
|
30
|
+
BlockNode,
|
|
31
|
+
HyperlinkNode,
|
|
32
|
+
InlineNode,
|
|
33
|
+
ParagraphNode,
|
|
34
|
+
TextMark,
|
|
35
|
+
TextNode,
|
|
36
|
+
} from "../../model/canonical-document.ts";
|
|
37
|
+
|
|
38
|
+
/**
 * Outcome of an HTML paste parse: either the parsed fragment or a
 * human-readable failure reason.
 */
export type ParseCanonicalFragmentFromHtmlResult =
  | { ok: true; fragment: CanonicalDocumentFragment }
  | { ok: false; reason: string };

/**
 * Convert clipboard `text/html` into a canonical document fragment.
 *
 * The input is sanitized, tokenized and then parsed into blocks. Any error
 * thrown by one of those stages is reported through the `ok: false` variant
 * instead of propagating to the caller.
 *
 * @param html Raw HTML taken from the clipboard.
 * @returns `ok: true` with the parsed blocks (an empty list for an empty
 *   string), or `ok: false` with the failure reason.
 */
export function parseCanonicalFragmentFromHtml(
  html: string,
): ParseCanonicalFragmentFromHtmlResult {
  // Runtime guard: callers outside the type system may hand us anything.
  if (typeof html !== "string") {
    return { ok: false, reason: "html must be a string" };
  }
  if (html === "") {
    return { ok: true, fragment: { blocks: [] } };
  }

  let blocks: BlockNode[];
  try {
    blocks = parseBlocks(tokenize(stripUnsafeRegions(html)));
  } catch (failure) {
    const reason =
      failure instanceof Error ? failure.message : "unknown HTML parse error";
    return { ok: false, reason };
  }
  return { ok: true, fragment: { blocks } };
}
|
|
63
|
+
|
|
64
|
+
// ─── Sanitizer ────────────────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
/**
 * Paired unsafe elements: the opening tag, everything up to the matching end
 * tag, and the end tag itself. The end tag may carry trailing whitespace or
 * junk (`</script >`), so it is matched as `</name ...>`.
 */
const UNSAFE_REGIONS = /<(script|style|iframe|form|input|object|embed)\b[^>]*>[\s\S]*?<\/\1\b[^>]*>/gi;
/** Lone opening (or self-closed) unsafe tags that have no matching end tag. */
const UNSAFE_SELF_CLOSING = /<(script|style|iframe|form|input|object|embed)\b[^>]*\/?>/gi;

/**
 * Remove unsafe elements (and the content of paired ones) from raw HTML.
 *
 * Stripping is repeated until the string stops changing. A single pass is not
 * enough: removing an inner match can splice the surrounding text into a new
 * unsafe tag (`<scr<script></script>ipt>…</script>`), whose content would
 * otherwise survive as plain text. Paired regions are always retried before
 * lone tags are dropped, so a reassembled pair is removed together with its
 * content rather than having only its opening tag stripped.
 *
 * @param html Raw HTML.
 * @returns The HTML with every unsafe region and lone unsafe tag removed.
 */
function stripUnsafeRegions(html: string): string {
  let current = html;
  // Every change strictly shortens the string, so the loop terminates.
  for (;;) {
    const withoutRegions = current.replace(UNSAFE_REGIONS, "");
    if (withoutRegions !== current) {
      current = withoutRegions;
      continue;
    }
    const withoutLoneTags = current.replace(UNSAFE_SELF_CLOSING, "");
    if (withoutLoneTags === current) {
      return current;
    }
    current = withoutLoneTags;
  }
}
|
|
72
|
+
|
|
73
|
+
// ─── Tokenizer ────────────────────────────────────────────────────────────
|
|
74
|
+
|
|
75
|
+
/** Lexical unit produced by `tokenize`. */
type Token =
  | { type: "open"; tag: string; attrs: Record<string, string> }
  | { type: "close"; tag: string }
  | { type: "void"; tag: string; attrs: Record<string, string> }
  | { type: "text"; value: string };

/** Tags that never have an end tag; they are always emitted as `void` tokens. */
const VOID_TAGS = new Set(["br", "hr", "img", "meta", "link", "source", "track", "area"]);

/**
 * Split sanitized HTML into open / close / void / text tokens.
 *
 * Comments (`<!-- … -->`, including unterminated ones) and declarations
 * (`<!DOCTYPE …>`, `<![if …]>`) are consumed and discarded. Without the
 * dedicated comment alternative a marker such as `<!--StartFragment-->`
 * does not match the tag pattern and its body leaks into the output as text.
 *
 * @param html Sanitized HTML.
 * @returns Tokens in document order; tag names are lower-cased and text is
 *   entity-decoded.
 */
function tokenize(html: string): Token[] {
  const tokens: Token[] = [];
  // Created per call: a shared /g regex would carry `lastIndex` across calls.
  const re = /<!--[\s\S]*?(?:-->|$)|<![^>]*>|<(\/?)([a-zA-Z][a-zA-Z0-9-]*)([^>]*)>|([^<]+)/g;
  let match: RegExpExecArray | null;
  while ((match = re.exec(html)) !== null) {
    const [, slash, rawTag, attrString, textRun] = match;
    if (textRun !== undefined) {
      const decoded = decodeEntities(textRun);
      if (decoded.length > 0) tokens.push({ type: "text", value: decoded });
      continue;
    }
    if (rawTag === undefined) continue; // comment or declaration
    const tag = rawTag.toLowerCase();
    if (slash === "/") {
      tokens.push({ type: "close", tag });
      continue;
    }
    const attrs = parseAttrs(attrString ?? "");
    const isSelfClosing = (attrString ?? "").trim().endsWith("/");
    if (VOID_TAGS.has(tag) || isSelfClosing) {
      tokens.push({ type: "void", tag, attrs });
    } else {
      tokens.push({ type: "open", tag, attrs });
    }
  }
  return tokens;
}

/**
 * Parse the attribute portion of a tag into a name → value map.
 *
 * Names are lower-cased. Values are trimmed and then entity-decoded, so
 * `href="?a=1&amp;b=2"` yields `?a=1&b=2`. Trimming happens first so that a
 * decoded `&nbsp;` at either end is not stripped.
 *
 * @param attrString Everything between the tag name and the closing `>`.
 * @returns The surviving attributes; `on*` and `data-*` names are dropped.
 */
function parseAttrs(attrString: string): Record<string, string> {
  const attrs: Record<string, string> = {};
  const re = /([a-zA-Z_:][a-zA-Z0-9_:.-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
  let match: RegExpExecArray | null;
  while ((match = re.exec(attrString)) !== null) {
    const [, name, dq, sq, bare] = match;
    const lowerName = name!.toLowerCase();
    // Sanitizer: drop event handlers + data-* attributes.
    if (lowerName.startsWith("on")) continue;
    if (lowerName.startsWith("data-")) continue;
    attrs[lowerName] = decodeEntities((dq ?? sq ?? bare ?? "").trim());
  }
  return attrs;
}

/** Named character references understood by `decodeEntities`. */
const ENTITY_MAP: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: "\u00a0",
};

/** Largest valid Unicode code point; `String.fromCodePoint` throws above it. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode the named references in `ENTITY_MAP` plus decimal / hexadecimal
 * numeric references. Anything unrecognised is left exactly as written.
 *
 * @param text Text that may contain `&name;`, `&#NNN;` or `&#xHHH;`.
 * @returns The text with every recognised reference replaced.
 */
function decodeEntities(text: string): string {
  return text.replace(/&(#?[a-zA-Z0-9]+);/g, (whole: string, ent: string) => {
    if (ent.startsWith("#")) {
      const code = ent.startsWith("#x") || ent.startsWith("#X")
        ? parseInt(ent.slice(2), 16)
        : parseInt(ent.slice(1), 10);
      // Out-of-range values would make fromCodePoint throw a RangeError and
      // abort the whole paste; keep the literal text instead.
      const isValid = Number.isInteger(code) && code >= 0 && code <= MAX_CODE_POINT;
      return isValid ? String.fromCodePoint(code) : whole;
    }
    // Own-property check: a bare lookup would resolve `&constructor;`
    // through Object.prototype and return a function.
    const key = ent.toLowerCase();
    return Object.prototype.hasOwnProperty.call(ENTITY_MAP, key) ? ENTITY_MAP[key]! : whole;
  });
}
|
|
146
|
+
|
|
147
|
+
// ─── Block / inline parser ────────────────────────────────────────────────
|
|
148
|
+
|
|
149
|
+
/** Inherited formatting state threaded through the recursive inline parser. */
interface ParseContext {
  /** Marks applied to every text node created in this context. */
  marks: TextMark[];
  /** When set, text nodes created in this context are wrapped in a hyperlink. */
  hyperlinkHref?: string;
}

/** Tags whose content becomes one paragraph each. */
const BLOCK_TAGS = new Set(["p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr"]);

/** Tags the block walker steps into without creating a block. */
const CONTAINER_TAGS = new Set([
  "html", "body", "head", "ul", "ol",
  "b", "strong", "i", "em", "u", "s", "del", "span", "a",
]);

/** Semantic inline tags and the text mark each one applies. */
const MARK_TAGS: Record<string, TextMark["type"]> = {
  b: "bold", strong: "bold",
  i: "italic", em: "italic",
  u: "underline",
  s: "strikethrough", del: "strikethrough",
};

/** Heading tags and the paragraph style id assigned to each. */
const HEADING_STYLES: Record<string, string> = {
  h1: "Heading1", h2: "Heading2", h3: "Heading3",
  h4: "Heading4", h5: "Heading5", h6: "Heading6",
};
|
|
187
|
+
|
|
188
|
+
/**
 * Text made only of collapsible ASCII whitespace. NBSP is deliberately not
 * included: a decoded `&nbsp;` is real content.
 */
const COLLAPSIBLE_WHITESPACE_ONLY = /^[ \t\r\n\f]*$/;

/**
 * Report whether an inline `style` attribute explicitly sets `font-weight`
 * to a non-bold value (`normal`, or a number below 600 — the same threshold
 * `marksFromInlineStyle` uses for bold). The last `font-weight` declaration
 * wins, as in CSS.
 */
function styleResetsBold(style: string | undefined): boolean {
  if (!style) return false;
  let resets = false;
  for (const declaration of style.split(";")) {
    const colonIdx = declaration.indexOf(":");
    if (colonIdx < 0) continue;
    if (declaration.slice(0, colonIdx).trim().toLowerCase() !== "font-weight") continue;
    const value = declaration.slice(colonIdx + 1).trim().toLowerCase();
    const numeric = Number.parseInt(value, 10);
    resets = value.startsWith("normal") || (Number.isFinite(numeric) && numeric < 600);
  }
  return resets;
}

/**
 * Walk the token stream at block level and build the fragment's blocks.
 *
 * - A `BLOCK_TAGS` opener becomes one paragraph whose children are parsed by
 *   `parseInlineUntil`; heading tags also receive a `styleId`.
 * - Text and `<br>` found outside any block tag accumulate in a pending list
 *   that is flushed into a paragraph at the next block opener or at the end.
 *   A pending list holding only collapsible whitespace (the newlines and
 *   indentation between block tags) is discarded instead of becoming an
 *   empty-looking paragraph.
 * - Mark tags (`<b>`, `<i>`, …) at block level push onto a mark stack that is
 *   applied to loose text and seeded into nested block tags. A bold tag whose
 *   own `style` resets `font-weight` (the wrapper Google Docs puts around its
 *   clipboard HTML, `<b style="font-weight:normal">`) contributes no mark.
 * - `<ul>` / `<ol>` and other container tags are stepped into; `<a>` and
 *   `<span>` at this level are transparent, so their href / style is not
 *   applied to loose text. Unknown tags are skipped.
 *
 * @param tokens Tokens produced by `tokenize`.
 * @returns The parsed blocks in document order.
 */
function parseBlocks(tokens: Token[]): BlockNode[] {
  const blocks: BlockNode[] = [];
  let cursor = 0;

  // Inline nodes for "loose" content that is not inside a block tag.
  let pendingInline: InlineNode[] = [];
  // One entry per open block-level mark tag. Entries may be empty (a
  // neutralised `<b>`), which keeps push/pop balanced with the close tags.
  const blockMarkStack: TextMark[][] = [];
  const currentBlockMarks = (): TextMark[] => blockMarkStack.flat();

  const flushPendingInline = () => {
    const hasContent = pendingInline.some(
      (node) =>
        node.type !== "text" || !COLLAPSIBLE_WHITESPACE_ONLY.test((node as TextNode).text),
    );
    if (hasContent) {
      blocks.push({
        type: "paragraph",
        children: pendingInline,
      });
    }
    pendingInline = [];
  };

  while (cursor < tokens.length) {
    const token = tokens[cursor]!;

    if (token.type === "text") {
      const marks = currentBlockMarks();
      const inline: TextNode = {
        type: "text",
        text: token.value,
        ...(marks.length > 0 ? { marks: marks.map((m) => ({ ...m })) } : {}),
      };
      pendingInline.push(inline);
      cursor += 1;
      continue;
    }

    if (token.type === "void") {
      if (token.tag === "br") {
        pendingInline.push({ type: "hard_break" });
      }
      // meta / link / etc. — ignore
      cursor += 1;
      continue;
    }

    if (token.type === "close") {
      // Pop a block-level mark context if this close matches an open mark tag.
      if (MARK_TAGS[token.tag] && blockMarkStack.length > 0) {
        blockMarkStack.pop();
      }
      cursor += 1;
      continue;
    }

    // open
    const openTag = token.tag;

    if (BLOCK_TAGS.has(openTag)) {
      flushPendingInline();
      // Consume inner content up to the matching close. Seed with the active
      // block-level marks so `<b><p>hello</p></b>` still yields bold text.
      const { inline, end } = parseInlineUntil(tokens, cursor + 1, openTag, {
        marks: currentBlockMarks(),
      });
      const paragraph: ParagraphNode = {
        type: "paragraph",
        children: inline,
        ...(HEADING_STYLES[openTag] ? { styleId: HEADING_STYLES[openTag] } : {}),
      };
      blocks.push(paragraph);
      cursor = end;
      continue;
    }

    if (openTag === "ul" || openTag === "ol") {
      // Descend — the `li` children are BLOCK_TAGS entries.
      cursor += 1;
      continue;
    }

    if (MARK_TAGS[openTag]) {
      const markType = MARK_TAGS[openTag]!;
      // Google Docs wraps its whole payload in `<b style="font-weight:normal">`;
      // treating that as bold would embolden every pasted run.
      const neutralised = markType === "bold" && styleResetsBold(token.attrs.style);
      blockMarkStack.push(neutralised ? [] : [{ type: markType } as TextMark]);
      cursor += 1;
      continue;
    }

    if (CONTAINER_TAGS.has(openTag)) {
      // Descend into the container — keep pendingInline flowing.
      cursor += 1;
      continue;
    }

    // Unknown tag — skip.
    cursor += 1;
  }

  flushPendingInline();
  return blocks;
}
|
|
302
|
+
|
|
303
|
+
/**
 * Collect inline nodes from `tokens[start]` up to the close tag matching
 * `untilTag` (or the end of the token list when no such close tag exists).
 *
 * Every open tag recurses with a child context:
 * - a `MARK_TAGS` tag appends its mark;
 * - `<a>` sets the (sanitized) hyperlink target;
 * - `<span>` appends whatever marks `marksFromInlineStyle` derives from its
 *   inline `style` (Google Docs encodes bold / italic / underline /
 *   strikethrough this way rather than with semantic tags);
 * - any other tag is a transparent wrapper and inherits the context as is.
 *
 * Close tags that do not match `untilTag` are skipped.
 *
 * @param tokens Full token list.
 * @param start Index of the first token to consume.
 * @param untilTag Tag name whose close token ends this run.
 * @param context Marks and hyperlink target inherited from ancestors.
 * @returns The collected nodes and the index of the first unconsumed token.
 */
function parseInlineUntil(
  tokens: Token[],
  start: number,
  untilTag: string,
  context: ParseContext,
): { inline: InlineNode[]; end: number } {
  const collected: InlineNode[] = [];
  let index = start;

  while (index < tokens.length) {
    const current = tokens[index]!;

    switch (current.type) {
      case "close": {
        if (current.tag === untilTag) {
          return { inline: collected, end: index + 1 };
        }
        // Stray close tag — ignore it.
        index += 1;
        break;
      }

      case "text": {
        const textNode: TextNode = {
          type: "text",
          text: current.value,
          ...(context.marks.length > 0 ? { marks: context.marks.map((m) => ({ ...m })) } : {}),
        };
        if (context.hyperlinkHref) {
          const link: HyperlinkNode = {
            type: "hyperlink",
            href: context.hyperlinkHref,
            children: [textNode],
          };
          collected.push(link);
        } else {
          collected.push(textNode);
        }
        index += 1;
        break;
      }

      case "void": {
        if (current.tag === "br") collected.push({ type: "hard_break" });
        index += 1;
        break;
      }

      case "open": {
        const tag = current.tag;
        // Unknown tags keep the inherited context unchanged.
        let childContext: ParseContext = context;
        if (MARK_TAGS[tag]) {
          childContext = {
            ...context,
            marks: [...context.marks, { type: MARK_TAGS[tag]! } as TextMark],
          };
        } else if (tag === "a") {
          childContext = { ...context, hyperlinkHref: sanitizeHref(current.attrs.href ?? "") };
        } else if (tag === "span") {
          const styleMarks = marksFromInlineStyle(current.attrs.style ?? "");
          if (styleMarks.length > 0) {
            childContext = { ...context, marks: [...context.marks, ...styleMarks] };
          }
        }
        const nested = parseInlineUntil(tokens, index + 1, tag, childContext);
        collected.push(...nested.inline);
        index = nested.end;
        break;
      }
    }
  }

  return { inline: collected, end: index };
}
|
|
394
|
+
|
|
395
|
+
/**
 * Derive canonical text marks from an inline CSS `style` string.
 *
 * Only a handful of declarations are inspected, so no full CSS parser is
 * needed:
 * - `font-weight`: a number of 600 or more, or `bold` / `bolder` → bold
 * - `font-style`: `italic` / `oblique` → italic
 * - `text-decoration` / `text-decoration-line`: containing `underline` →
 *   underline, containing `line-through` → strikethrough
 *
 * Property names and values are compared case-insensitively.
 *
 * @param style Contents of a `style` attribute (may be empty).
 * @returns One mark per matching declaration, in declaration order.
 */
function marksFromInlineStyle(style: string): TextMark[] {
  const found: TextMark[] = [];
  if (!style) return found;

  for (const declaration of style.split(";")) {
    const text = declaration.trim();
    const separator = text.indexOf(":");
    // Skip empty pieces and pieces without a `name:value` shape.
    if (text.length === 0 || separator < 0) continue;

    const property = text.slice(0, separator).trim().toLowerCase();
    const setting = text.slice(separator + 1).trim().toLowerCase();

    if (property === "font-weight") {
      const weight = Number.parseInt(setting, 10);
      const isBold =
        (Number.isFinite(weight) && weight >= 600) || setting === "bold" || setting === "bolder";
      if (isBold) found.push({ type: "bold" });
    } else if (property === "font-style") {
      if (setting === "italic" || setting === "oblique") found.push({ type: "italic" });
    } else if (property === "text-decoration" || property === "text-decoration-line") {
      if (setting.includes("underline")) found.push({ type: "underline" });
      if (setting.includes("line-through")) found.push({ type: "strikethrough" });
    }
  }

  return found;
}
|
|
442
|
+
|
|
443
|
+
/**
 * Neutralise link targets that would execute or embed content when followed.
 *
 * The scheme is checked on a normalised copy of the value: ASCII tab, LF and
 * CR are removed everywhere and leading whitespace / C0 control characters
 * are dropped, mirroring what URL parsers do before reading the scheme.
 * Without that step `java\tscript:…` or `\u0001javascript:…` would pass a
 * plain prefix test and still resolve to a script URL.
 *
 * @param href Raw `href` attribute value.
 * @returns `"#"` for `javascript:`, `data:` and `vbscript:` targets;
 *   otherwise the original value, unmodified.
 */
function sanitizeHref(href: string): string {
  const probe = href
    .replace(/[\t\n\r]/g, "")
    .replace(/^[\s\u0000-\u001f]+/, "")
    .toLowerCase();
  if (/^(?:javascript|data|vbscript):/.test(probe)) {
    return "#";
  }
  return href;
}
|
|
@@ -63,7 +63,11 @@ export function parseCanonicalFragmentFromWordML(xml: string): ParseCanonicalFra
|
|
|
63
63
|
* outer element survives, it's added to the outer `<w:document>` wrapper.
|
|
64
64
|
*/
|
|
65
65
|
function ensureDocumentShell(xml: string): string {
|
|
66
|
-
|
|
66
|
+
// v5 A5: strip UTF-8 BOM before wrapper detection. Modern browsers /
|
|
67
|
+
// Word clipboards don't emit BOM, but some legacy tools do; a leading
|
|
68
|
+
// U+FEFF breaks the regex-based wrapper check and causes us to
|
|
69
|
+
// double-wrap a valid `<w:document>` payload.
|
|
70
|
+
const trimmed = xml.replace(/^\uFEFF/, "").trim();
|
|
67
71
|
const withoutDecl = trimmed.replace(/^<\?xml[^?]*\?>/, "").trim();
|
|
68
72
|
|
|
69
73
|
const hasDocumentWrapper = /^<w:document[\s>]/i.test(withoutDecl);
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
CanonicalDocument,
|
|
3
|
+
DocumentNode,
|
|
4
|
+
DocumentRootNode,
|
|
5
|
+
} from "../model/canonical-document.ts";
|
|
6
|
+
|
|
7
|
+
/**
 * Return the node a document walker should start from.
 *
 * The argument is either a raw `DocumentNode` or a `CanonicalDocument` (or a
 * `Pick` of it carrying `content`). Checking for `content` alone cannot tell
 * the two apart, because `DrawingFrameNode` also has a `content` property.
 * Every `DocumentNode` variant carries a top-level `type` discriminator and
 * `CanonicalDocument` does not, so a value is unwrapped only when it has
 * `content` and no top-level `type`.
 *
 * @param document A canonical document wrapper or a document node.
 * @returns `document.content` for a wrapper; otherwise `document` itself.
 */
export function resolveWalkableRoot(
  document: Pick<CanonicalDocument, "content"> | DocumentNode,
): DocumentNode {
  const isNode = "type" in document;
  if (isNode || !("content" in document)) {
    return document as DocumentNode;
  }
  return (document as { content: DocumentRootNode }).content;
}
|
package/src/legal/bookmarks.ts
CHANGED
|
@@ -7,6 +7,7 @@ import type {
|
|
|
7
7
|
FieldRegistry,
|
|
8
8
|
FieldRegistryEntry,
|
|
9
9
|
} from "../model/canonical-document.ts";
|
|
10
|
+
import { resolveWalkableRoot } from "./_document-root.ts";
|
|
10
11
|
|
|
11
12
|
export interface LegalBookmark {
|
|
12
13
|
bookmarkId: string;
|
|
@@ -93,7 +94,7 @@ export function parseBookmarksFromDocumentXml(xml: string): LegalBookmark[] {
|
|
|
93
94
|
export function collectBookmarksFromCanonicalDocument(
|
|
94
95
|
document: Pick<CanonicalDocument, "content"> | DocumentNode,
|
|
95
96
|
): LegalBookmark[] {
|
|
96
|
-
const root =
|
|
97
|
+
const root = resolveWalkableRoot(document);
|
|
97
98
|
const sequence: Array<BookmarkStartNode | BookmarkEndNode> = [];
|
|
98
99
|
|
|
99
100
|
walkDocument(root, (node) => {
|
|
@@ -219,7 +220,7 @@ function compareBookmarks(left: LegalBookmark, right: LegalBookmark): number {
|
|
|
219
220
|
export function buildBookmarkNameMap(
|
|
220
221
|
document: Pick<CanonicalDocument, "content"> | DocumentNode,
|
|
221
222
|
): Map<string, { bookmarkId: string; paragraphIndex: number }> {
|
|
222
|
-
const root =
|
|
223
|
+
const root = resolveWalkableRoot(document);
|
|
223
224
|
const map = new Map<string, { bookmarkId: string; paragraphIndex: number }>();
|
|
224
225
|
let paragraphIndex = -1;
|
|
225
226
|
|
|
@@ -289,7 +290,7 @@ export interface BookmarkRekeyPlan {
|
|
|
289
290
|
export function detectDuplicateBookmarkIds(
|
|
290
291
|
document: Pick<CanonicalDocument, "content"> | DocumentNode,
|
|
291
292
|
): BookmarkRekeyPlan {
|
|
292
|
-
const root =
|
|
293
|
+
const root = resolveWalkableRoot(document);
|
|
293
294
|
const seenStarts = new Set<string>();
|
|
294
295
|
const duplicatedIds = new Set<string>();
|
|
295
296
|
let maxNumericId = 0;
|
|
@@ -12,6 +12,7 @@ import type {
|
|
|
12
12
|
ParagraphNode,
|
|
13
13
|
TocEntry,
|
|
14
14
|
} from "../model/canonical-document.ts";
|
|
15
|
+
import { resolveWalkableRoot } from "./_document-root.ts";
|
|
15
16
|
|
|
16
17
|
export interface CrossReferencePattern {
|
|
17
18
|
kind: "section" | "clause" | "article" | "schedule" | "exhibit" | "appendix";
|
|
@@ -42,7 +43,7 @@ interface FieldReference {
|
|
|
42
43
|
export function collectFieldReferencesFromCanonicalDocument(
|
|
43
44
|
document: Pick<CanonicalDocument, "content"> | DocumentNode,
|
|
44
45
|
): Array<{ family: string; target: string; instruction: string; paragraphIndex: number; displayText: string }> {
|
|
45
|
-
const root =
|
|
46
|
+
const root = resolveWalkableRoot(document);
|
|
46
47
|
const results: Array<{ family: string; target: string; instruction: string; paragraphIndex: number; displayText: string }> = [];
|
|
47
48
|
let paragraphIndex = -1;
|
|
48
49
|
|
|
@@ -175,7 +176,7 @@ export function parseCrossReferencesFromDocumentXml(xml: string): CrossReference
|
|
|
175
176
|
export function collectCrossReferencesFromCanonicalDocument(
|
|
176
177
|
document: Pick<CanonicalDocument, "content"> | DocumentNode,
|
|
177
178
|
): CrossReference[] {
|
|
178
|
-
const root =
|
|
179
|
+
const root = resolveWalkableRoot(document);
|
|
179
180
|
const results: CrossReference[] = [];
|
|
180
181
|
let paragraphIndex = -1;
|
|
181
182
|
|
|
@@ -4,6 +4,7 @@ import type {
|
|
|
4
4
|
DocumentNode,
|
|
5
5
|
ParagraphNode,
|
|
6
6
|
} from "../model/canonical-document.ts";
|
|
7
|
+
import { resolveWalkableRoot } from "./_document-root.ts";
|
|
7
8
|
|
|
8
9
|
export interface DefinedTermOccurrence {
|
|
9
10
|
paragraphIndex: number;
|
|
@@ -43,7 +44,7 @@ export function collectDefinedTermsFromDocumentXml(xml: string): DefinedTerm[] {
|
|
|
43
44
|
export function collectDefinedTermsFromCanonicalDocument(
|
|
44
45
|
document: Pick<CanonicalDocument, "content"> | DocumentNode,
|
|
45
46
|
): DefinedTerm[] {
|
|
46
|
-
const root =
|
|
47
|
+
const root = resolveWalkableRoot(document);
|
|
47
48
|
const paragraphs: string[] = [];
|
|
48
49
|
|
|
49
50
|
walkDocument(root, (node) => {
|
|
@@ -4,6 +4,7 @@ import type {
|
|
|
4
4
|
DocumentNode,
|
|
5
5
|
ParagraphNode,
|
|
6
6
|
} from "../model/canonical-document.ts";
|
|
7
|
+
import { resolveWalkableRoot } from "./_document-root.ts";
|
|
7
8
|
|
|
8
9
|
export interface SignatureBlockCandidate {
|
|
9
10
|
startIndex: number;
|
|
@@ -39,7 +40,7 @@ const UNDERLINE_PLACEHOLDER_PATTERN = /_{4,}|\.{4,}/;
|
|
|
39
40
|
export function detectSignatureBlocksFromCanonicalDocument(
|
|
40
41
|
document: Pick<CanonicalDocument, "content" | "preservation"> | DocumentNode,
|
|
41
42
|
): SignatureBlockReport {
|
|
42
|
-
const root =
|
|
43
|
+
const root = resolveWalkableRoot(document);
|
|
43
44
|
const paragraphs: Array<{ text: string; node: ParagraphNode }> = [];
|
|
44
45
|
const warnings: string[] = [];
|
|
45
46
|
|