@adeu/core 1.6.7 → 1.6.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3969 -1859
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +95 -8
- package/dist/index.d.ts +95 -8
- package/dist/index.js +3966 -1859
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/consistency.test.ts +134 -0
- package/src/diff.test.ts +13 -1
- package/src/diff.ts +220 -47
- package/src/docx/bridge.ts +111 -57
- package/src/docx/dom.ts +66 -7
- package/src/domain.test.ts +280 -0
- package/src/domain.ts +264 -10
- package/src/engine.bugs.test.ts +481 -0
- package/src/engine.ts +1346 -192
- package/src/index.ts +7 -8
- package/src/ingest.ts +8 -0
- package/src/markup.ts +160 -53
- package/src/outline.ts +199 -69
- package/src/sanitize/core.ts +130 -0
- package/src/sanitize/report.ts +125 -0
- package/src/sanitize/sanitize.test.ts +237 -0
- package/src/sanitize/transforms.ts +452 -0
- package/src/utils/docx.ts +292 -158
package/src/docx/bridge.ts
CHANGED
|
@@ -1,40 +1,54 @@
|
|
|
1
|
-
import JSZip from
|
|
2
|
-
import {
|
|
1
|
+
import JSZip from "jszip";
|
|
2
|
+
import {
|
|
3
|
+
parseXml,
|
|
4
|
+
findChild,
|
|
5
|
+
findAllDescendants,
|
|
6
|
+
serializeXml,
|
|
7
|
+
} from "./dom.js";
|
|
3
8
|
|
|
4
9
|
/**
 * A single relationship record: an id, a relationship type URI, a target,
 * and a flag telling whether the target lives outside the package.
 */
export class Relationship {
  public id: string;
  public type: string;
  public target: string;
  public isExternal: boolean;

  constructor(id: string, type: string, target: string, isExternal: boolean) {
    this.id = id;
    this.type = type;
    this.target = target;
    this.isExternal = isExternal;
  }
}
|
|
12
17
|
|
|
13
18
|
export class Part {
|
|
14
19
|
public rels: Map<string, Relationship> = new Map();
|
|
15
20
|
public _element: Element;
|
|
16
|
-
|
|
21
|
+
public package?: DocxPackage;
|
|
17
22
|
constructor(
|
|
18
23
|
public partname: string,
|
|
19
24
|
public blob: string,
|
|
20
25
|
element: Element,
|
|
21
|
-
public contentType: string
|
|
26
|
+
public contentType: string,
|
|
22
27
|
) {
|
|
23
28
|
this._element = element;
|
|
24
29
|
}
|
|
25
30
|
|
|
26
|
-
public addRelationship(
|
|
31
|
+
public addRelationship(
|
|
32
|
+
id: string,
|
|
33
|
+
type: string,
|
|
34
|
+
target: string,
|
|
35
|
+
isExternal: boolean = false,
|
|
36
|
+
) {
|
|
27
37
|
this.rels.set(id, new Relationship(id, type, target, isExternal));
|
|
28
|
-
|
|
29
|
-
//
|
|
30
|
-
if (this.
|
|
38
|
+
|
|
39
|
+
// Directly append the relationship element to the document structure
|
|
40
|
+
if (this.partname.endsWith(".rels")) {
|
|
31
41
|
const doc = this._element.ownerDocument;
|
|
32
42
|
if (doc) {
|
|
33
|
-
|
|
34
|
-
relEl.
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
43
|
+
// Use strict namespace to ensure it parses successfully on reload
|
|
44
|
+
const relEl = doc.createElementNS(
|
|
45
|
+
"http://schemas.openxmlformats.org/package/2006/relationships",
|
|
46
|
+
"Relationship",
|
|
47
|
+
);
|
|
48
|
+
relEl.setAttribute("Id", id);
|
|
49
|
+
relEl.setAttribute("Type", type);
|
|
50
|
+
relEl.setAttribute("Target", target);
|
|
51
|
+
if (isExternal) relEl.setAttribute("TargetMode", "External");
|
|
38
52
|
this._element.appendChild(relEl);
|
|
39
53
|
}
|
|
40
54
|
}
|
|
@@ -49,32 +63,44 @@ export class DocxPackage {
|
|
|
49
63
|
|
|
50
64
|
/**
 * Looks up a loaded part by path. The path may be given with or without a
 * leading slash; stored partnames in either form are matched.
 *
 * @returns the first matching part, or `undefined` when none matches.
 */
public getPartByPath(path: string): Part | undefined {
  // Normalise to the slash-less form (jszip style) before comparing.
  const bare = path.startsWith("/") ? path.substring(1) : path;
  for (const candidate of this.parts) {
    if (candidate.partname === bare || candidate.partname === "/" + bare) {
      return candidate;
    }
  }
  return undefined;
}
|
|
55
71
|
|
|
56
72
|
/**
 * Returns the first partname derived from `pattern` that is not yet present
 * in the package.
 *
 * The first "%d" in `pattern` is replaced by the empty string on the first
 * attempt and by "2", "3", ... on later attempts.
 *
 * @param pattern partname template containing a "%d" placeholder.
 * @returns an unused partname.
 * @throws Error when `pattern` has no "%d" placeholder and the literal name
 *   is already taken (every candidate would be identical, so the search
 *   could never terminate).
 */
public nextPartname(pattern: string): string {
  if (!pattern.includes("%d")) {
    if (!this.getPartByPath(pattern)) return pattern;
    throw new Error(
      `nextPartname: pattern "${pattern}" has no "%d" placeholder and that partname is already in use`,
    );
  }

  for (let i = 1; ; i++) {
    const candidate = pattern.replace("%d", i === 1 ? "" : i.toString());
    if (!this.getPartByPath(candidate)) return candidate;
  }
}
|
|
64
80
|
|
|
65
|
-
public addPart(
|
|
81
|
+
public addPart(
|
|
82
|
+
partname: string,
|
|
83
|
+
contentType: string,
|
|
84
|
+
xmlString: string,
|
|
85
|
+
): Part {
|
|
66
86
|
const doc = parseXml(xmlString);
|
|
67
|
-
const part = new Part(
|
|
87
|
+
const part = new Part(
|
|
88
|
+
partname,
|
|
89
|
+
xmlString,
|
|
90
|
+
doc.documentElement,
|
|
91
|
+
contentType,
|
|
92
|
+
);
|
|
93
|
+
part.package = this;
|
|
68
94
|
this.parts.push(part);
|
|
69
95
|
|
|
70
96
|
// Update [Content_Types].xml
|
|
71
|
-
const ctPart = this.getPartByPath(
|
|
97
|
+
const ctPart = this.getPartByPath("[Content_Types].xml");
|
|
72
98
|
if (ctPart) {
|
|
73
99
|
const docCT = ctPart._element.ownerDocument;
|
|
74
100
|
if (docCT) {
|
|
75
|
-
const override = docCT.createElement(
|
|
76
|
-
override.setAttribute(
|
|
77
|
-
override.setAttribute(
|
|
101
|
+
const override = docCT.createElement("Override");
|
|
102
|
+
override.setAttribute("PartName", partname);
|
|
103
|
+
override.setAttribute("ContentType", contentType);
|
|
78
104
|
ctPart._element.appendChild(override);
|
|
79
105
|
}
|
|
80
106
|
}
|
|
@@ -83,14 +109,18 @@ export class DocxPackage {
|
|
|
83
109
|
|
|
84
110
|
/**
 * Returns the `.rels` part that belongs to `sourcePartname`, creating an
 * empty one (and registering it via `addPart`) when it does not exist yet.
 *
 * Example: /word/document.xml -> /word/_rels/document.xml.rels
 */
public getOrCreateRelsPart(sourcePartname: string): Part {
  // Split into directory and file name at the last slash.
  const cut = sourcePartname.lastIndexOf("/");
  const directory = cut === -1 ? "" : sourcePartname.slice(0, cut);
  const fileName = sourcePartname.slice(cut + 1);
  const relsPath = directory + "/_rels/" + fileName + ".rels";

  const existing = this.getPartByPath(relsPath);
  if (existing) return existing;

  const emptyRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"></Relationships>`;
  return this.addPart(
    relsPath,
    "application/vnd.openxmlformats-package.relationships+xml",
    emptyRelsXml,
  );
}
|
|
@@ -98,63 +128,75 @@ export class DocxPackage {
|
|
|
98
128
|
|
|
99
129
|
export class DocumentObject {
|
|
100
130
|
public part: Part;
|
|
101
|
-
public settings: { oddAndEvenPagesHeaderFooter: boolean } = {
|
|
131
|
+
public settings: { oddAndEvenPagesHeaderFooter: boolean } = {
|
|
132
|
+
oddAndEvenPagesHeaderFooter: false,
|
|
133
|
+
};
|
|
102
134
|
// Simplification for the TS port: sections hold header/footer refs
|
|
103
|
-
public sections: any[] = [];
|
|
135
|
+
public sections: any[] = [];
|
|
104
136
|
|
|
105
|
-
constructor(
|
|
137
|
+
constructor(
|
|
138
|
+
public pkg: DocxPackage,
|
|
139
|
+
part: Part,
|
|
140
|
+
) {
|
|
106
141
|
this.part = part;
|
|
107
142
|
}
|
|
108
143
|
|
|
109
144
|
/**
 * The `w:body` child of the part's root element, or the root element itself
 * when no such child exists.
 */
public get element(): Element {
  const body = findChild(this.part._element, "w:body");
  if (body) return body;
  return this.part._element;
}
|
|
112
147
|
|
|
113
148
|
/**
|
|
114
149
|
* Main entrypoint for loading a DOCX buffer into the DOM wrapper.
|
|
115
150
|
*/
|
|
116
|
-
public static async load(
|
|
151
|
+
public static async load(
|
|
152
|
+
buffer: Buffer | ArrayBuffer,
|
|
153
|
+
): Promise<DocumentObject> {
|
|
117
154
|
const zip = await JSZip.loadAsync(buffer);
|
|
118
155
|
const pkg = new DocxPackage(zip);
|
|
119
156
|
|
|
120
157
|
// 1. Load Content Types
|
|
121
|
-
const ctFile = zip.file(
|
|
158
|
+
const ctFile = zip.file("[Content_Types].xml");
|
|
122
159
|
let contentTypes: Record<string, string> = {};
|
|
123
160
|
if (ctFile) {
|
|
124
|
-
const ctXml = parseXml(await ctFile.async(
|
|
125
|
-
const overrides = findAllDescendants(ctXml.documentElement,
|
|
161
|
+
const ctXml = parseXml(await ctFile.async("text"));
|
|
162
|
+
const overrides = findAllDescendants(ctXml.documentElement, "Override");
|
|
126
163
|
for (const override of overrides) {
|
|
127
|
-
contentTypes[override.getAttribute(
|
|
164
|
+
contentTypes[override.getAttribute("PartName") || ""] =
|
|
165
|
+
override.getAttribute("ContentType") || "";
|
|
128
166
|
}
|
|
129
167
|
}
|
|
130
168
|
|
|
131
169
|
// 2. Pre-load all XML parts to allow synchronous traversal later
|
|
132
170
|
for (const [path, file] of Object.entries(zip.files)) {
|
|
133
|
-
if (!file.dir && (path.endsWith(
|
|
134
|
-
const text = await file.async(
|
|
171
|
+
if (!file.dir && (path.endsWith(".xml") || path.endsWith(".rels"))) {
|
|
172
|
+
const text = await file.async("text");
|
|
135
173
|
const doc = parseXml(text);
|
|
136
|
-
const cType = contentTypes[
|
|
137
|
-
const part = new Part(
|
|
174
|
+
const cType = contentTypes["/" + path] || "application/xml";
|
|
175
|
+
const part = new Part("/" + path, text, doc.documentElement, cType);
|
|
176
|
+
part.package = pkg;
|
|
138
177
|
pkg.parts.push(part);
|
|
139
178
|
}
|
|
140
179
|
}
|
|
141
180
|
|
|
142
181
|
// 3. Resolve Relationships for the main document
|
|
143
|
-
const mainPart = pkg.getPartByPath(
|
|
144
|
-
if (!mainPart) throw new Error(
|
|
182
|
+
const mainPart = pkg.getPartByPath("word/document.xml");
|
|
183
|
+
if (!mainPart) throw new Error("Invalid DOCX: Missing word/document.xml");
|
|
145
184
|
pkg.mainDocumentPart = mainPart;
|
|
146
185
|
|
|
147
|
-
const relsPart = pkg.getPartByPath(
|
|
186
|
+
const relsPart = pkg.getPartByPath("word/_rels/document.xml.rels");
|
|
148
187
|
if (relsPart) {
|
|
149
|
-
const relElements = findAllDescendants(relsPart._element,
|
|
188
|
+
const relElements = findAllDescendants(relsPart._element, "Relationship");
|
|
150
189
|
for (const rel of relElements) {
|
|
151
|
-
const rId = rel.getAttribute(
|
|
152
|
-
const target = rel.getAttribute(
|
|
153
|
-
const type = rel.getAttribute(
|
|
154
|
-
const targetMode = rel.getAttribute(
|
|
155
|
-
|
|
190
|
+
const rId = rel.getAttribute("Id");
|
|
191
|
+
const target = rel.getAttribute("Target");
|
|
192
|
+
const type = rel.getAttribute("Type");
|
|
193
|
+
const targetMode = rel.getAttribute("TargetMode");
|
|
194
|
+
|
|
156
195
|
if (rId && target && type) {
|
|
157
|
-
mainPart.rels.set(
|
|
196
|
+
mainPart.rels.set(
|
|
197
|
+
rId,
|
|
198
|
+
new Relationship(rId, type, target, targetMode === "External"),
|
|
199
|
+
);
|
|
158
200
|
}
|
|
159
201
|
}
|
|
160
202
|
}
|
|
@@ -166,24 +208,36 @@ export class DocumentObject {
|
|
|
166
208
|
let rId = 1;
|
|
167
209
|
while (this.part.rels.has(`rId${rId}`)) rId++;
|
|
168
210
|
const id = `rId${rId}`;
|
|
169
|
-
|
|
211
|
+
|
|
170
212
|
// In DOCX, targets in .rels are relative to the source part's directory.
|
|
171
213
|
// /word/document.xml relating to /word/comments.xml -> target is "comments.xml"
|
|
172
|
-
const target = part.partname.split(
|
|
173
|
-
|
|
214
|
+
const target = part.partname.split("/").pop()!;
|
|
215
|
+
|
|
174
216
|
this.part.rels.set(id, new Relationship(id, relType, target, false));
|
|
175
217
|
const relsPart = this.pkg.getOrCreateRelsPart(this.part.partname);
|
|
176
218
|
relsPart.addRelationship(id, relType, target, false);
|
|
177
219
|
}
|
|
178
220
|
|
|
221
|
+
/**
 * Registers an external relationship (TargetMode="External") from this
 * document's part to `target`.
 *
 * The relationship is recorded both in the in-memory `rels` map of the part
 * and in the part's `.rels` file (created on demand).
 *
 * @param target  external target, e.g. a URL.
 * @param relType relationship type URI.
 * @returns the newly allocated relationship id ("rIdN").
 */
public relateToExternal(target: string, relType: string): string {
  // Allocate the lowest free rId.
  let n = 1;
  for (; this.part.rels.has(`rId${n}`); n++) {
    /* keep probing */
  }
  const newId = `rId${n}`;

  this.part.rels.set(newId, new Relationship(newId, relType, target, true));
  this.pkg
    .getOrCreateRelsPart(this.part.partname)
    .addRelationship(newId, relType, target, true);
  return newId;
}
|
|
231
|
+
|
|
179
232
|
/**
 * Serializes every loaded part back into the underlying zip and returns the
 * resulting archive as a Node buffer.
 *
 * Each part is serialized from its owner document (falling back to the
 * element itself); an XML declaration is prepended when the serializer did
 * not emit one.
 *
 * @returns the generated DOCX archive.
 */
public async save(): Promise<Buffer> {
  for (const part of this.pkg.parts) {
    let xmlStr = serializeXml(part._element.ownerDocument || part._element);
    if (!xmlStr.startsWith("<?xml")) {
      xmlStr =
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' + xmlStr;
    }
    // JSZip paths carry no leading slash. Strip it only when present:
    // partnames without one are accepted elsewhere (see getPartByPath) and
    // must not lose their first character.
    const zipPath = part.partname.startsWith("/")
      ? part.partname.substring(1)
      : part.partname;
    this.pkg.zip.file(zipPath, xmlStr);
  }
  return this.pkg.zip.generateAsync({ type: "nodebuffer" });
}
|
|
189
|
-
}
|
|
243
|
+
}
|
package/src/docx/dom.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { DOMParser, XMLSerializer } from
|
|
1
|
+
import { DOMParser, XMLSerializer } from "@xmldom/xmldom";
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Simulates docx.oxml.ns.qn. In xmldom, namespaces are preserved in tagName.
|
|
@@ -11,7 +11,10 @@ export const qn = (name: string) => name;
|
|
|
11
11
|
export function findChild(element: Element, tagName: string): Element | null {
|
|
12
12
|
for (let i = 0; i < element.childNodes.length; i++) {
|
|
13
13
|
const child = element.childNodes[i];
|
|
14
|
-
if (
|
|
14
|
+
if (
|
|
15
|
+
child.nodeType === 1 /* ELEMENT_NODE */ &&
|
|
16
|
+
(child as Element).tagName === tagName
|
|
17
|
+
) {
|
|
15
18
|
return child as Element;
|
|
16
19
|
}
|
|
17
20
|
}
|
|
@@ -35,7 +38,10 @@ export function findChildren(element: Element, tagName: string): Element[] {
|
|
|
35
38
|
/**
 * Simulates lxml element.findall(".//w:tag"): collects ALL descendants of
 * `element` whose tag name equals `tagName` into a plain array.
 */
export function findAllDescendants(
  element: Element,
  tagName: string,
): Element[] {
  const hits = element.getElementsByTagName(tagName);
  const collected: Element[] = [];
  for (let i = 0, total = hits.length; i < total; i++) {
    collected.push(hits[i]);
  }
  return collected;
}
|
|
41
47
|
|
|
@@ -43,12 +49,65 @@ export function findAllDescendants(element: Element, tagName: string): Element[]
|
|
|
43
49
|
* Parses raw XML strings into xmldom Documents.
|
|
44
50
|
*/
|
|
45
51
|
export function parseXml(xmlString: string): Document {
  // Always parse as generic XML ("text/xml").
  const parser = new DOMParser();
  const parsed = parser.parseFromString(xmlString, "text/xml");
  return parsed;
}
|
|
48
54
|
|
|
49
55
|
/**
 * Serializes an xmldom Document or Element back to a string,
 * enforcing deterministic attribute ordering on the root element.
 *
 * Only the root start tag is rewritten: `xmlns` / `xmlns:*` declarations come
 * first, then all other attributes, each group sorted by name. Everything
 * else in the serialized output is returned untouched.
 *
 * The root tag is left exactly as serialized when it has no attributes, when
 * anything other than a single `<?xml ...?>` declaration precedes it, or when
 * its attribute text cannot be fully parsed (so no attribute is ever lost).
 *
 * @param node Document or Element to serialize.
 * @returns the serialized XML.
 */
export function serializeXml(node: Node): string {
  const xml = new XMLSerializer().serializeToString(node);

  // BUG-11: Deterministic namespace ordering on root elements.
  // First start tag that carries attributes. The name is matched by exclusion
  // so hyphens, dots and non-ASCII letters are accepted; a leading "?" or "!"
  // is rejected so declarations, PIs, comments and DOCTYPE are skipped.
  const tagMatch = /<([^\s<>\/?!][^\s<>\/]*)(\s+[^>]+?)(>|\/>)/.exec(xml);
  if (!tagMatch) return xml;

  // Ensure this is the absolute root tag (only <?xml...?> allowed before it)
  const textBefore = xml.substring(0, tagMatch.index);
  const isRoot =
    !textBefore.includes("<") ||
    (textBefore.trim().startsWith("<?xml") &&
      (textBefore.match(/</g) || []).length === 1);
  if (!isRoot) return xml;

  const [fullTag, tagName, attrsStr, tagEnd] = tagMatch;

  // name = "value" or name = 'value'; values may span lines, and names may
  // contain any character that is neither whitespace nor "=".
  const attrRegex = /([^\s=]+)\s*=\s*(?:"[^"]*"|'[^']*')/g;
  const attrs: { name: string; text: string }[] = [];
  let m: RegExpExecArray | null;
  while ((m = attrRegex.exec(attrsStr)) !== null) {
    attrs.push({ name: m[1], text: m[0] });
  }

  // Safety net: if any non-whitespace text was not recognised as an
  // attribute, rebuilding the tag would drop it — keep the original instead.
  if (attrs.length === 0 || attrsStr.replace(attrRegex, "").trim() !== "") {
    return xml;
  }

  // Sort attributes: xmlns definitions first, then standard attributes
  attrs.sort((a, b) => {
    const aIsXmlns = a.name.startsWith("xmlns");
    const bIsXmlns = b.name.startsWith("xmlns");
    if (aIsXmlns !== bIsXmlns) return aIsXmlns ? -1 : 1;
    return a.name < b.name ? -1 : a.name > b.name ? 1 : 0;
  });

  const newTag = `<${tagName} ${attrs.map((a) => a.text).join(" ")}${tagEnd}`;
  return (
    xml.substring(0, tagMatch.index) +
    newTag +
    xml.substring(tagMatch.index + fullTag.length)
  );
}
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { createTestDocument, addParagraph } from './test-utils.js';
|
|
3
|
+
import { DocumentObject } from './docx/bridge.js';
|
|
4
|
+
import { extractTextFromBuffer } from './ingest.js';
|
|
5
|
+
import { RedlineEngine, BatchValidationError } from './engine.js';
|
|
6
|
+
import { ModifyText } from './models.js';
|
|
7
|
+
import { split_structural_appendix } from './pagination.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Test helper: appends a w:bookmarkStart / w:bookmarkEnd pair to `paragraph`.
 * When `text` is non-empty, a run holding that text is placed between the
 * two markers (with xml:space="preserve" if the text contains a space).
 */
function addBookmark(paragraph: Element, name: string, idVal: string = "0", text: string = "") {
  const owner = paragraph.ownerDocument!;

  const opening = owner.createElement('w:bookmarkStart');
  opening.setAttribute('w:name', name);
  opening.setAttribute('w:id', idVal);
  paragraph.appendChild(opening);

  if (text.length > 0) {
    const run = owner.createElement('w:r');
    const textNode = owner.createElement('w:t');
    textNode.textContent = text;
    if (text.indexOf(' ') !== -1) {
      textNode.setAttribute('xml:space', 'preserve');
    }
    run.appendChild(textNode);
    paragraph.appendChild(run);
  }

  const closing = owner.createElement('w:bookmarkEnd');
  closing.setAttribute('w:id', idVal);
  paragraph.appendChild(closing);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Test helper: appends a w:fldSimple REF field pointing at bookmark `refName`
 * to `paragraph`, with a single run displaying `text`.
 */
function addCrossReference(paragraph: Element, refName: string, text: string) {
  const owner = paragraph.ownerDocument!;

  const field = owner.createElement('w:fldSimple');
  field.setAttribute('w:instr', ` REF ${refName} \\h `);

  const run = owner.createElement('w:r');
  const textNode = owner.createElement('w:t');
  textNode.textContent = text;
  if (text.indexOf(' ') !== -1) {
    textNode.setAttribute('xml:space', 'preserve');
  }

  run.appendChild(textNode);
  field.appendChild(run);
  paragraph.appendChild(field);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Test helper: registers `url` as an external hyperlink relationship on
 * `docObj` and appends a w:hyperlink (referencing the new relationship id)
 * containing a single run with `text` to `paragraph`.
 */
function addHyperlink(docObj: DocumentObject, paragraph: Element, url: string, text: string) {
  const relationshipId = docObj.relateToExternal(url, 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink');

  const owner = paragraph.ownerDocument!;
  const link = owner.createElement('w:hyperlink');
  link.setAttribute('r:id', relationshipId);

  const run = owner.createElement('w:r');
  const textNode = owner.createElement('w:t');
  textNode.textContent = text;
  if (text.indexOf(' ') !== -1) {
    textNode.setAttribute('xml:space', 'preserve');
  }

  run.appendChild(textNode);
  link.appendChild(run);
  paragraph.appendChild(link);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Test helper: adds a /word/footnotes.xml part (one separator footnote and
 * one real footnote with id 1) to the package and relates the main document
 * part to it.
 */
function setupFootnotesPart(docObj: DocumentObject) {
  const footnotesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:type="separator" w:id="-1">
<w:p><w:r><w:separator/></w:r></w:p>
</w:footnote>
<w:footnote w:id="1">
<w:p><w:r><w:t>Footnote content.</w:t></w:r></w:p>
</w:footnote>
</w:footnotes>`;

  const footnotesPart = docObj.pkg.addPart(
    '/word/footnotes.xml',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',
    footnotesXml,
  );
  docObj.relateTo(
    footnotesPart,
    'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
  );
}
|
|
76
|
+
|
|
77
|
+
/**
 * Builds the fixture document used by the domain-semantics tests and returns
 * it saved as a buffer. The document contains: two numbered headings with
 * definitions, a bookmarked clause with a cross-reference to it, an internal
 * `_Ref` anchor plus `_GoBack` / `_Toc` bookmarks, a footnote reference with
 * its footnotes part, an external hyperlink and a cross-reference to the
 * `_Ref` anchor.
 */
async function createDomainSemanticsStream() {
  const doc = await createTestDocument();

  // Inserts an empty w:pPr as the first child of a paragraph.
  const prependParagraphProps = (p: Element) => {
    p.insertBefore(p.ownerDocument!.createElement('w:pPr'), p.firstChild);
  };

  // Appends a space-preserving text run to a paragraph.
  const appendPreservedRun = (p: Element, content: string) => {
    const run = p.ownerDocument!.createElement('w:r');
    const textNode = p.ownerDocument!.createElement('w:t');
    textNode.textContent = content;
    textNode.setAttribute('xml:space', 'preserve');
    run.appendChild(textNode);
    p.appendChild(run);
  };

  // 1. Appendix / Definitions
  prependParagraphProps(addParagraph(doc, "1. Definitions"));
  addParagraph(doc, '"Affiliate" means any entity that controls, is controlled by, or is under common control.');
  addParagraph(doc, "“Confidential Information” means all non-public information disclosed by one party to the other.");
  addParagraph(doc, "This paragraph does not define anything.");

  prependParagraphProps(addParagraph(doc, "2. Obligations"));
  addParagraph(doc, "The Affiliate shall protect the Confidential Information to the highest standard.");

  // 3. Bookmarks and Cross-References
  const bookmarked = addParagraph(doc, "Subject to ");
  addBookmark(bookmarked, "MyBookmark_1", "1", "Anchored Clause");
  appendPreservedRun(bookmarked, ", the parties agree to...");

  const referencing = addParagraph(doc, "As strictly stated in ");
  addCrossReference(referencing, "MyBookmark_1", "Anchored Clause");
  appendPreservedRun(referencing, ", either party may terminate.");

  // 4. Internal Anchors
  addBookmark(addParagraph(doc, "Section 5. Indemnification"), "_Ref12345", "0");

  const noisy = addParagraph(doc, "Some text.");
  addBookmark(noisy, "_GoBack", "2");
  addBookmark(noisy, "_Toc123456789", "3");

  // 5. Footnotes
  const withFootnote = addParagraph(doc, "Sentence with footnote");
  const footnoteRun = withFootnote.ownerDocument!.createElement('w:r');
  const footnoteRef = withFootnote.ownerDocument!.createElement('w:footnoteReference');
  footnoteRef.setAttribute('w:id', "1");
  footnoteRun.appendChild(footnoteRef);
  withFootnote.appendChild(footnoteRun);
  setupFootnotesPart(doc);

  // 6. Links and Cross references
  addHyperlink(doc, addParagraph(doc, "Please visit "), "https://adeu.com", "Adeu HQ");
  addCrossReference(addParagraph(doc, "As detailed in "), "_Ref12345", "Section 5");

  return doc.save();
}
|
|
136
|
+
|
|
137
|
+
// End-to-end checks of the domain-semantics projection: the read-only
// structural appendix, rejection of edits that would corrupt anchors,
// footnotes, links or cross-references, footnote editing, and the
// defined-term / typo diagnostics.
describe('Domain Semantics Engine', () => {
  it('extracts and projects structural appendix and diagnostics correctly', async () => {
    const projected = await extractTextFromBuffer(await createDomainSemanticsStream());

    const present = (...needles: string[]) => {
      for (const needle of needles) expect(projected).toContain(needle);
    };
    const absent = (...needles: string[]) => {
      for (const needle of needles) expect(projected).not.toContain(needle);
    };

    present("<!-- READONLY_BOUNDARY_START -->", "# Document Structure (Read-Only)");

    // Definitions
    present("## Defined Terms", '"Affiliate"', '"Confidential Information"', "used 1 times");

    // Named Anchors & Back-References
    present("## Named Anchors", "MyBookmark_1", "Anchored to:", "Referenced from:");

    // Internal anchors & Noise suppression
    present("{#_Ref12345}", "Section 5. Indemnification{#_Ref12345}");
    absent("{#_GoBack}", "{#_Toc123456789}");

    // Footnotes
    present("[^fn-1]", "## Footnotes", "[^fn-1]: Footnote content.");

    // Links
    present("[Adeu HQ](https://adeu.com)", "[~Section 5~](#_Ref12345)");
  });

  // Each case: an edit that must be rejected, plus a predicate over the
  // lower-cased, newline-joined validation errors.
  const edgeCases: { target: string; newText: string; errChecker: (m: string) => boolean }[] = [
    {
      target: "# Document Structure (Read-Only)",
      newText: "# Modified Document Structure",
      errChecker: (m: string) => m.includes('read-only boundary') || m.includes('appendix'),
    },
    {
      target: "Sentence with footnote[^fn-1]",
      newText: "Sentence with footnote",
      errChecker: (m: string) => m.includes('footnote') && (m.includes('delete') || m.includes('remove')),
    },
    {
      target: "Sentence with footnote",
      newText: "Sentence with footnote[^fn-99]",
      errChecker: (m: string) => m.includes('footnote') && (m.includes('insert') || m.includes('create')),
    },
    {
      target: "Some text.",
      newText: "Some text.{#_Ref99999}",
      errChecker: (m: string) => m.includes('anchor'),
    },
    {
      target: "Section 5. Indemnification{#_Ref12345}",
      newText: "Section 5. Indemnification{#_Ref99999}",
      errChecker: (m: string) => m.includes('anchor'),
    },
    {
      target: "[~Section 5~](#_Ref12345)",
      newText: "[~Section 6~](#_Ref12345)",
      errChecker: (m: string) => m.includes('cross-reference') || m.includes('rejected'),
    },
    {
      target: "[~Section 5~](#_Ref12345)",
      newText: "[~Section 5~](#_Ref99999)",
      errChecker: (m: string) => m.includes('dependency corruption') || m.includes('rejected'),
    },
    {
      target: "As detailed in [~Section 5~](#_Ref12345)",
      newText: "As detailed in [~Section 5~](#_Ref12345) and [~Section 6~](#_Ref999)",
      errChecker: (m: string) => m.includes('cross-reference') || m.includes('read-only'),
    },
    {
      target: "As detailed in [~Section 5~](#_Ref12345)",
      newText: "As detailed in nothing",
      errChecker: (m: string) => m.includes('cross-reference') || m.includes('delete'),
    },
    {
      target: "Please visit [Adeu HQ](https://adeu.com)",
      newText: "Please visit [Adeu HQ](https://adeu.com) and [Google](https://google.com)",
      errChecker: (m: string) => m.includes('hyperlink') || m.includes('insert'),
    },
    {
      target: "Please visit [Adeu HQ](https://adeu.com)",
      newText: "Please visit nothing",
      errChecker: (m: string) => m.includes('hyperlink') || m.includes('delete'),
    },
  ];

  edgeCases.forEach((tc) => {
    it(`rejects invalid edits: ${tc.target} -> ${tc.newText}`, async () => {
      const loaded = await DocumentObject.load(await createDomainSemanticsStream());
      const engine = new RedlineEngine(loaded);
      const edit: ModifyText = { type: 'modify', target_text: tc.target, new_text: tc.newText };

      let threw = false;
      let caught: unknown;
      try {
        engine.process_batch([edit]);
      } catch (err) {
        threw = true;
        caught = err;
      }

      if (threw) {
        // Anything other than a validation failure is an unexpected error.
        if (!(caught instanceof BatchValidationError)) throw caught;
        const combined = caught.errors.join('\n').toLowerCase();
        expect(tc.errChecker(combined)).toBe(true);
      }
      expect(threw).toBe(true);
    });
  });

  it('safely edits footnotes and accepts changes', async () => {
    const loaded = await DocumentObject.load(await createDomainSemanticsStream());
    const engine = new RedlineEngine(loaded);

    const footnoteEdit: ModifyText = {
      type: 'modify',
      target_text: "Footnote content.",
      new_text: "This is an edited footnote.",
    };
    const stats = engine.process_batch([footnoteEdit]);
    expect(stats.edits_applied).toBe(1);

    engine.accept_all_revisions();
    const saved = await loaded.save();
    const cleanText = await extractTextFromBuffer(saved, true);

    expect(cleanText).toContain("[^fn-1]: This is an edited footnote.");
  });

  it('extracts defined terms and finds typos correctly', async () => {
    const doc = await createTestDocument();
    const paragraphs = [
      '"Agreement" means this contract.',
      "“Party” shall mean either side.",
      '"Agreement" means another thing.', // Duplicate
      'This contract (hereinafter, the "Contract") is valid.',
      '"Confidential Information" on salainen asia.',
      '1.1 "Affiliate" tarkoittaa osakkuusyhtiötä.',
      'We will act as the disclosing party (jäljempänä "Discloser").',
      'This is a syntax example: ("Heading*") and ("<Term>")',
      "The Agreement is binding. The Contract is signed.",
      "There is an Agrement here.",
      "We shared Confidential Information with the Affiliate. The Discloser is happy.",
    ];
    for (const paragraph of paragraphs) addParagraph(doc, paragraph);

    const full_text = await extractTextFromBuffer(await doc.save(), false);
    const appendix = split_structural_appendix(full_text)[1];

    for (const term of ['Agreement', 'Contract', 'Confidential Information', 'Affiliate', 'Discloser']) {
      expect(appendix).toContain(`"${term}" \u2014 used`);
    }

    for (const rejected of ['"Party"', '"Heading*"', '"<Term>"']) {
      expect(appendix).not.toContain(rejected);
    }

    expect(appendix).toContain("[Error] Duplicate Definition: 'Agreement' is defined multiple times.");
    expect(appendix).toContain("[Info] Possible Typos for 'Agreement': Found 'Agrement'");
  });

  it('reduces typo noise for short acronyms', async () => {
    const doc = await createTestDocument();
    const paragraphs = [
      '"PSUs" means power supply units.',
      '"CPU" means central processing unit.',
      '"Party" means the entity.',
      "We rely on ESAs, LSPs, and GPUs for the servers.",
      "The GPU is very fast.",
      "The Pary signed the contract.",
      "We bought PSUs and a CPU.",
      "The Party begins today.",
    ];
    for (const paragraph of paragraphs) addParagraph(doc, paragraph);

    const full_text = await extractTextFromBuffer(await doc.save(), false);
    const appendix = split_structural_appendix(full_text)[1];

    expect(appendix).toContain("[Info] Possible Typos for 'Party': Found 'Pary'");
    for (const acronym of ["'GPU'", "'GPUs'", "'ESAs'", "'LSPs'"]) {
      expect(appendix).not.toContain(acronym);
    }
  });
});
|