@adeu/core 1.9.0 → 1.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,511 @@
1
+ /**
2
+ * Regression tests for GitHub Issue #23:
3
+ * "Malformed comments.xml when creating the comments part from scratch (+ smaller findings)"
4
+ *
5
+ * All tests in this file are DETECTION tests: they are expected to FAIL until
6
+ * the described bug is fixed. They must NOT be changed to accommodate the
7
+ * current broken behaviour.
8
+ *
9
+ * Cross-platform parity: matching tests live in
10
+ * python/tests/test_repro_issue23.py
11
+ */
12
+
13
+ import { describe, it, expect } from "vitest";
14
+ import { execSync, execFileSync } from "node:child_process";
15
+ import { existsSync, writeFileSync, unlinkSync } from "node:fs";
16
+ import { resolve, dirname } from "node:path";
17
+ import { tmpdir } from "node:os";
18
+ import { fileURLToPath } from "node:url";
19
+
20
+ import { DocumentObject } from "./docx/bridge.js";
21
+ import { RedlineEngine } from "./engine.js";
22
+ import { extractTextFromBuffer } from "./ingest.js";
23
+ import { createTestDocument, addParagraph } from "./test-utils.js";
24
+ import { serializeXml } from "./docx/dom.js";
25
+
// ES modules have no CommonJS `__filename` / `__dirname` globals, so both are
// rebuilt here from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
28
+
29
+ // ---------------------------------------------------------------------------
30
+ // Helpers
31
+ // ---------------------------------------------------------------------------
32
+
/** Content type used to recognise the comments part among the package's parts. */
const CT_COMMENTS = "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml" as const;
35
+
/**
 * Serialises the package's comments part to an XML string.
 *
 * The part is looked up by content type, not by path. When the part's root
 * element has an owner document, that document is serialised; otherwise the
 * element itself is.
 *
 * @param doc - Document whose package parts are searched.
 * @returns The serialised XML of the first part with the comments content type.
 * @throws Error when no part carries the comments content type.
 */
function getCommentsXml(doc: DocumentObject): string {
  for (const candidate of doc.pkg.parts) {
    if (candidate.contentType !== CT_COMMENTS) continue;
    const root = candidate._element;
    return serializeXml(root.ownerDocument ?? root);
  }
  throw new Error("No comments.xml part found in package");
}
45
+
/**
 * Locates the `xmllint` executable on PATH.
 *
 * The lookup shells out to `where` on Windows and `which` everywhere else and
 * takes the first non-empty line the locator prints.
 *
 * @returns The first path reported by the locator, or `null` when the locator
 *   fails or prints nothing. This function never throws for a missing binary.
 */
function findXmllint(): string | null {
  const locator = process.platform === "win32" ? "where" : "which";
  try {
    const output = execSync(`${locator} xmllint`, {
      encoding: "utf-8",
      // Without an explicit stdio, the child's stderr is forwarded to this
      // process, so every failed lookup would print the locator's "not found"
      // notice into the test output. Only stdout is needed here.
      stdio: ["ignore", "pipe", "ignore"],
    });
    const firstHit = output
      .split(/\r?\n/)
      .map((line) => line.trim())
      .find((line) => line.length > 0);
    return firstHit ?? null;
  } catch {
    // The locator exits non-zero (or cannot be spawned) when nothing is found.
    return null;
  }
}
64
+
/**
 * Runs `xmllint --noout` over an XML string as an optional external check.
 *
 * The content is written to a file in the OS temp directory, checked, and the
 * file is removed again whether or not the check succeeds. When `xmllint`
 * cannot be located the function returns without doing anything, so the suite
 * does not fail on machines that lack it (common on Windows dev boxes); the
 * in-code namespace assertions still run.
 *
 * @param xmlContent - XML text to check.
 * @param label - Suffix of the temp-file name; also quoted in the error message.
 * @throws Error when running xmllint on the file fails. The message carries
 *   xmllint's stderr, or the underlying error message if stderr is empty.
 *   Failures to write the temp file propagate unchanged.
 */
function xmllint(xmlContent: string, label = "test.xml"): void {
  const xmllintBin = findXmllint();
  if (!xmllintBin) {
    return;
  }

  // The pid keeps parallel test workers that start within the same millisecond
  // from sharing (and deleting) each other's temp file.
  const tmpFile = resolve(
    tmpdir(),
    `adeu_issue23_${process.pid}_${Date.now()}_${label}`,
  );
  try {
    // Kept outside the inner try so an I/O failure is not reported as an
    // xmllint validation failure.
    writeFileSync(tmpFile, xmlContent, "utf-8");
    try {
      execFileSync(xmllintBin, ["--noout", tmpFile], { encoding: "utf-8" });
    } catch (err: unknown) {
      const stderr = (err as { stderr?: unknown } | null | undefined)?.stderr;
      let detail: string;
      if (typeof stderr === "string" && stderr.trim() !== "") {
        detail = stderr;
      } else if (err instanceof Error) {
        detail = err.message;
      } else {
        detail = String(err);
      }
      throw new Error(`xmllint validation failed for ${label}:\n${detail}`);
    }
  } finally {
    if (existsSync(tmpFile)) unlinkSync(tmpFile);
  }
}
86
+
87
+ // ===========================================================================
88
+ // Bug #1 (primary) — comments.xml missing xmlns:w14 on freshly created part
89
+ // ===========================================================================
90
+
describe("BUG-23-1: comments.xml xmlns:w14 namespace on fresh document", () => {
  /**
   * A comments part created from scratch (no pre-existing comments part) must
   * declare xmlns:w14 on its root <w:comments> element so that the
   * w14:paraId / w14:textId attributes on child <w:p> elements are valid.
   *
   * Without the declaration xmllint emits:
   *   namespace error: Namespace prefix w14 for paraId on p is not defined
   *
   * Cross-platform parity: TestCommentsXmlNamespace in test_repro_issue23.py
   */

  // The exact declaration is asserted: a wrong URI is as bad as a missing one.
  const W14_DECLARATION =
    'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"';

  /**
   * Creates a test document with a single paragraph and attaches `comment` to
   * `anchor` through a modify whose new_text equals its target_text.
   */
  async function buildCommentedDocument(
    paragraph: string,
    anchor: string,
    comment: string,
  ) {
    const doc = await createTestDocument();
    addParagraph(doc, paragraph);
    const engine = new RedlineEngine(doc, "Test Author");
    engine.process_batch([
      { type: "modify", target_text: anchor, new_text: anchor, comment },
    ]);
    return doc;
  }

  it("comments.xml declares xmlns:w14 on a fresh (comment-free) document", async () => {
    const doc = await buildCommentedDocument(
      "The only paragraph in this document.",
      "only",
      "Forces creation of comments.xml from scratch",
    );

    const serialised = getCommentsXml(doc);

    expect(serialised).toContain("xmlns:w14=");
    expect(serialised).toContain(W14_DECLARATION);
  });

  it("comments.xml passes xmllint validation on a fresh document", async () => {
    const doc = await buildCommentedDocument(
      "The only paragraph in this document.",
      "only",
      "Forces creation of comments.xml from scratch",
    );

    // xmllint() throws when the external check reports an error.
    xmllint(getCommentsXml(doc), "comments_fresh.xml");
  });

  it("serialised DOCX can be reloaded without namespace errors", async () => {
    const original = await buildCommentedDocument(
      "Hello world, this is a roundtrip test.",
      "Hello",
      "Roundtrip comment",
    );

    const saved = await original.save();

    // A lenient XML parser will not throw on reload, so the comments part of
    // the reloaded document is inspected explicitly.
    const reloaded = await DocumentObject.load(saved);
    const reloadedComments = getCommentsXml(reloaded);
    expect(reloadedComments).toContain(W14_DECLARATION);
    xmllint(reloadedComments, "comments_roundtrip.xml");

    const extracted = await extractTextFromBuffer(saved);
    expect(extracted).toContain("Hello");
  });

  it(
    "comments.xml declares xmlns:w14 when existing part lacks it (legacy/pandoc source)",
    async () => {
      /**
       * Node-side blind spot: _ensureNamespaces() is a no-op stub. When a
       * document already carries a comments.xml that omits xmlns:w14, adding
       * a comment must still produce valid output.
       *
       * Cross-platform parity: test_comments_xml_declares_w14_on_doc_with_bare_legacy_part
       */
      const doc = await createTestDocument();
      addParagraph(doc, "Anchor text for the legacy-part test.");

      // Pre-seed a comments part whose root deliberately omits xmlns:w14.
      const legacyRoot =
        `<w:comments xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">` +
        `</w:comments>`;
      doc.pkg.addPart("/word/comments.xml", CT_COMMENTS, legacyRoot);
      const legacyPart = doc.pkg.parts.find(
        (p) => p.contentType === CT_COMMENTS,
      )!;
      doc.relateTo(
        legacyPart,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
      );

      new RedlineEngine(doc, "Test Author").process_batch([
        {
          type: "modify",
          target_text: "Anchor",
          new_text: "Anchor",
          comment: "Comment on doc with legacy bare comments part",
        },
      ]);

      const serialised = getCommentsXml(doc);
      expect(serialised).toContain("xmlns:w14=");
      xmllint(serialised, "comments_legacy.xml");
    },
  );
});
218
+
219
+ // ===========================================================================
220
+ // Bug #2 — Inserted runs inherit anchor paragraph's character formatting
221
+ // ===========================================================================
222
+
describe("BUG-23-2: inserted runs must not inherit italic formatting from anchor", () => {
  /**
   * When modify inserts text into an italic paragraph, the inserted w:ins/w:r
   * must NOT automatically be italic. There is currently no override mechanism.
   *
   * Cross-platform parity: TestInsertedRunFormatting in test_repro_issue23.py
   */

  it("inserted run does not carry w:i when anchor paragraph is italic", async () => {
    const doc = await createTestDocument();

    // Hand-build <w:p><w:r><w:rPr><w:i/></w:rPr><w:t>…</w:t></w:r></w:p> so the
    // anchor run is explicitly italic.
    const xmlDoc = doc.element.ownerDocument!;
    const italicProps = xmlDoc.createElement("w:rPr");
    italicProps.appendChild(xmlDoc.createElement("w:i"));

    const textNode = xmlDoc.createElement("w:t");
    textNode.setAttribute("xml:space", "preserve");
    textNode.textContent = "italicized anchor text here";

    const run = xmlDoc.createElement("w:r");
    run.appendChild(italicProps);
    run.appendChild(textNode);

    const paragraph = xmlDoc.createElement("w:p");
    paragraph.appendChild(run);
    doc.element.appendChild(paragraph);

    const engine = new RedlineEngine(doc, "Test Author");
    engine.process_batch([
      { type: "modify", target_text: "anchor", new_text: "plain" },
    ]);

    const buf = await doc.save();

    // Inspect document.xml exactly as it was written into the saved zip.
    const { unzipSync, strFromU8 } = await import("fflate");
    const entries = unzipSync(new Uint8Array(buf));
    const documentPart = entries["word/document.xml"];
    if (!documentPart) {
      // Fail with a readable message instead of an opaque decoder error.
      throw new Error("word/document.xml is missing from the saved package");
    }
    const docXml = strFromU8(documentPart);

    // Inner XML of every <w:ins>…</w:ins> element.
    const insertedBlocks = [
      ...docXml.matchAll(/<w:ins\b[^>]*>([\s\S]*?)<\/w:ins>/g),
    ].map((match) => match[1] ?? "");

    // Guard against a vacuous pass: replacing "anchor" with "plain" has to
    // yield at least one tracked insertion, otherwise the italic check below
    // would succeed without having looked at anything.
    expect(insertedBlocks.length).toBeGreaterThan(0);

    // `\b` stops <w:i from also matching <w:iCs> or <w:ins>.
    const italicInInserted = insertedBlocks
      .filter((inner) => /<w:i\b/.test(inner))
      .map((inner) => inner.slice(0, 300));

    expect(italicInInserted).toHaveLength(0);
    // If this fails: BUG-23-2 — italic was inherited from surrounding paragraph
  });
});
276
+
277
+ // ===========================================================================
278
+ // Bug #3 — modify diff placement ignores new_text ordering
279
+ // ===========================================================================
280
+
describe("BUG-23-3: prefix insertion must land BEFORE the anchor, not after", () => {
  /**
   * The diff engine always appends the delta AFTER the common match,
   * regardless of where the delta sits in new_text.
   *
   * Cross-platform parity: TestDiffPlacement in test_repro_issue23.py
   */

  it("target='fox', new_text='red fox': inserts 'red' BEFORE fox, fox is kept", async () => {
    const doc = await createTestDocument();
    addParagraph(doc, "The quick brown fox jumps over the lazy dog.");

    new RedlineEngine(doc, "Test Author").process_batch([
      { type: "modify", target_text: "fox", new_text: "red fox" },
    ]);

    const text = await extractTextFromBuffer(await doc.save());

    // "fox" is the preserved anchor, so it must not be struck out.
    expect(text).not.toContain("{--fox--}");

    // The inserted prefix has to show up as a tracked insertion.
    const insertion = /\{\+\+red\s*\+\+\}/.exec(text);
    if (insertion === null) {
      throw new Error(
        `BUG-23-3: {++red...++} insertion not found in output.\nFull text: ${text}`,
      );
    }

    const anchorAt = text.indexOf("fox");
    const insertionAt = text.indexOf(insertion[0]);
    // The anchor must still be present…
    expect(anchorAt).toBeGreaterThanOrEqual(0);
    // …and the insertion must precede it.
    expect(insertionAt).toBeLessThan(anchorAt);
  });

  it(
    "new_text='Summary\\n\\nConclusion': paragraph separator is preserved before anchor",
    async () => {
      /**
       * target_text="Conclusion", new_text="Summary\\n\\nConclusion"
       * Expected: a "Summary" paragraph inserted BEFORE "Conclusion", with a
       * paragraph break between them.
       * Bug behaviour: "Summary" dropped, merged into the Conclusion
       * paragraph, or appended after it.
       *
       * Cross-platform parity: 'BUG-23-3b' in test_repro_issue23.py
       */
      const doc = await createTestDocument();
      for (const paragraph of [
        "Introduction paragraph.",
        "Conclusion paragraph.",
      ]) {
        addParagraph(doc, paragraph);
      }

      new RedlineEngine(doc, "Test Author").process_batch([
        {
          type: "modify",
          target_text: "Conclusion",
          new_text: "Summary\n\nConclusion",
        },
      ]);

      const text = await extractTextFromBuffer(await doc.save());

      expect(text).toContain("Summary");

      const summaryAt = text.indexOf("Summary");
      const conclusionAt = text.indexOf("Conclusion");
      expect(summaryAt).toBeLessThan(conclusionAt);

      // The two must stay separate paragraphs: a newline has to sit between them.
      expect(text.slice(summaryAt, conclusionAt)).toContain("\n");
    },
  );
});
357
+
358
+ // ===========================================================================
359
+ // Bug #4 — Multi-paragraph target_text is silently corrupt or opaque error
360
+ // ===========================================================================
361
+
describe("BUG-23-4: multi-paragraph target_text must produce actionable feedback", () => {
  /**
   * A target_text containing \\n\\n collapses the paragraph break in the token
   * stream and misaligns the diff (silent corruption), or yields an opaque
   * 'Target text not found' with no explanation.
   *
   * Correct behaviour: either handle multi-paragraph targets properly, or
   * reject them with a clear, actionable error message.
   *
   * Cross-platform parity: TestMultiParagraphTarget in test_repro_issue23.py
   */

  it("rejects multi-paragraph target_text with a clear error or handles it correctly", async () => {
    const doc = await createTestDocument();
    addParagraph(doc, "First paragraph content.");
    addParagraph(doc, "Second paragraph content.");

    const engine = new RedlineEngine(doc, "Test Author");

    let failure: Error | null = null;
    try {
      engine.process_batch([
        {
          type: "modify",
          target_text: "First paragraph content.\n\nSecond paragraph content.",
          new_text: "Single replacement paragraph.",
        },
      ]);
    } catch (thrown: unknown) {
      failure = thrown as Error;
    }

    if (failure !== null) {
      // Rejected: the message has to point at the multi-paragraph nature of
      // the target, otherwise the caller cannot act on it.
      const message = failure.message.toLowerCase();
      const isActionable = [
        "paragraph",
        "multi",
        "boundary",
        "newline",
        "cross",
      ].some((keyword) => message.includes(keyword));
      expect(isActionable).toBe(true);
      return;
    }

    // Accepted: make sure the paragraph boundary was not silently collapsed.
    // Bug signature: both paragraphs merged into one deleted token without
    // the \n\n separator, e.g.
    //   {--First paragraph content.Second paragraph content.--}
    const text = await extractTextFromBuffer(await doc.save());

    expect(text).not.toContain(
      "First paragraph content.Second paragraph content.",
    );
    // A space-separated merge ("content. Second") is equally broken.
    expect(text).not.toContain(
      "First paragraph content. Second paragraph content.",
    );
  });

  it("BUG-23-4-NN: rejects plain-paragraph N->N modifications spanning a paragraph boundary to prevent silent corruption", async () => {
    const doc = await createTestDocument();
    addParagraph(doc, "Clause 1 ends here.");
    addParagraph(doc, "Clause 2 begins here.");

    const engine = new RedlineEngine(doc, "Test Author");

    let failure: unknown = null;
    try {
      engine.process_batch([
        {
          type: "modify",
          target_text: "ends here.\n\nClause 2 begins",
          new_text: "ends here. MERGED\n\nClause 2 begins CHANGED",
        },
      ]);
    } catch (thrown: unknown) {
      failure = thrown;
    }

    // The batch must be refused…
    expect(failure).not.toBeNull();
    if (failure) {
      // …by the validation layer, with a message naming the paragraph boundary.
      const rejection = failure as { name: string; message: string };
      expect(rejection.name).toBe("BatchValidationError");
      expect(rejection.message.toLowerCase()).toContain("paragraph boundary");
    }
  });
});
452
+
453
+ // ===========================================================================
454
+ // Bug #5 — Ambiguous-match check counts text inside w:del
455
+ // ===========================================================================
456
+
describe("BUG-23-5: tracked-deleted text must not count toward ambiguity", () => {
  /**
   * Once one copy of a duplicated string has been tracked-deleted (it sits
   * inside a w:del element), the remaining live copy must be uniquely
   * matchable. The current engine counts the dead copy as a live occurrence
   * and reports "Ambiguous match — target text appears 2 times".
   *
   * Cross-platform parity: TestAmbiguousMatchDel in test_repro_issue23.py
   */

  it("one live copy remains after tracked deletion; modify must not report ambiguous", async () => {
    const firstDoc = await createTestDocument();
    for (const paragraph of ["Context A: Dupe", "Context B: Dupe"]) {
      addParagraph(firstDoc, paragraph);
    }

    // Pass 1: delete the first occurrence, made unique by its full context.
    new RedlineEngine(firstDoc, "Test Author").process_batch([
      { type: "modify", target_text: "Context A: Dupe", new_text: "" },
    ]);
    const afterDeletion = await firstDoc.save();

    // Sanity check: the first copy is now reported as a tracked deletion.
    const deletionText = await extractTextFromBuffer(afterDeletion);
    expect(deletionText).toContain("{--Context A: Dupe--}");

    // Pass 2: only "Context B: Dupe" is live, so "Dupe" must match uniquely.
    const secondDoc = await DocumentObject.load(afterDeletion);
    const secondEngine = new RedlineEngine(secondDoc, "Test Author");

    let thrown: Error | null = null;
    try {
      secondEngine.process_batch([
        { type: "modify", target_text: "Dupe", new_text: "Unique" },
      ]);
    } catch (caught: unknown) {
      thrown = caught as Error;
    }

    // If this fails: BUG-23-5 — the w:del copy was counted as a live match
    expect(thrown).toBeNull();

    // Reached only when nothing was thrown (the assertion above aborts otherwise).
    const finalText = await extractTextFromBuffer(await secondDoc.save());
    // 'Unique' must appear as a TRACKED INSERTION, not as a tracked deletion.
    // A {--Unique--} instead would mean the engine edited the w:del text.
    expect(finalText).toContain("{++Unique++}");
    // Guard against "edited both copies": the w:del text must be untouched.
    expect(finalText).not.toContain("{--Unique--}");
  });
});