docrev 0.9.11 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +50 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +80 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/import.d.ts.map +1 -1
  43. package/dist/lib/import.js +146 -24
  44. package/dist/lib/import.js.map +1 -1
  45. package/dist/lib/pdf-comments.js +44 -44
  46. package/dist/lib/plugins.js +57 -57
  47. package/dist/lib/pptx-themes.js +115 -115
  48. package/dist/lib/spelling.js +2 -2
  49. package/dist/lib/templates.js +387 -387
  50. package/dist/lib/themes.js +51 -51
  51. package/dist/lib/types.d.ts +20 -0
  52. package/dist/lib/types.d.ts.map +1 -1
  53. package/dist/lib/word-extraction.d.ts +6 -0
  54. package/dist/lib/word-extraction.d.ts.map +1 -1
  55. package/dist/lib/word-extraction.js +46 -3
  56. package/dist/lib/word-extraction.js.map +1 -1
  57. package/dist/lib/wordcomments.d.ts.map +1 -1
  58. package/dist/lib/wordcomments.js +23 -5
  59. package/dist/lib/wordcomments.js.map +1 -1
  60. package/eslint.config.js +27 -27
  61. package/lib/anchor-match.ts +276 -276
  62. package/lib/annotations.ts +644 -644
  63. package/lib/build.ts +1300 -1227
  64. package/lib/citations.ts +160 -160
  65. package/lib/commands/build.ts +833 -801
  66. package/lib/commands/citations.ts +515 -515
  67. package/lib/commands/comments.ts +1050 -1050
  68. package/lib/commands/context.ts +174 -174
  69. package/lib/commands/core.ts +309 -309
  70. package/lib/commands/doi.ts +435 -435
  71. package/lib/commands/file-ops.ts +372 -372
  72. package/lib/commands/history.ts +320 -320
  73. package/lib/commands/index.ts +87 -87
  74. package/lib/commands/init.ts +259 -259
  75. package/lib/commands/merge-resolve.ts +378 -378
  76. package/lib/commands/preview.ts +178 -178
  77. package/lib/commands/project-info.ts +244 -244
  78. package/lib/commands/quality.ts +517 -517
  79. package/lib/commands/response.ts +454 -454
  80. package/lib/commands/section-boundaries.ts +82 -82
  81. package/lib/commands/sections.ts +451 -451
  82. package/lib/commands/sync.ts +706 -706
  83. package/lib/commands/text-ops.ts +449 -449
  84. package/lib/commands/utilities.ts +448 -448
  85. package/lib/commands/verify-anchors.ts +272 -272
  86. package/lib/commands/word-tools.ts +340 -340
  87. package/lib/comment-realign.ts +517 -517
  88. package/lib/config.ts +84 -84
  89. package/lib/crossref.ts +781 -781
  90. package/lib/csl.ts +191 -191
  91. package/lib/dependencies.ts +98 -98
  92. package/lib/diff-engine.ts +465 -465
  93. package/lib/doi-cache.ts +115 -115
  94. package/lib/doi.ts +897 -897
  95. package/lib/equations.ts +506 -506
  96. package/lib/errors.ts +346 -346
  97. package/lib/format.ts +541 -541
  98. package/lib/git.ts +326 -326
  99. package/lib/grammar.ts +303 -303
  100. package/lib/image-registry.ts +180 -180
  101. package/lib/import.ts +911 -792
  102. package/lib/journals.ts +543 -543
  103. package/lib/merge.ts +633 -633
  104. package/lib/orcid.ts +144 -144
  105. package/lib/pdf-comments.ts +263 -263
  106. package/lib/pdf-import.ts +524 -524
  107. package/lib/plugins.ts +362 -362
  108. package/lib/postprocess.ts +188 -188
  109. package/lib/pptx-color-filter.lua +37 -37
  110. package/lib/pptx-template.ts +469 -469
  111. package/lib/pptx-themes.ts +483 -483
  112. package/lib/protect-restore.ts +520 -520
  113. package/lib/rate-limiter.ts +94 -94
  114. package/lib/response.ts +197 -197
  115. package/lib/restore-references.ts +240 -240
  116. package/lib/review.ts +327 -327
  117. package/lib/schema.ts +417 -417
  118. package/lib/scientific-words.ts +73 -73
  119. package/lib/sections.ts +335 -335
  120. package/lib/slides.ts +756 -756
  121. package/lib/spelling.ts +334 -334
  122. package/lib/templates.ts +526 -526
  123. package/lib/themes.ts +742 -742
  124. package/lib/trackchanges.ts +247 -247
  125. package/lib/tui.ts +450 -450
  126. package/lib/types.ts +550 -530
  127. package/lib/undo.ts +250 -250
  128. package/lib/utils.ts +69 -69
  129. package/lib/variables.ts +179 -179
  130. package/lib/word-extraction.ts +806 -759
  131. package/lib/word.ts +643 -643
  132. package/lib/wordcomments.ts +817 -798
  133. package/package.json +137 -137
  134. package/scripts/postbuild.js +28 -28
  135. package/skill/REFERENCE.md +431 -431
  136. package/skill/SKILL.md +258 -258
  137. package/tsconfig.json +26 -26
  138. package/types/index.d.ts +525 -525
@@ -1,186 +1,186 @@
1
- /**
2
- * Build an adversarial DOCX with hand-crafted commentRangeStart/End
3
- * markers that probe whether docrev locates comments by *real*
4
- * anchor (the highlighted run text, taken from the docx XML) or by
5
- * guesswork. Each comment is designed to expose a specific failure
6
- * mode if the matcher were not anchor-aware.
7
- */
8
- import AdmZip from 'adm-zip';
9
- import path from 'path';
10
-
11
- const out = path.resolve('dev_notes/stress2/adversarial.docx');
12
-
13
- // Sentence with: a triplicated word, unicode look-alikes, mid-word
14
- // hyphenation, and embedded XML-special chars in the *prose* (escaped).
15
- const paragraphs: { text: string; comments?: { id: string; on: string }[] }[] = [
16
- // heading
17
- { text: 'Methods', comments: [] },
18
-
19
- // p0: single highlight, easy
20
- { text: 'The methods section describes our approach in detail.',
21
- comments: [{ id: '0', on: 'methods section' }] },
22
-
23
- // p1: word "model" appears 3x — anchor must pin the SECOND occurrence
24
- { text: 'The model is fitted, the model is checked, the model is reported.',
25
- comments: [{ id: '1', on: 'model' /* second occurrence */ }] },
26
-
27
- // p2: empty anchor (zero-width comment, classic Word "insert here")
28
- { text: 'Reviewers often place a marker between two sentences. Like this. Then continue.',
29
- comments: [{ id: '2', on: '' }] },
30
-
31
- // p3: anchor spanning across multiple runs (formatting break)
32
- { text: 'Our results show that p < 0.001 across all conditions.',
33
- comments: [{ id: '3', on: 'p < 0.001' }] },
34
-
35
- // p4: text containing literal angle brackets that would break a naive
36
- // [^<]* regex — anchor includes "(<1825)"
37
- { text: 'Trade volumes pre-industrial (<1825) were modest.',
38
- comments: [{ id: '4', on: '(<1825)' }] },
39
-
40
- // p5: very long anchor (full sentence) — matcher must still place it
41
- { text: 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.',
42
- comments: [{ id: '5', on: 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.' }] },
43
-
44
- // p6: anchor on punctuation only — should fall back to context
45
- { text: 'See Table 1 for details; numbers are rounded.',
46
- comments: [{ id: '6', on: ';' }] },
47
-
48
- // p7: two overlapping comment ranges share a word
49
- { text: 'The overrepresented species are listed in Appendix A.',
50
- comments: [
51
- { id: '7', on: 'overrepresented species' },
52
- { id: '8', on: 'overrepresented' },
53
- ] },
54
-
55
- // p8: anchor that exists VERBATIM elsewhere in the doc — context disambiguates
56
- { text: 'The cohort was small. Limitations are discussed in Section 5.',
57
- comments: [{ id: '9', on: 'small' }] },
58
- { text: 'The effect was small but significant.',
59
- comments: [{ id: '10', on: 'small' }] },
60
-
61
- // p9: heading paragraph (<w:pStyle w:val="Heading1"/>)
62
- { text: 'Discussion', comments: [], /* heading marker handled below */ },
63
- { text: 'In this section we situate the findings in prior literature.',
64
- comments: [{ id: '11', on: 'situate the findings' }] },
65
- ];
66
-
67
- function escapeXml(s: string): string {
68
- return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
69
- .replace(/"/g, '&quot;').replace(/'/g, '&apos;');
70
- }
71
-
72
- function buildParagraphXml(p: typeof paragraphs[0], isHeading: boolean): string {
73
- const text = p.text;
74
- const comments = p.comments || [];
75
-
76
- if (comments.length === 0) {
77
- return `<w:p>${isHeading ? '<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>' : ''}<w:r><w:t xml:space="preserve">${escapeXml(text)}</w:t></w:r></w:p>`;
78
- }
79
-
80
- // Plan ranges: each comment marks [start..end] character offsets into `text`.
81
- // Empty anchors place start=end at first sentence boundary.
82
- type Range = { id: string; start: number; end: number };
83
- const ranges: Range[] = [];
84
- for (const c of comments) {
85
- if (c.on === '') {
86
- const dotIdx = text.indexOf('.');
87
- const pos = dotIdx >= 0 ? dotIdx + 1 : 0;
88
- ranges.push({ id: c.id, start: pos, end: pos });
89
- continue;
90
- }
91
- if (c.id === '1') {
92
- // "model" — pick the SECOND occurrence
93
- const first = text.indexOf('model');
94
- const second = text.indexOf('model', first + 1);
95
- ranges.push({ id: c.id, start: second, end: second + 'model'.length });
96
- continue;
97
- }
98
- const start = text.indexOf(c.on);
99
- if (start < 0) throw new Error(`anchor not found: ${c.on}`);
100
- ranges.push({ id: c.id, start, end: start + c.on.length });
101
- }
102
-
103
- // Build event list: at each char boundary we may need to emit
104
- // <w:commentRangeStart/> or <w:commentRangeEnd/>.
105
- type Event = { pos: number; kind: 'start' | 'end'; id: string };
106
- const events: Event[] = [];
107
- for (const r of ranges) {
108
- events.push({ pos: r.start, kind: 'start', id: r.id });
109
- events.push({ pos: r.end, kind: 'end', id: r.id });
110
- }
111
- // Emit ends before starts at same position, so an empty anchor's start=end
112
- // ordering keeps a zero-width range
113
- events.sort((a, b) => a.pos - b.pos || (a.kind === 'end' ? -1 : 1));
114
-
115
- let xml = `<w:p>${isHeading ? '<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>' : ''}`;
116
- let cursor = 0;
117
- for (const ev of events) {
118
- if (ev.pos > cursor) {
119
- xml += `<w:r><w:t xml:space="preserve">${escapeXml(text.slice(cursor, ev.pos))}</w:t></w:r>`;
120
- cursor = ev.pos;
121
- }
122
- xml += ev.kind === 'start'
123
- ? `<w:commentRangeStart w:id="${ev.id}"/>`
124
- : `<w:commentRangeEnd w:id="${ev.id}"/><w:r><w:commentReference w:id="${ev.id}"/></w:r>`;
125
- }
126
- if (cursor < text.length) {
127
- xml += `<w:r><w:t xml:space="preserve">${escapeXml(text.slice(cursor))}</w:t></w:r>`;
128
- }
129
- xml += `</w:p>`;
130
- return xml;
131
- }
132
-
133
- const headingTexts = new Set(['Methods', 'Discussion']);
134
- const documentBody = paragraphs.map(p => buildParagraphXml(p, headingTexts.has(p.text))).join('');
135
-
136
- const documentXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
137
- <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
138
- <w:body>
139
- ${documentBody}
140
- <w:sectPr/>
141
- </w:body>
142
- </w:document>`;
143
-
144
- const allComments = paragraphs.flatMap(p => p.comments || []);
145
-
146
- const commentsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
147
- <w:comments xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
148
- ${allComments.map(c => ` <w:comment w:id="${c.id}" w:author="Reviewer" w:initials="R" w:date="2026-01-01T00:00:00Z"><w:p><w:r><w:t>Comment ${c.id} on &quot;${escapeXml(c.on)}&quot; — text contains <embedded brackets and &amp;.</w:t></w:r></w:p></w:comment>`.replace('<embedded', '&lt;embedded')).join('\n')}
149
- </w:comments>`;
150
-
151
- const contentTypesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
152
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
153
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
154
- <Default Extension="xml" ContentType="application/xml"/>
155
- <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
156
- <Override PartName="/word/comments.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"/>
157
- <Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/>
158
- </Types>`;
159
-
160
- const rootRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
161
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
162
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
163
- </Relationships>`;
164
-
165
- const docRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
166
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
167
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments" Target="comments.xml"/>
168
- <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/>
169
- </Relationships>`;
170
-
171
- const stylesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
172
- <w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
173
- <w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="heading 1"/></w:style>
174
- </w:styles>`;
175
-
176
- const zip = new AdmZip();
177
- zip.addFile('[Content_Types].xml', Buffer.from(contentTypesXml, 'utf8'));
178
- zip.addFile('_rels/.rels', Buffer.from(rootRelsXml, 'utf8'));
179
- zip.addFile('word/document.xml', Buffer.from(documentXml, 'utf8'));
180
- zip.addFile('word/comments.xml', Buffer.from(commentsXml, 'utf8'));
181
- zip.addFile('word/styles.xml', Buffer.from(stylesXml, 'utf8'));
182
- zip.addFile('word/_rels/document.xml.rels', Buffer.from(docRelsXml, 'utf8'));
183
- zip.writeZip(out);
184
-
185
- console.log(`wrote ${out}`);
186
- console.log(`comments: ${allComments.length}`);
1
+ /**
2
+ * Build an adversarial DOCX with hand-crafted commentRangeStart/End
3
+ * markers that probe whether docrev locates comments by *real*
4
+ * anchor (the highlighted run text, taken from the docx XML) or by
5
+ * guesswork. Each comment is designed to expose a specific failure
6
+ * mode if the matcher were not anchor-aware.
7
+ */
8
+ import AdmZip from 'adm-zip';
9
+ import path from 'path';
10
+
11
+ const out = path.resolve('dev_notes/stress2/adversarial.docx');
12
+
13
+ // Sentence with: a triplicated word, unicode look-alikes, mid-word
14
+ // hyphenation, and embedded XML-special chars in the *prose* (escaped).
15
+ const paragraphs: { text: string; comments?: { id: string; on: string }[] }[] = [
16
+ // heading
17
+ { text: 'Methods', comments: [] },
18
+
19
+ // p0: single highlight, easy
20
+ { text: 'The methods section describes our approach in detail.',
21
+ comments: [{ id: '0', on: 'methods section' }] },
22
+
23
+ // p1: word "model" appears 3x — anchor must pin the SECOND occurrence
24
+ { text: 'The model is fitted, the model is checked, the model is reported.',
25
+ comments: [{ id: '1', on: 'model' /* second occurrence */ }] },
26
+
27
+ // p2: empty anchor (zero-width comment, classic Word "insert here")
28
+ { text: 'Reviewers often place a marker between two sentences. Like this. Then continue.',
29
+ comments: [{ id: '2', on: '' }] },
30
+
31
+ // p3: anchor spanning across multiple runs (formatting break)
32
+ { text: 'Our results show that p < 0.001 across all conditions.',
33
+ comments: [{ id: '3', on: 'p < 0.001' }] },
34
+
35
+ // p4: text containing literal angle brackets that would break a naive
36
+ // [^<]* regex — anchor includes "(<1825)"
37
+ { text: 'Trade volumes pre-industrial (<1825) were modest.',
38
+ comments: [{ id: '4', on: '(<1825)' }] },
39
+
40
+ // p5: very long anchor (full sentence) — matcher must still place it
41
+ { text: 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.',
42
+ comments: [{ id: '5', on: 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.' }] },
43
+
44
+ // p6: anchor on punctuation only — should fall back to context
45
+ { text: 'See Table 1 for details; numbers are rounded.',
46
+ comments: [{ id: '6', on: ';' }] },
47
+
48
+ // p7: two overlapping comment ranges share a word
49
+ { text: 'The overrepresented species are listed in Appendix A.',
50
+ comments: [
51
+ { id: '7', on: 'overrepresented species' },
52
+ { id: '8', on: 'overrepresented' },
53
+ ] },
54
+
55
+ // p8: anchor that exists VERBATIM elsewhere in the doc — context disambiguates
56
+ { text: 'The cohort was small. Limitations are discussed in Section 5.',
57
+ comments: [{ id: '9', on: 'small' }] },
58
+ { text: 'The effect was small but significant.',
59
+ comments: [{ id: '10', on: 'small' }] },
60
+
61
+ // p9: heading paragraph (<w:pStyle w:val="Heading1"/>)
62
+ { text: 'Discussion', comments: [], /* heading marker handled below */ },
63
+ { text: 'In this section we situate the findings in prior literature.',
64
+ comments: [{ id: '11', on: 'situate the findings' }] },
65
+ ];
66
+
67
+ function escapeXml(s: string): string {
68
+ return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
69
+ .replace(/"/g, '&quot;').replace(/'/g, '&apos;');
70
+ }
71
+
72
+ function buildParagraphXml(p: typeof paragraphs[0], isHeading: boolean): string {
73
+ const text = p.text;
74
+ const comments = p.comments || [];
75
+
76
+ if (comments.length === 0) {
77
+ return `<w:p>${isHeading ? '<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>' : ''}<w:r><w:t xml:space="preserve">${escapeXml(text)}</w:t></w:r></w:p>`;
78
+ }
79
+
80
+ // Plan ranges: each comment marks [start..end] character offsets into `text`.
81
+ // Empty anchors place start=end at first sentence boundary.
82
+ type Range = { id: string; start: number; end: number };
83
+ const ranges: Range[] = [];
84
+ for (const c of comments) {
85
+ if (c.on === '') {
86
+ const dotIdx = text.indexOf('.');
87
+ const pos = dotIdx >= 0 ? dotIdx + 1 : 0;
88
+ ranges.push({ id: c.id, start: pos, end: pos });
89
+ continue;
90
+ }
91
+ if (c.id === '1') {
92
+ // "model" — pick the SECOND occurrence
93
+ const first = text.indexOf('model');
94
+ const second = text.indexOf('model', first + 1);
95
+ ranges.push({ id: c.id, start: second, end: second + 'model'.length });
96
+ continue;
97
+ }
98
+ const start = text.indexOf(c.on);
99
+ if (start < 0) throw new Error(`anchor not found: ${c.on}`);
100
+ ranges.push({ id: c.id, start, end: start + c.on.length });
101
+ }
102
+
103
+ // Build event list: at each char boundary we may need to emit
104
+ // <w:commentRangeStart/> or <w:commentRangeEnd/>.
105
+ type Event = { pos: number; kind: 'start' | 'end'; id: string };
106
+ const events: Event[] = [];
107
+ for (const r of ranges) {
108
+ events.push({ pos: r.start, kind: 'start', id: r.id });
109
+ events.push({ pos: r.end, kind: 'end', id: r.id });
110
+ }
111
+ // Emit ends before starts at same position, so an empty anchor's start=end
112
+ // ordering keeps a zero-width range
113
+ events.sort((a, b) => a.pos - b.pos || (a.kind === 'end' ? -1 : 1));
114
+
115
+ let xml = `<w:p>${isHeading ? '<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>' : ''}`;
116
+ let cursor = 0;
117
+ for (const ev of events) {
118
+ if (ev.pos > cursor) {
119
+ xml += `<w:r><w:t xml:space="preserve">${escapeXml(text.slice(cursor, ev.pos))}</w:t></w:r>`;
120
+ cursor = ev.pos;
121
+ }
122
+ xml += ev.kind === 'start'
123
+ ? `<w:commentRangeStart w:id="${ev.id}"/>`
124
+ : `<w:commentRangeEnd w:id="${ev.id}"/><w:r><w:commentReference w:id="${ev.id}"/></w:r>`;
125
+ }
126
+ if (cursor < text.length) {
127
+ xml += `<w:r><w:t xml:space="preserve">${escapeXml(text.slice(cursor))}</w:t></w:r>`;
128
+ }
129
+ xml += `</w:p>`;
130
+ return xml;
131
+ }
132
+
133
+ const headingTexts = new Set(['Methods', 'Discussion']);
134
+ const documentBody = paragraphs.map(p => buildParagraphXml(p, headingTexts.has(p.text))).join('');
135
+
136
+ const documentXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
137
+ <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
138
+ <w:body>
139
+ ${documentBody}
140
+ <w:sectPr/>
141
+ </w:body>
142
+ </w:document>`;
143
+
144
+ const allComments = paragraphs.flatMap(p => p.comments || []);
145
+
146
+ const commentsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
147
+ <w:comments xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
148
+ ${allComments.map(c => ` <w:comment w:id="${c.id}" w:author="Reviewer" w:initials="R" w:date="2026-01-01T00:00:00Z"><w:p><w:r><w:t>Comment ${c.id} on &quot;${escapeXml(c.on)}&quot; — text contains <embedded brackets and &amp;.</w:t></w:r></w:p></w:comment>`.replace('<embedded', '&lt;embedded')).join('\n')}
149
+ </w:comments>`;
150
+
151
+ const contentTypesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
152
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
153
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
154
+ <Default Extension="xml" ContentType="application/xml"/>
155
+ <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
156
+ <Override PartName="/word/comments.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"/>
157
+ <Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/>
158
+ </Types>`;
159
+
160
+ const rootRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
161
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
162
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
163
+ </Relationships>`;
164
+
165
+ const docRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
166
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
167
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments" Target="comments.xml"/>
168
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/>
169
+ </Relationships>`;
170
+
171
+ const stylesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
172
+ <w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
173
+ <w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="heading 1"/></w:style>
174
+ </w:styles>`;
175
+
176
+ const zip = new AdmZip();
177
+ zip.addFile('[Content_Types].xml', Buffer.from(contentTypesXml, 'utf8'));
178
+ zip.addFile('_rels/.rels', Buffer.from(rootRelsXml, 'utf8'));
179
+ zip.addFile('word/document.xml', Buffer.from(documentXml, 'utf8'));
180
+ zip.addFile('word/comments.xml', Buffer.from(commentsXml, 'utf8'));
181
+ zip.addFile('word/styles.xml', Buffer.from(stylesXml, 'utf8'));
182
+ zip.addFile('word/_rels/document.xml.rels', Buffer.from(docRelsXml, 'utf8'));
183
+ zip.writeZip(out);
184
+
185
+ console.log(`wrote ${out}`);
186
+ console.log(`comments: ${allComments.length}`);
@@ -1,62 +1,62 @@
1
- /**
2
- * Run anchors from the adversarial docx against a HEAVILY DRIFTED
3
- * markdown and report match quality per comment.
4
- */
5
- import { extractCommentAnchors } from '../../lib/word-extraction.js';
6
- import { findAnchorInText, classifyStrategy } from '../../lib/anchor-match.js';
7
-
8
- const docx = 'dev_notes/stress2/adversarial.docx';
9
-
10
- // Drifted markdown — same comments need to land on this.
11
- // - p0 paraphrased
12
- // - p1 reordered, only TWO occurrences of "model"
13
- // - p2 prose lightly edited, sentence boundary preserved
14
- // - p3 prose changed slightly: "p < 0.001 (n=412)"
15
- // - p4 prose moved (<1825)
16
- // - p5 mostly intact
17
- // - p6 entire sentence rewritten ("Round numbers are reported in Table 1.")
18
- // - p7 unchanged
19
- // - p8/p9 unchanged
20
- // - p10/p11 unchanged
21
- const drifted = [
22
- '# Methods', '',
23
- 'Below we describe our methodology in detail.', '',
24
- 'The model is fitted; later, the model is reported.', '',
25
- 'Reviewers often place a marker between two sentences. Like this. Then continue.', '',
26
- 'Our results show that p < 0.001 (n=412) across all conditions.', '',
27
- 'Trade volumes were modest pre-industrial (<1825).', '',
28
- 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.', '',
29
- 'Round numbers are reported in Table 1.', '',
30
- 'The overrepresented species are listed in Appendix A.', '',
31
- 'The cohort was small. Limitations are discussed in Section 5.',
32
- 'The effect was small but significant.', '',
33
- '# Discussion', '',
34
- 'In this section we situate the findings in prior literature.',
35
- ].join('\n');
36
-
37
- const { anchors } = await extractCommentAnchors(docx);
38
-
39
- const sorted = [...anchors.entries()].sort((a, b) => Number(a[0]) - Number(b[0]));
40
-
41
- const tally = { clean: 0, drift: 0, 'context-only': 0, ambiguous: 0, unmatched: 0 };
42
-
43
- console.log('id │ quality │ strategy │ #hits │ anchor');
44
- console.log('───┼───────────────┼─────────────────────┼───────┼──────────────────────');
45
- for (const [id, a] of sorted) {
46
- const r = findAnchorInText(a.anchor, drifted, a.before, a.after);
47
- let q: keyof typeof tally = classifyStrategy(r.strategy, r.occurrences.length) as any;
48
- if (r.occurrences.length > 1) q = 'ambiguous';
49
- tally[q]++;
50
- const tag = a.isEmpty ? '∅' : a.anchor.length > 30 ? a.anchor.slice(0, 27) + '...' : a.anchor;
51
- console.log(`${id.padStart(2)} │ ${q.padEnd(13)} │ ${r.strategy.padEnd(19)} │ ${String(r.occurrences.length).padStart(5)} │ ${JSON.stringify(tag)}`);
52
- }
53
-
54
- console.log('\nTally:', tally);
55
-
56
- // Also confirm: comment #6 (";") will fail anchor-direct match and force
57
- // context fallback (because the original sentence was rewritten).
58
- console.log('\nComment #6 (\";\") — sentence rewritten in drifted md.');
59
- const c6 = anchors.get('6')!;
60
- const r6 = findAnchorInText(c6.anchor, drifted, c6.before, c6.after);
61
- console.log(` strategy=${r6.strategy}, hits=${r6.occurrences.length}`);
62
- console.log(` ${r6.occurrences.length > 0 ? 'placed via fallback' : 'left for manual placement'}`);
1
+ /**
2
+ * Run anchors from the adversarial docx against a HEAVILY DRIFTED
3
+ * markdown and report match quality per comment.
4
+ */
5
+ import { extractCommentAnchors } from '../../lib/word-extraction.js';
6
+ import { findAnchorInText, classifyStrategy } from '../../lib/anchor-match.js';
7
+
8
+ const docx = 'dev_notes/stress2/adversarial.docx';
9
+
10
+ // Drifted markdown — same comments need to land on this.
11
+ // - p0 paraphrased
12
+ // - p1 reordered, only TWO occurrences of "model"
13
+ // - p2 prose lightly edited, sentence boundary preserved
14
+ // - p3 prose changed slightly: "p < 0.001 (n=412)"
15
+ // - p4 prose moved (<1825)
16
+ // - p5 mostly intact
17
+ // - p6 entire sentence rewritten ("Round numbers are reported in Table 1.")
18
+ // - p7 unchanged
19
+ // - p8/p9 unchanged
20
+ // - p10/p11 unchanged
21
+ const drifted = [
22
+ '# Methods', '',
23
+ 'Below we describe our methodology in detail.', '',
24
+ 'The model is fitted; later, the model is reported.', '',
25
+ 'Reviewers often place a marker between two sentences. Like this. Then continue.', '',
26
+ 'Our results show that p < 0.001 (n=412) across all conditions.', '',
27
+ 'Trade volumes were modest pre-industrial (<1825).', '',
28
+ 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.', '',
29
+ 'Round numbers are reported in Table 1.', '',
30
+ 'The overrepresented species are listed in Appendix A.', '',
31
+ 'The cohort was small. Limitations are discussed in Section 5.',
32
+ 'The effect was small but significant.', '',
33
+ '# Discussion', '',
34
+ 'In this section we situate the findings in prior literature.',
35
+ ].join('\n');
36
+
37
+ const { anchors } = await extractCommentAnchors(docx);
38
+
39
+ const sorted = [...anchors.entries()].sort((a, b) => Number(a[0]) - Number(b[0]));
40
+
41
+ const tally = { clean: 0, drift: 0, 'context-only': 0, ambiguous: 0, unmatched: 0 };
42
+
43
+ console.log('id │ quality │ strategy │ #hits │ anchor');
44
+ console.log('───┼───────────────┼─────────────────────┼───────┼──────────────────────');
45
+ for (const [id, a] of sorted) {
46
+ const r = findAnchorInText(a.anchor, drifted, a.before, a.after);
47
+ let q: keyof typeof tally = classifyStrategy(r.strategy, r.occurrences.length) as any;
48
+ if (r.occurrences.length > 1) q = 'ambiguous';
49
+ tally[q]++;
50
+ const tag = a.isEmpty ? '∅' : a.anchor.length > 30 ? a.anchor.slice(0, 27) + '...' : a.anchor;
51
+ console.log(`${id.padStart(2)} │ ${q.padEnd(13)} │ ${r.strategy.padEnd(19)} │ ${String(r.occurrences.length).padStart(5)} │ ${JSON.stringify(tag)}`);
52
+ }
53
+
54
+ console.log('\nTally:', tally);
55
+
56
+ // Also confirm: comment #6 (";") will fail anchor-direct match and force
57
+ // context fallback (because the original sentence was rewritten).
58
+ console.log('\nComment #6 (\";\") — sentence rewritten in drifted md.');
59
+ const c6 = anchors.get('6')!;
60
+ const r6 = findAnchorInText(c6.anchor, drifted, c6.before, c6.after);
61
+ console.log(` strategy=${r6.strategy}, hits=${r6.occurrences.length}`);
62
+ console.log(` ${r6.occurrences.length > 0 ? 'placed via fallback' : 'left for manual placement'}`);
@@ -1,35 +1,35 @@
1
- /**
2
- * Probe: do extracted anchors come from `<w:commentRangeStart/End>`
3
- * markers (i.e. real anchors), and does each comment carry a unique
4
- * docPosition that disambiguates duplicate prose?
5
- */
6
- import { extractCommentAnchors } from '../../lib/word-extraction.js';
7
-
8
- const docx = 'dev_notes/stress2/adversarial.docx';
9
- const { anchors, fullDocText } = await extractCommentAnchors(docx);
10
-
11
- console.log(`Doc text length: ${fullDocText.length}`);
12
- console.log(`Anchors extracted: ${anchors.size}\n`);
13
-
14
- const sorted = [...anchors.entries()].sort((a, b) => Number(a[0]) - Number(b[0]));
15
-
16
- for (const [id, a] of sorted) {
17
- const tag = a.isEmpty ? '∅(empty)' : JSON.stringify(a.anchor);
18
- console.log(`#${id} pos=${String(a.docPosition).padStart(4)} anchor=${tag}`);
19
- if (a.before) console.log(` before=${JSON.stringify(a.before.slice(-40))}`);
20
- if (a.after) console.log(` after =${JSON.stringify(a.after.slice(0, 40))}`);
21
- }
22
-
23
- // Sanity: comment 1 should pin the SECOND occurrence of "model"
24
- const c1 = anchors.get('1')!;
25
- const firstModel = fullDocText.indexOf('model');
26
- const secondModel = fullDocText.indexOf('model', firstModel + 1);
27
- console.log(`\nComment #1 expectation: pin SECOND "model" at pos ${secondModel}`);
28
- console.log(`Comment #1 actual docPosition=${c1.docPosition}`);
29
- console.log(`Match: ${c1.docPosition === secondModel ? 'YES (real-anchor)' : 'NO'}`);
30
-
31
- // Comment 7 vs 8: overlapping ranges, both anchored on "overrepresented..."
32
- const c7 = anchors.get('7')!;
33
- const c8 = anchors.get('8')!;
34
- console.log(`\nComment #7 anchor: ${JSON.stringify(c7.anchor)} (expected "overrepresented species")`);
35
- console.log(`Comment #8 anchor: ${JSON.stringify(c8.anchor)} (expected "overrepresented")`);
1
+ /**
2
+ * Probe: do extracted anchors come from `<w:commentRangeStart/End>`
3
+ * markers (i.e. real anchors), and does each comment carry a unique
4
+ * docPosition that disambiguates duplicate prose?
5
+ */
6
+ import { extractCommentAnchors } from '../../lib/word-extraction.js';
7
+
8
+ const docx = 'dev_notes/stress2/adversarial.docx';
9
+ const { anchors, fullDocText } = await extractCommentAnchors(docx);
10
+
11
+ console.log(`Doc text length: ${fullDocText.length}`);
12
+ console.log(`Anchors extracted: ${anchors.size}\n`);
13
+
14
+ const sorted = [...anchors.entries()].sort((a, b) => Number(a[0]) - Number(b[0]));
15
+
16
+ for (const [id, a] of sorted) {
17
+ const tag = a.isEmpty ? '∅(empty)' : JSON.stringify(a.anchor);
18
+ console.log(`#${id} pos=${String(a.docPosition).padStart(4)} anchor=${tag}`);
19
+ if (a.before) console.log(` before=${JSON.stringify(a.before.slice(-40))}`);
20
+ if (a.after) console.log(` after =${JSON.stringify(a.after.slice(0, 40))}`);
21
+ }
22
+
23
+ // Sanity: comment 1 should pin the SECOND occurrence of "model"
24
+ const c1 = anchors.get('1')!;
25
+ const firstModel = fullDocText.indexOf('model');
26
+ const secondModel = fullDocText.indexOf('model', firstModel + 1);
27
+ console.log(`\nComment #1 expectation: pin SECOND "model" at pos ${secondModel}`);
28
+ console.log(`Comment #1 actual docPosition=${c1.docPosition}`);
29
+ console.log(`Match: ${c1.docPosition === secondModel ? 'YES (real-anchor)' : 'NO'}`);
30
+
31
+ // Comment 7 vs 8: overlapping ranges, both anchored on "overrepresented..."
32
+ const c7 = anchors.get('7')!;
33
+ const c8 = anchors.get('8')!;
34
+ console.log(`\nComment #7 anchor: ${JSON.stringify(c7.anchor)} (expected "overrepresented species")`);
35
+ console.log(`Comment #8 anchor: ${JSON.stringify(c8.anchor)} (expected "overrepresented")`);
@@ -1,3 +1,3 @@
1
- # Discussion
2
-
3
- In this section we situate the findings in prior literature.
1
+ # Discussion
2
+
3
+ In this section we situate the findings in prior literature.
@@ -1,3 +1,3 @@
1
- # Discussion
2
-
3
- In this section we {>>Reviewer: Comment 11 on "situate the findings" — text contains <embedded brackets and &.<<}situate the findings in prior literature.
1
+ # Discussion
2
+
3
+ In this section we {>>Reviewer: Comment 11 on "situate the findings" — text contains <embedded brackets and &.<<}situate the findings in prior literature.
@@ -1,20 +1,20 @@
1
- # Methods
2
-
3
- Below we describe our methodology in detail.
4
-
5
- The model is fitted; later, the model is reported.
6
-
7
- Reviewers often place a marker between two sentences. Like this. Then continue.
8
-
9
- Our results show that p < 0.001 (n=412) across all conditions.
10
-
11
- Trade volumes were modest pre-industrial (<1825).
12
-
13
- We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.
14
-
15
- Round numbers are reported in Table 1.
16
-
17
- The overrepresented species are listed in Appendix A.
18
-
19
- The cohort was small. Limitations are discussed in Section 5.
20
- The effect was small but significant.
1
+ # Methods
2
+
3
+ Below we describe our methodology in detail.
4
+
5
+ The model is fitted; later, the model is reported.
6
+
7
+ Reviewers often place a marker between two sentences. Like this. Then continue.
8
+
9
+ Our results show that p < 0.001 (n=412) across all conditions.
10
+
11
+ Trade volumes were modest pre-industrial (<1825).
12
+
13
+ We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.
14
+
15
+ Round numbers are reported in Table 1.
16
+
17
+ The overrepresented species are listed in Appendix A.
18
+
19
+ The cohort was small. Limitations are discussed in Section 5.
20
+ The effect was small but significant.