docrev 0.9.15 → 0.9.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +41 -46
  2. package/dist/lib/anchor-match.d.ts +1 -1
  3. package/dist/lib/anchor-match.d.ts.map +1 -1
  4. package/dist/lib/anchor-match.js +47 -17
  5. package/dist/lib/anchor-match.js.map +1 -1
  6. package/dist/lib/build.d.ts +1 -0
  7. package/dist/lib/build.d.ts.map +1 -1
  8. package/dist/lib/build.js +9 -4
  9. package/dist/lib/build.js.map +1 -1
  10. package/dist/lib/wordcomments.d.ts.map +1 -1
  11. package/dist/lib/wordcomments.js +25 -2
  12. package/dist/lib/wordcomments.js.map +1 -1
  13. package/docs-src/build.py +113 -0
  14. package/docs-src/extra.css +208 -0
  15. package/docs-src/md-to-html.lua +6 -0
  16. package/docs-src/template.html +116 -0
  17. package/lib/anchor-match.ts +49 -17
  18. package/lib/build.ts +10 -4
  19. package/lib/wordcomments.ts +25 -2
  20. package/mkdocs.yml +64 -0
  21. package/package.json +1 -1
  22. package/dev_notes/bug_repro_comment_parser.md +0 -71
  23. package/dev_notes/stress2/adversarial.docx +0 -0
  24. package/dev_notes/stress2/build_adversarial.ts +0 -186
  25. package/dev_notes/stress2/drift_matcher.ts +0 -62
  26. package/dev_notes/stress2/probe_anchors.ts +0 -35
  27. package/dev_notes/stress2/project/adversarial.docx +0 -0
  28. package/dev_notes/stress2/project/discussion.before.md +0 -3
  29. package/dev_notes/stress2/project/discussion.md +0 -3
  30. package/dev_notes/stress2/project/methods.before.md +0 -20
  31. package/dev_notes/stress2/project/methods.md +0 -20
  32. package/dev_notes/stress2/project/rev.yaml +0 -5
  33. package/dev_notes/stress2/project/sections.yaml +0 -4
  34. package/dev_notes/stress2/sections.yaml +0 -5
  35. package/dev_notes/stress2/trace_placement.ts +0 -50
  36. package/dev_notes/stresstest_boundaries.ts +0 -27
  37. package/dev_notes/stresstest_drift_apply.ts +0 -43
  38. package/dev_notes/stresstest_drift_compare.ts +0 -43
  39. package/dev_notes/stresstest_drift_v2.ts +0 -54
  40. package/dev_notes/stresstest_inspect.ts +0 -54
  41. package/dev_notes/stresstest_pstyle.ts +0 -55
  42. package/dev_notes/stresstest_section_debug.ts +0 -23
  43. package/dev_notes/stresstest_split.ts +0 -70
  44. package/dev_notes/stresstest_trace.ts +0 -19
  45. package/dev_notes/stresstest_verify_no_overwrite.ts +0 -40
@@ -11,6 +11,8 @@ export type AnchorStrategy =
11
11
  | 'stripped'
12
12
  | 'partial-start'
13
13
  | 'partial-start-stripped'
14
+ | 'partial-window'
15
+ | 'partial-window-stripped'
14
16
  | 'context-both'
15
17
  | 'context-before'
16
18
  | 'context-after'
@@ -171,41 +173,69 @@ export function findAnchorInText(
171
173
  return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
172
174
  }
173
175
 
174
- // Strategy 4: first N words of anchor (long anchors)
176
+ // Strategy 4: word window from anchor (prefix or interior).
177
+ // Sliding the window across the anchor catches the case where the
178
+ // anchor's prefix has been edited but a chunk in the middle/end
179
+ // survived intact (e.g. "Sensitivity analyses were performed by
180
+ // perturbing the prior variance" → drifted "Sensitivity analyses
181
+ // perturbed the prior variance" still contains "the prior variance").
175
182
  const words = anchor.split(/\s+/);
176
183
  if (words.length > 3) {
177
184
  for (let n = Math.min(6, words.length); n >= 3; n--) {
178
- const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
179
- if (partialAnchor.length >= 15) {
180
- occurrences = findAllOccurrences(textLower, partialAnchor);
181
- if (occurrences.length > 0) {
182
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
185
+ for (let start = 0; start + n <= words.length; start++) {
186
+ const window = words.slice(start, start + n).join(' ');
187
+ const windowLower = window.toLowerCase();
188
+ if (windowLower.length < 15) continue;
189
+
190
+ let occ = findAllOccurrences(textLower, windowLower);
191
+ if (occ.length > 0) {
192
+ const strategy: AnchorStrategy = start === 0 ? 'partial-start' : 'partial-window';
193
+ return { occurrences: occ, matchedAnchor: window, strategy };
183
194
  }
184
- occurrences = findAllOccurrences(strippedLower, partialAnchor);
185
- if (occurrences.length > 0) {
186
- return {
187
- occurrences,
188
- matchedAnchor: words.slice(0, n).join(' '),
189
- strategy: 'partial-start-stripped',
190
- stripped: true,
191
- };
195
+ occ = findAllOccurrences(strippedLower, windowLower);
196
+ if (occ.length > 0) {
197
+ const strategy: AnchorStrategy = start === 0 ? 'partial-start-stripped' : 'partial-window-stripped';
198
+ return { occurrences: occ, matchedAnchor: window, strategy, stripped: true };
192
199
  }
193
200
  }
194
201
  }
195
202
  }
196
203
 
197
- // Strategy 5: context (before/after) only
204
+ // Strategy 5: context (before/after) only.
205
+ //
206
+ // For a non-empty anchor that already failed every text-based strategy
207
+ // above, we treat context as a degraded placement: classify it
208
+ // 'context-only' so callers can warn the user. We also reject
209
+ // implausible brackets — if both contexts match but the gap between
210
+ // them is far too small to contain the anchor (e.g. the anchored
211
+ // sentence was deleted), do not silently land the comment between
212
+ // the surviving sentences. Return 'failed' so the user is told to
213
+ // place it manually.
198
214
  if (before || after) {
199
215
  const beforeLower = before.toLowerCase();
200
216
  const afterLower = after.toLowerCase();
217
+ const anchorLen = anchor.length;
201
218
 
202
219
  if (before && after) {
203
220
  const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
204
221
  if (beforeIdx !== -1) {
205
222
  const searchStart = beforeIdx + beforeLower.slice(-50).length;
206
223
  const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
207
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
208
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
224
+ if (afterIdx !== -1) {
225
+ const gap = afterIdx - searchStart;
226
+ // Require the bracket to plausibly contain a remnant of the anchor.
227
+ // Below 30% of anchor length: anchor was deleted — refuse to place.
228
+ // Above 2× anchor length + slack: brackets are too far apart, the
229
+ // matcher has latched onto unrelated repeats of common context.
230
+ const minGap = Math.floor(anchorLen * 0.3);
231
+ const maxGap = Math.min(500, anchorLen * 2 + 50);
232
+ if (gap >= minGap && gap <= maxGap) {
233
+ return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
234
+ }
235
+ // Both brackets found but gap implausible: anchor likely deleted.
236
+ // Don't fall back to single-side context — that would silently
237
+ // place the comment in the wrong location.
238
+ return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
209
239
  }
210
240
  }
211
241
  }
@@ -262,6 +292,8 @@ export function classifyStrategy(strategy: AnchorStrategy, occurrences: number):
262
292
  case 'stripped':
263
293
  case 'partial-start':
264
294
  case 'partial-start-stripped':
295
+ case 'partial-window':
296
+ case 'partial-window-stripped':
265
297
  case 'split-match':
266
298
  return 'drift';
267
299
  case 'context-both':
package/lib/build.ts CHANGED
@@ -74,6 +74,7 @@ export interface PdfConfig {
74
74
  export interface DocxConfig {
75
75
  reference?: string | null;
76
76
  keepComments?: boolean;
77
+ affiliationNewline?: boolean;
77
78
  toc?: boolean;
78
79
  }
79
80
 
@@ -232,7 +233,8 @@ export const DEFAULT_CONFIG: BuildConfig = {
232
233
  },
233
234
  docx: {
234
235
  reference: null,
235
- keepComments: true,
236
+ keepComments: false,
237
+ affiliationNewline: true,
236
238
  toc: false,
237
239
  },
238
240
  tex: {
@@ -675,12 +677,16 @@ function generateMarkdownAuthorBlock(config: BuildConfig): string {
675
677
  lines.push('');
676
678
 
677
679
  // Affiliation lines: ^1^ Department of ...
678
- for (const [key, text] of Object.entries(config.affiliations)) {
680
+ const affiliationEntries = Object.entries(config.affiliations);
681
+ const useLineBreaks = config.docx.affiliationNewline !== false;
682
+ affiliationEntries.forEach(([key, text], idx) => {
679
683
  const num = keyToNum.get(key);
680
684
  if (num !== undefined) {
681
- lines.push(`^${num}^ ${text}`);
685
+ const isLast = idx === affiliationEntries.length - 1;
686
+ const suffix = useLineBreaks && !isLast ? '\\' : '';
687
+ lines.push(`^${num}^ ${text}${suffix}`);
682
688
  }
683
- }
689
+ });
684
690
 
685
691
  // Corresponding author footnote
686
692
  const corresponding = config.authors.find(a => typeof a !== 'string' && a.corresponding) as Author | undefined;
@@ -478,8 +478,31 @@ export async function injectCommentsAtMarkers(
478
478
  const startMarker = `${MARKER_START_PREFIX}${idx}${MARKER_SUFFIX}`;
479
479
  const endMarker = `${MARKER_END_PREFIX}${idx}${MARKER_SUFFIX}`;
480
480
 
481
- const startPos = documentXml.indexOf(startMarker);
482
- const endPos = documentXml.indexOf(endMarker, startPos + startMarker.length);
481
+ // Pandoc duplicates inline image alt-text into <wp:docPr descr="...">
482
+ // metadata attributes AND into the visible caption paragraph. A naive
483
+ // indexOf hits the metadata-attribute occurrence first, where there is
484
+ // no <w:t> element so dissectRun fails. Skip occurrences whose position
485
+ // is inside an XML tag (last unbalanced '<' before position).
486
+ // See: https://github.com/gcol33/docrev/issues/4
487
+ function findInTextContent(haystack: string, needle: string, fromIdx = 0): number {
488
+ let i = fromIdx;
489
+ while (true) {
490
+ const p = haystack.indexOf(needle, i);
491
+ if (p < 0) return -1;
492
+ const lastLt = haystack.lastIndexOf('<', p);
493
+ const lastGt = haystack.lastIndexOf('>', p);
494
+ if (lastLt > lastGt) {
495
+ i = p + 1;
496
+ continue;
497
+ }
498
+ return p;
499
+ }
500
+ }
501
+
502
+ const startPos = findInTextContent(documentXml, startMarker);
503
+ const endPos = startPos === -1
504
+ ? -1
505
+ : findInTextContent(documentXml, endMarker, startPos + startMarker.length);
483
506
 
484
507
  if (startPos === -1 || endPos === -1) continue;
485
508
 
package/mkdocs.yml ADDED
@@ -0,0 +1,64 @@
1
+ site_name: docrev
2
+ site_url: https://gillescolling.com/docrev
3
+ site_description: CLI for writing documents in Markdown while collaborating with Word users.
4
+ site_author: Gilles Colling
5
+ repo_url: https://github.com/gcol33/docrev
6
+ repo_name: gcol33/docrev
7
+
8
+ theme:
9
+ name: material
10
+ palette:
11
+ - scheme: default
12
+ primary: custom
13
+ accent: custom
14
+ toggle:
15
+ icon: material/brightness-7
16
+ name: Switch to dark mode
17
+ - scheme: slate
18
+ primary: custom
19
+ accent: custom
20
+ toggle:
21
+ icon: material/brightness-4
22
+ name: Switch to light mode
23
+ font:
24
+ text: Roboto
25
+ code: Roboto Mono
26
+ features:
27
+ - navigation.tabs
28
+ - navigation.top
29
+ - navigation.instant
30
+ - search.highlight
31
+ - content.code.copy
32
+ icon:
33
+ repo: fontawesome/brands/github
34
+
35
+ nav:
36
+ - Home: index.md
37
+ - Get Started: workflow.md
38
+ - Commands: commands.md
39
+ - Configuration: configuration.md
40
+ - Troubleshooting: troubleshooting.md
41
+
42
+ markdown_extensions:
43
+ - pymdownx.highlight:
44
+ anchor_linenums: true
45
+ - pymdownx.superfences
46
+ - pymdownx.inlinehilite
47
+ - pymdownx.tabbed:
48
+ alternate_style: true
49
+ - admonition
50
+ - pymdownx.details
51
+ - attr_list
52
+ - md_in_html
53
+ - toc:
54
+ permalink: true
55
+
56
+ extra_css:
57
+ - stylesheets/extra.css
58
+
59
+ extra:
60
+ social:
61
+ - icon: fontawesome/brands/github
62
+ link: https://github.com/gcol33/docrev
63
+ - icon: fontawesome/brands/npm
64
+ link: https://www.npmjs.com/package/docrev
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docrev",
3
- "version": "0.9.15",
3
+ "version": "0.9.17",
4
4
  "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
5
5
  "type": "module",
6
6
  "types": "dist/lib/types.d.ts",
@@ -1,71 +0,0 @@
1
- # Bug repro — comment parser drops valid comments
2
-
3
- ## Symptom
4
-
5
- A markdown file with 24 valid `{>>Author: text<<}` comments is shown by `rev comments file.md` as containing only 21. Three comments are silently dropped.
6
-
7
- ## Reproducer
8
-
9
- `test.md`:
10
- ```markdown
11
- # Test
12
-
13
- **Para 1.** Sentence one{>>Jonathan Lenoir: short comment<<}.
14
-
15
- **Para 2.** Sentence two{>>Jens-Christian Svenning: I'm quite skeptical about these listings of all the bad things alien species do, with any kind of proper benchmarking - you could arguably make similar lists of all the annoying or dangerous things native species do<<}.
16
-
17
- **Para 3.** Sentence three{>>Jens-Christian Svenning: see https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/1365-2745.14307<<}.
18
- ```
19
-
20
- ```bash
21
- rev comments test.md
22
- ```
23
-
24
- **Expected:** 3 comments listed.
25
- **Actual:** 1 comment listed (Lenoir's short comment). Both Svenning comments are dropped.
26
-
27
- ## Root cause — `lib/annotations.ts`
28
-
29
- ### Bug A: hyphenated reviewer names (line 102)
30
-
31
- ```ts
32
- const hasAuthorPrefix = /^[A-Za-z][A-Za-z\s]{0,20}:\s/.test(commentContent.trim());
33
- ```
34
-
35
- The character class `[A-Za-z\s]` forbids hyphens, apostrophes, and accented characters. So:
36
-
37
- - `Jens-Christian Svenning:` — fails (hyphen at position 4)
38
- - `Camilla T Colding-Jørgensen:` — fails (hyphen + diacritic)
39
- - `Renata Ćušterevska:` — fails (diacritic at position 7)
40
- - Any name with `-`, `'`, or non-ASCII letters fails
41
-
42
- When `hasAuthorPrefix === false` AND content > `MAX_COMMENT_CONTENT_LENGTH = 200`, line 104 discards the comment. Any long comment from a hyphenated reviewer is silently dropped.
43
-
44
- ### Bug B: any comment shorter than 150 chars that contains a URL (line 95)
45
-
46
- ```ts
47
- if (/https?:\/\/|www\./i.test(commentContent) && commentContent.length < 150) return true;
48
- ```
49
-
50
- This filter assumes URLs in CriticMarkup blocks are markdown links wrapped in CriticMarkup syntax (a real false positive). But reviewers legitimately cite URLs/DOIs inside comments — that gets dropped too.
51
-
52
- `{>>Lenoir: see https://www.pnas.org/doi/10.1073/pnas.1608980113<<}` — 60 chars, contains URL → dropped, even though `Lenoir:` is a valid author prefix.
53
-
54
- ## Suggested fix
55
-
56
- 1. **Line 102 — broaden author-prefix regex:**
57
- ```ts
58
- const hasAuthorPrefix = /^[\p{L}][\p{L}\s\-'.]{0,30}:\s/u.test(commentContent.trim());
59
- ```
60
- Allows hyphens, apostrophes, periods, and Unicode letters. Length bumped to 30 (Camilla T Colding-Jørgensen is 27 chars).
61
-
62
- 2. **Line 95 — only filter URLs when there's no author prefix:**
63
- ```ts
64
- const looksLikeAuthor = /^[\p{L}][\p{L}\s\-'.]{0,30}:\s/u.test(commentContent.trim());
65
- if (!looksLikeAuthor && /https?:\/\/|www\./i.test(commentContent) && commentContent.length < 150) return true;
66
- ```
67
- A comment with a real `Author:` prefix should not be dropped just because it cites a URL.
68
-
69
- ## Impact
70
-
71
- In `paper_hexgrids_2026/abstract.md`, three reviewer comments from Jens-Christian Svenning (one tone-related, two with cited URLs/DOIs) were silently dropped from `rev comments`, `rev next`, and presumably `rev build docx --dual` output. Other affected reviewers in this manuscript: Camilla T Colding-Jørgensen (long comments would drop), Renata Ćušterevska, Jens-Christian Svenning across all sections.
Binary file
@@ -1,186 +0,0 @@
1
- /**
2
- * Build an adversarial DOCX with hand-crafted commentRangeStart/End
3
- * markers that probe whether docrev locates comments by *real*
4
- * anchor (the highlighted run text, taken from the docx XML) or by
5
- * guesswork. Each comment is designed to expose a specific failure
6
- * mode if the matcher were not anchor-aware.
7
- */
8
- import AdmZip from 'adm-zip';
9
- import path from 'path';
10
-
11
- const out = path.resolve('dev_notes/stress2/adversarial.docx');
12
-
13
- // Sentence with: a triplicated word, unicode look-alikes, mid-word
14
- // hyphenation, and embedded XML-special chars in the *prose* (escaped).
15
- const paragraphs: { text: string; comments?: { id: string; on: string }[] }[] = [
16
- // heading
17
- { text: 'Methods', comments: [] },
18
-
19
- // p0: single highlight, easy
20
- { text: 'The methods section describes our approach in detail.',
21
- comments: [{ id: '0', on: 'methods section' }] },
22
-
23
- // p1: word "model" appears 3x — anchor must pin the SECOND occurrence
24
- { text: 'The model is fitted, the model is checked, the model is reported.',
25
- comments: [{ id: '1', on: 'model' /* second occurrence */ }] },
26
-
27
- // p2: empty anchor (zero-width comment, classic Word "insert here")
28
- { text: 'Reviewers often place a marker between two sentences. Like this. Then continue.',
29
- comments: [{ id: '2', on: '' }] },
30
-
31
- // p3: anchor spanning across multiple runs (formatting break)
32
- { text: 'Our results show that p < 0.001 across all conditions.',
33
- comments: [{ id: '3', on: 'p < 0.001' }] },
34
-
35
- // p4: text containing literal angle brackets that would break a naive
36
- // [^<]* regex — anchor includes "(<1825)"
37
- { text: 'Trade volumes pre-industrial (<1825) were modest.',
38
- comments: [{ id: '4', on: '(<1825)' }] },
39
-
40
- // p5: very long anchor (full sentence) — matcher must still place it
41
- { text: 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.',
42
- comments: [{ id: '5', on: 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.' }] },
43
-
44
- // p6: anchor on punctuation only — should fall back to context
45
- { text: 'See Table 1 for details; numbers are rounded.',
46
- comments: [{ id: '6', on: ';' }] },
47
-
48
- // p7: two overlapping comment ranges share a word
49
- { text: 'The overrepresented species are listed in Appendix A.',
50
- comments: [
51
- { id: '7', on: 'overrepresented species' },
52
- { id: '8', on: 'overrepresented' },
53
- ] },
54
-
55
- // p8: anchor that exists VERBATIM elsewhere in the doc — context disambiguates
56
- { text: 'The cohort was small. Limitations are discussed in Section 5.',
57
- comments: [{ id: '9', on: 'small' }] },
58
- { text: 'The effect was small but significant.',
59
- comments: [{ id: '10', on: 'small' }] },
60
-
61
- // p9: heading paragraph (<w:pStyle w:val="Heading1"/>)
62
- { text: 'Discussion', comments: [], /* heading marker handled below */ },
63
- { text: 'In this section we situate the findings in prior literature.',
64
- comments: [{ id: '11', on: 'situate the findings' }] },
65
- ];
66
-
67
- function escapeXml(s: string): string {
68
- return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
69
- .replace(/"/g, '&quot;').replace(/'/g, '&apos;');
70
- }
71
-
72
- function buildParagraphXml(p: typeof paragraphs[0], isHeading: boolean): string {
73
- const text = p.text;
74
- const comments = p.comments || [];
75
-
76
- if (comments.length === 0) {
77
- return `<w:p>${isHeading ? '<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>' : ''}<w:r><w:t xml:space="preserve">${escapeXml(text)}</w:t></w:r></w:p>`;
78
- }
79
-
80
- // Plan ranges: each comment marks [start..end] character offsets into `text`.
81
- // Empty anchors place start=end at first sentence boundary.
82
- type Range = { id: string; start: number; end: number };
83
- const ranges: Range[] = [];
84
- for (const c of comments) {
85
- if (c.on === '') {
86
- const dotIdx = text.indexOf('.');
87
- const pos = dotIdx >= 0 ? dotIdx + 1 : 0;
88
- ranges.push({ id: c.id, start: pos, end: pos });
89
- continue;
90
- }
91
- if (c.id === '1') {
92
- // "model" — pick the SECOND occurrence
93
- const first = text.indexOf('model');
94
- const second = text.indexOf('model', first + 1);
95
- ranges.push({ id: c.id, start: second, end: second + 'model'.length });
96
- continue;
97
- }
98
- const start = text.indexOf(c.on);
99
- if (start < 0) throw new Error(`anchor not found: ${c.on}`);
100
- ranges.push({ id: c.id, start, end: start + c.on.length });
101
- }
102
-
103
- // Build event list: at each char boundary we may need to emit
104
- // <w:commentRangeStart/> or <w:commentRangeEnd/>.
105
- type Event = { pos: number; kind: 'start' | 'end'; id: string };
106
- const events: Event[] = [];
107
- for (const r of ranges) {
108
- events.push({ pos: r.start, kind: 'start', id: r.id });
109
- events.push({ pos: r.end, kind: 'end', id: r.id });
110
- }
111
- // Emit ends before starts at same position, so an empty anchor's start=end
112
- // ordering keeps a zero-width range
113
- events.sort((a, b) => a.pos - b.pos || (a.kind === 'end' ? -1 : 1));
114
-
115
- let xml = `<w:p>${isHeading ? '<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>' : ''}`;
116
- let cursor = 0;
117
- for (const ev of events) {
118
- if (ev.pos > cursor) {
119
- xml += `<w:r><w:t xml:space="preserve">${escapeXml(text.slice(cursor, ev.pos))}</w:t></w:r>`;
120
- cursor = ev.pos;
121
- }
122
- xml += ev.kind === 'start'
123
- ? `<w:commentRangeStart w:id="${ev.id}"/>`
124
- : `<w:commentRangeEnd w:id="${ev.id}"/><w:r><w:commentReference w:id="${ev.id}"/></w:r>`;
125
- }
126
- if (cursor < text.length) {
127
- xml += `<w:r><w:t xml:space="preserve">${escapeXml(text.slice(cursor))}</w:t></w:r>`;
128
- }
129
- xml += `</w:p>`;
130
- return xml;
131
- }
132
-
133
- const headingTexts = new Set(['Methods', 'Discussion']);
134
- const documentBody = paragraphs.map(p => buildParagraphXml(p, headingTexts.has(p.text))).join('');
135
-
136
- const documentXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
137
- <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
138
- <w:body>
139
- ${documentBody}
140
- <w:sectPr/>
141
- </w:body>
142
- </w:document>`;
143
-
144
- const allComments = paragraphs.flatMap(p => p.comments || []);
145
-
146
- const commentsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
147
- <w:comments xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
148
- ${allComments.map(c => ` <w:comment w:id="${c.id}" w:author="Reviewer" w:initials="R" w:date="2026-01-01T00:00:00Z"><w:p><w:r><w:t>Comment ${c.id} on &quot;${escapeXml(c.on)}&quot; — text contains <embedded brackets and &amp;.</w:t></w:r></w:p></w:comment>`.replace('<embedded', '&lt;embedded')).join('\n')}
149
- </w:comments>`;
150
-
151
- const contentTypesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
152
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
153
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
154
- <Default Extension="xml" ContentType="application/xml"/>
155
- <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
156
- <Override PartName="/word/comments.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"/>
157
- <Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/>
158
- </Types>`;
159
-
160
- const rootRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
161
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
162
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
163
- </Relationships>`;
164
-
165
- const docRelsXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
166
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
167
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments" Target="comments.xml"/>
168
- <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/>
169
- </Relationships>`;
170
-
171
- const stylesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
172
- <w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
173
- <w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="heading 1"/></w:style>
174
- </w:styles>`;
175
-
176
- const zip = new AdmZip();
177
- zip.addFile('[Content_Types].xml', Buffer.from(contentTypesXml, 'utf8'));
178
- zip.addFile('_rels/.rels', Buffer.from(rootRelsXml, 'utf8'));
179
- zip.addFile('word/document.xml', Buffer.from(documentXml, 'utf8'));
180
- zip.addFile('word/comments.xml', Buffer.from(commentsXml, 'utf8'));
181
- zip.addFile('word/styles.xml', Buffer.from(stylesXml, 'utf8'));
182
- zip.addFile('word/_rels/document.xml.rels', Buffer.from(docRelsXml, 'utf8'));
183
- zip.writeZip(out);
184
-
185
- console.log(`wrote ${out}`);
186
- console.log(`comments: ${allComments.length}`);
@@ -1,62 +0,0 @@
1
- /**
2
- * Run anchors from the adversarial docx against a HEAVILY DRIFTED
3
- * markdown and report match quality per comment.
4
- */
5
- import { extractCommentAnchors } from '../../lib/word-extraction.js';
6
- import { findAnchorInText, classifyStrategy } from '../../lib/anchor-match.js';
7
-
8
- const docx = 'dev_notes/stress2/adversarial.docx';
9
-
10
- // Drifted markdown — same comments need to land on this.
11
- // - p0 paraphrased
12
- // - p1 reordered, only TWO occurrences of "model"
13
- // - p2 prose lightly edited, sentence boundary preserved
14
- // - p3 prose changed slightly: "p < 0.001 (n=412)"
15
- // - p4 prose moved (<1825)
16
- // - p5 mostly intact
17
- // - p6 entire sentence rewritten ("Round numbers are reported in Table 1.")
18
- // - p7 unchanged
19
- // - p8/p9 unchanged
20
- // - p10/p11 unchanged
21
- const drifted = [
22
- '# Methods', '',
23
- 'Below we describe our methodology in detail.', '',
24
- 'The model is fitted; later, the model is reported.', '',
25
- 'Reviewers often place a marker between two sentences. Like this. Then continue.', '',
26
- 'Our results show that p < 0.001 (n=412) across all conditions.', '',
27
- 'Trade volumes were modest pre-industrial (<1825).', '',
28
- 'We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.', '',
29
- 'Round numbers are reported in Table 1.', '',
30
- 'The overrepresented species are listed in Appendix A.', '',
31
- 'The cohort was small. Limitations are discussed in Section 5.',
32
- 'The effect was small but significant.', '',
33
- '# Discussion', '',
34
- 'In this section we situate the findings in prior literature.',
35
- ].join('\n');
36
-
37
- const { anchors } = await extractCommentAnchors(docx);
38
-
39
- const sorted = [...anchors.entries()].sort((a, b) => Number(a[0]) - Number(b[0]));
40
-
41
- const tally = { clean: 0, drift: 0, 'context-only': 0, ambiguous: 0, unmatched: 0 };
42
-
43
- console.log('id │ quality │ strategy │ #hits │ anchor');
44
- console.log('───┼───────────────┼─────────────────────┼───────┼──────────────────────');
45
- for (const [id, a] of sorted) {
46
- const r = findAnchorInText(a.anchor, drifted, a.before, a.after);
47
- let q: keyof typeof tally = classifyStrategy(r.strategy, r.occurrences.length) as any;
48
- if (r.occurrences.length > 1) q = 'ambiguous';
49
- tally[q]++;
50
- const tag = a.isEmpty ? '∅' : a.anchor.length > 30 ? a.anchor.slice(0, 27) + '...' : a.anchor;
51
- console.log(`${id.padStart(2)} │ ${q.padEnd(13)} │ ${r.strategy.padEnd(19)} │ ${String(r.occurrences.length).padStart(5)} │ ${JSON.stringify(tag)}`);
52
- }
53
-
54
- console.log('\nTally:', tally);
55
-
56
- // Also confirm: comment #6 (";") will fail anchor-direct match and force
57
- // context fallback (because the original sentence was rewritten).
58
- console.log('\nComment #6 (\";\") — sentence rewritten in drifted md.');
59
- const c6 = anchors.get('6')!;
60
- const r6 = findAnchorInText(c6.anchor, drifted, c6.before, c6.after);
61
- console.log(` strategy=${r6.strategy}, hits=${r6.occurrences.length}`);
62
- console.log(` ${r6.occurrences.length > 0 ? 'placed via fallback' : 'left for manual placement'}`);
@@ -1,35 +0,0 @@
1
- /**
2
- * Probe: do extracted anchors come from `<w:commentRangeStart/End>`
3
- * markers (i.e. real anchors), and does each comment carry a unique
4
- * docPosition that disambiguates duplicate prose?
5
- */
6
- import { extractCommentAnchors } from '../../lib/word-extraction.js';
7
-
8
- const docx = 'dev_notes/stress2/adversarial.docx';
9
- const { anchors, fullDocText } = await extractCommentAnchors(docx);
10
-
11
- console.log(`Doc text length: ${fullDocText.length}`);
12
- console.log(`Anchors extracted: ${anchors.size}\n`);
13
-
14
- const sorted = [...anchors.entries()].sort((a, b) => Number(a[0]) - Number(b[0]));
15
-
16
- for (const [id, a] of sorted) {
17
- const tag = a.isEmpty ? '∅(empty)' : JSON.stringify(a.anchor);
18
- console.log(`#${id} pos=${String(a.docPosition).padStart(4)} anchor=${tag}`);
19
- if (a.before) console.log(` before=${JSON.stringify(a.before.slice(-40))}`);
20
- if (a.after) console.log(` after =${JSON.stringify(a.after.slice(0, 40))}`);
21
- }
22
-
23
- // Sanity: comment 1 should pin the SECOND occurrence of "model"
24
- const c1 = anchors.get('1')!;
25
- const firstModel = fullDocText.indexOf('model');
26
- const secondModel = fullDocText.indexOf('model', firstModel + 1);
27
- console.log(`\nComment #1 expectation: pin SECOND "model" at pos ${secondModel}`);
28
- console.log(`Comment #1 actual docPosition=${c1.docPosition}`);
29
- console.log(`Match: ${c1.docPosition === secondModel ? 'YES (real-anchor)' : 'NO'}`);
30
-
31
- // Comment 7 vs 8: overlapping ranges, both anchored on "overrepresented..."
32
- const c7 = anchors.get('7')!;
33
- const c8 = anchors.get('8')!;
34
- console.log(`\nComment #7 anchor: ${JSON.stringify(c7.anchor)} (expected "overrepresented species")`);
35
- console.log(`Comment #8 anchor: ${JSON.stringify(c8.anchor)} (expected "overrepresented")`);
@@ -1,3 +0,0 @@
1
- # Discussion
2
-
3
- In this section we situate the findings in prior literature.
@@ -1,3 +0,0 @@
1
- # Discussion
2
-
3
- In this section we {>>Reviewer: Comment 11 on "situate the findings" — text contains <embedded brackets and &.<<}situate the findings in prior literature.
@@ -1,20 +0,0 @@
1
- # Methods
2
-
3
- Below we describe our methodology in detail.
4
-
5
- The model is fitted; later, the model is reported.
6
-
7
- Reviewers often place a marker between two sentences. Like this. Then continue.
8
-
9
- Our results show that p < 0.001 (n=412) across all conditions.
10
-
11
- Trade volumes were modest pre-industrial (<1825).
12
-
13
- We used a hierarchical Bayesian model with weakly informative priors and Hamiltonian Monte Carlo sampling implemented in Stan.
14
-
15
- Round numbers are reported in Table 1.
16
-
17
- The overrepresented species are listed in Appendix A.
18
-
19
- The cohort was small. Limitations are discussed in Section 5.
20
- The effect was small but significant.