docrev 0.9.6 → 0.9.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/dev_notes/bug_repro_comment_parser.md +71 -0
- package/dev_notes/stress2/adversarial.docx +0 -0
- package/dev_notes/stress2/build_adversarial.ts +186 -0
- package/dev_notes/stress2/drift_matcher.ts +62 -0
- package/dev_notes/stress2/probe_anchors.ts +35 -0
- package/dev_notes/stress2/project/adversarial.docx +0 -0
- package/dev_notes/stress2/project/discussion.before.md +3 -0
- package/dev_notes/stress2/project/discussion.md +3 -0
- package/dev_notes/stress2/project/methods.before.md +20 -0
- package/dev_notes/stress2/project/methods.md +20 -0
- package/dev_notes/stress2/project/rev.yaml +5 -0
- package/dev_notes/stress2/project/sections.yaml +4 -0
- package/dev_notes/stress2/sections.yaml +5 -0
- package/dev_notes/stress2/trace_placement.ts +50 -0
- package/dev_notes/stresstest_boundaries.ts +27 -0
- package/dev_notes/stresstest_drift_apply.ts +43 -0
- package/dev_notes/stresstest_drift_compare.ts +43 -0
- package/dev_notes/stresstest_drift_v2.ts +54 -0
- package/dev_notes/stresstest_inspect.ts +54 -0
- package/dev_notes/stresstest_pstyle.ts +55 -0
- package/dev_notes/stresstest_section_debug.ts +23 -0
- package/dev_notes/stresstest_split.ts +70 -0
- package/dev_notes/stresstest_trace.ts +19 -0
- package/dev_notes/stresstest_verify_no_overwrite.ts +40 -0
- package/dist/lib/anchor-match.d.ts +51 -0
- package/dist/lib/anchor-match.d.ts.map +1 -0
- package/dist/lib/anchor-match.js +227 -0
- package/dist/lib/anchor-match.js.map +1 -0
- package/dist/lib/annotations.d.ts.map +1 -1
- package/dist/lib/annotations.js +24 -11
- package/dist/lib/annotations.js.map +1 -1
- package/dist/lib/commands/index.d.ts +2 -1
- package/dist/lib/commands/index.d.ts.map +1 -1
- package/dist/lib/commands/index.js +3 -1
- package/dist/lib/commands/index.js.map +1 -1
- package/dist/lib/commands/quality.js +1 -1
- package/dist/lib/commands/quality.js.map +1 -1
- package/dist/lib/commands/section-boundaries.d.ts +22 -0
- package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
- package/dist/lib/commands/section-boundaries.js +63 -0
- package/dist/lib/commands/section-boundaries.js.map +1 -0
- package/dist/lib/commands/sync.d.ts.map +1 -1
- package/dist/lib/commands/sync.js +141 -0
- package/dist/lib/commands/sync.js.map +1 -1
- package/dist/lib/commands/verify-anchors.d.ts +17 -0
- package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
- package/dist/lib/commands/verify-anchors.js +226 -0
- package/dist/lib/commands/verify-anchors.js.map +1 -0
- package/dist/lib/comment-realign.js +2 -2
- package/dist/lib/comment-realign.js.map +1 -1
- package/dist/lib/import.d.ts +26 -8
- package/dist/lib/import.d.ts.map +1 -1
- package/dist/lib/import.js +166 -187
- package/dist/lib/import.js.map +1 -1
- package/dist/lib/response.js +1 -1
- package/dist/lib/response.js.map +1 -1
- package/dist/lib/word-extraction.d.ts +23 -0
- package/dist/lib/word-extraction.d.ts.map +1 -1
- package/dist/lib/word-extraction.js +79 -0
- package/dist/lib/word-extraction.js.map +1 -1
- package/dist/lib/wordcomments.d.ts.map +1 -1
- package/dist/lib/wordcomments.js +165 -73
- package/dist/lib/wordcomments.js.map +1 -1
- package/lib/anchor-match.ts +276 -0
- package/lib/annotations.ts +25 -11
- package/lib/commands/index.ts +3 -0
- package/lib/commands/quality.ts +1 -1
- package/lib/commands/section-boundaries.ts +82 -0
- package/lib/commands/sync.ts +170 -0
- package/lib/commands/verify-anchors.ts +272 -0
- package/lib/comment-realign.ts +2 -2
- package/lib/import.ts +197 -209
- package/lib/response.ts +1 -1
- package/lib/word-extraction.ts +93 -0
- package/lib/wordcomments.ts +180 -82
- package/package.json +1 -1
- package/skill/REFERENCE.md +29 -2
- package/skill/SKILL.md +12 -2
- package/dist/package.json +0 -137
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Compare pristine vs drifted verify-anchors output
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
|
|
4
|
+
const a = JSON.parse(fs.readFileSync('C:/GillesC/tmp/docrev-stress/pristine.json', 'utf-8'));
|
|
5
|
+
const b = JSON.parse(fs.readFileSync('C:/GillesC/tmp/docrev-stress/drifted.json', 'utf-8'));
|
|
6
|
+
|
|
7
|
+
console.log('Summary deltas:');
|
|
8
|
+
const keys = Array.from(new Set([...Object.keys(a.summary), ...Object.keys(b.summary)]));
|
|
9
|
+
for (const k of keys) {
|
|
10
|
+
const av = a.summary[k] ?? 0;
|
|
11
|
+
const bv = b.summary[k] ?? 0;
|
|
12
|
+
const delta = bv - av;
|
|
13
|
+
console.log(` ${k.padEnd(15)} ${String(av).padStart(4)} -> ${String(bv).padStart(4)} (delta ${delta >= 0 ? '+' : ''}${delta})`);
|
|
14
|
+
}
|
|
15
|
+
console.log();
|
|
16
|
+
|
|
17
|
+
// Show comments whose quality changed (especially those that moved to a worse bucket)
|
|
18
|
+
const aMap = new Map(a.comments.map((c: any) => [c.id, c]));
|
|
19
|
+
console.log('Quality changes:');
|
|
20
|
+
let regressions = 0;
|
|
21
|
+
let improvements = 0;
|
|
22
|
+
for (const c of b.comments) {
|
|
23
|
+
const prev = aMap.get(c.id) as any;
|
|
24
|
+
if (!prev) continue;
|
|
25
|
+
if (prev.quality !== c.quality) {
|
|
26
|
+
const dir = qualityRank(c.quality) > qualityRank(prev.quality) ? '⬇' : '⬆';
|
|
27
|
+
if (dir === '⬇') regressions++; else improvements++;
|
|
28
|
+
console.log(` ${dir} #${c.id} [${c.section || '—'}] ${prev.quality}/${prev.strategy} -> ${c.quality}/${c.strategy} anchor="${(c.anchor || '').slice(0, 35)}"`);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
console.log();
|
|
32
|
+
console.log(`regressions: ${regressions} improvements: ${improvements}`);
|
|
33
|
+
|
|
34
|
+
function qualityRank(q: string): number {
|
|
35
|
+
switch (q) {
|
|
36
|
+
case 'clean': return 0;
|
|
37
|
+
case 'drift': return 1;
|
|
38
|
+
case 'context-only': return 2;
|
|
39
|
+
case 'ambiguous': return 1; // ambiguous is sideways, not strictly worse
|
|
40
|
+
case 'unmatched': return 3;
|
|
41
|
+
default: return 4;
|
|
42
|
+
}
|
|
43
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
// Apply drift edits that actually intersect real comment anchors
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
|
|
4
|
+
const dir = 'C:/GillesC/tmp/docrev-stress/project-drifted';
|
|
5
|
+
|
|
6
|
+
function edit(file: string, edits: Array<[string, string]>) {
|
|
7
|
+
const p = `${dir}/${file}`;
|
|
8
|
+
let t = fs.readFileSync(p, 'utf-8');
|
|
9
|
+
for (const [from, to] of edits) {
|
|
10
|
+
if (!t.includes(from)) {
|
|
11
|
+
console.error(`!! ${file}: pattern not found: "${from.slice(0, 60)}"`);
|
|
12
|
+
continue;
|
|
13
|
+
}
|
|
14
|
+
t = t.replace(from, to);
|
|
15
|
+
console.log(`OK ${file}: replaced "${from.slice(0, 60)}..."`);
|
|
16
|
+
}
|
|
17
|
+
fs.writeFileSync(p, t);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Each of these intersects real anchors; expected effect noted.
|
|
21
|
+
edit('abstract.md', [
|
|
22
|
+
// Title: 3 anchors are 'Patterns of habitat niche expansion' (full title) — but title isn't in abstract.md, it's in YAML frontmatter, so probably already failing
|
|
23
|
+
// 'overrepresented' is the most common anchor (6 comments). Replace it everywhere.
|
|
24
|
+
// After this, the word 'overrepresented' is gone → all those comments should go to context-only or unmatched
|
|
25
|
+
['overrepresented in more than one habitat type', 'preferentially distributed across habitat types'],
|
|
26
|
+
// 'undisturbed' appears in 'undisturbed habitat types' → 2 comments
|
|
27
|
+
['undisturbed habitat types', 'pristine vegetation communities'],
|
|
28
|
+
// 'human economies' (3 anchors)
|
|
29
|
+
['human economies', 'economic activity'],
|
|
30
|
+
// 'nutrient-rich'
|
|
31
|
+
['nutrient-rich, disturbed habitats', 'anthropic, modified environments'],
|
|
32
|
+
// 'when residence time was prolonged.' — long anchor, should go to drift / partial-start
|
|
33
|
+
['when residence time was prolonged.', 'as residence time increased.'],
|
|
34
|
+
// 'pervasive' (1 anchor)
|
|
35
|
+
['Invasion debt in terms of habitat niche breadth is pervasive', 'Invasion debt in terms of habitat niche breadth is widespread'],
|
|
36
|
+
// 'alpine habitats' (1 anchor)
|
|
37
|
+
['alpine habitats', 'high-altitude environments'],
|
|
38
|
+
// 'semi-natural vegetation' (2 anchors)
|
|
39
|
+
['semi-natural vegetation', 'natural plant communities'],
|
|
40
|
+
// 'Man-made habitats function as gateways' (1 anchor)
|
|
41
|
+
['Man-made habitats function as gateways', 'Anthropogenic environments serve as entry routes'],
|
|
42
|
+
// 'Invasion debt in terms of habitat niche breadth' (2 anchors)
|
|
43
|
+
// Already mutated above (pervasive→widespread keeps anchor matchable). Add another small change to test partial-start
|
|
44
|
+
// 'habitat niche expansion' — anchor for #27
|
|
45
|
+
['habitat niche expansion', 'ecological niche broadening'],
|
|
46
|
+
// ', yet' anchor for #19
|
|
47
|
+
[', yet', ', however'],
|
|
48
|
+
// '44' — short numeric anchor
|
|
49
|
+
['44%', '53%'],
|
|
50
|
+
// '17 habitats'
|
|
51
|
+
['17 habitats', '18 habitats'],
|
|
52
|
+
]);
|
|
53
|
+
|
|
54
|
+
console.log('---done');
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { extractWordComments, extractCommentAnchors } from '../lib/import.js';
|
|
2
|
+
|
|
3
|
+
const docx = 'C:/GillesC/tmp/docrev-stress/reviewed.docx';
|
|
4
|
+
|
|
5
|
+
const comments = await extractWordComments(docx);
|
|
6
|
+
const { anchors, fullDocText } = await extractCommentAnchors(docx);
|
|
7
|
+
|
|
8
|
+
console.log(`comments: ${comments.length}`);
|
|
9
|
+
console.log(`anchors: ${anchors.size}`);
|
|
10
|
+
console.log(`docText: ${fullDocText.length} chars`);
|
|
11
|
+
console.log();
|
|
12
|
+
|
|
13
|
+
const byAuthor: Record<string, number> = {};
|
|
14
|
+
for (const c of comments) byAuthor[c.author] = (byAuthor[c.author] || 0) + 1;
|
|
15
|
+
console.log('by author:', byAuthor);
|
|
16
|
+
console.log();
|
|
17
|
+
|
|
18
|
+
console.log('first 5 comments:');
|
|
19
|
+
for (const c of comments.slice(0, 5)) {
|
|
20
|
+
const a = anchors.get(c.id);
|
|
21
|
+
const pos = a?.docPosition ?? -1;
|
|
22
|
+
const anchor = (a?.anchor || '').slice(0, 60);
|
|
23
|
+
const text = c.text.replace(/\s+/g, ' ').slice(0, 80);
|
|
24
|
+
console.log(` #${c.id} [${c.author}] pos=${pos} anchor="${anchor}" text="${text}"`);
|
|
25
|
+
}
|
|
26
|
+
console.log();
|
|
27
|
+
console.log('last 3 comments:');
|
|
28
|
+
for (const c of comments.slice(-3)) {
|
|
29
|
+
const a = anchors.get(c.id);
|
|
30
|
+
const pos = a?.docPosition ?? -1;
|
|
31
|
+
const anchor = (a?.anchor || '').slice(0, 60);
|
|
32
|
+
const text = c.text.replace(/\s+/g, ' ').slice(0, 80);
|
|
33
|
+
console.log(` #${c.id} [${c.author}] pos=${pos} anchor="${anchor}" text="${text}"`);
|
|
34
|
+
}
|
|
35
|
+
console.log();
|
|
36
|
+
|
|
37
|
+
const sectionKeywords = ['Abstract', 'Introduction', 'Methods', 'Results', 'Discussion', 'Conclusion', 'References', 'Acknowledgements', 'Data Availability', 'Author Contributions'];
|
|
38
|
+
console.log('candidate headings:');
|
|
39
|
+
for (const kw of sectionKeywords) {
|
|
40
|
+
const idx = fullDocText.indexOf(kw);
|
|
41
|
+
if (idx >= 0) {
|
|
42
|
+
const context = fullDocText.slice(idx, idx + 80).replace(/\s+/g, ' ');
|
|
43
|
+
console.log(` ${kw} @ ${idx}: "${context}"`);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Check anchor distribution to understand section spans
|
|
48
|
+
const positions = [...anchors.values()].map(a => a.docPosition).sort((a, b) => a - b);
|
|
49
|
+
console.log();
|
|
50
|
+
console.log(`anchor positions: min=${positions[0]} max=${positions[positions.length-1]} median=${positions[Math.floor(positions.length/2)]}`);
|
|
51
|
+
|
|
52
|
+
// Check for empty anchors
|
|
53
|
+
const empty = [...anchors.values()].filter(a => a.isEmpty).length;
|
|
54
|
+
console.log(`empty anchors: ${empty}`);
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
// Inspect the document.xml paragraph styles to find true headings.
|
|
2
|
+
import AdmZip from 'adm-zip';
|
|
3
|
+
import * as fs from 'fs';
|
|
4
|
+
|
|
5
|
+
const docx = 'C:/GillesC/tmp/docrev-stress/reviewed.docx';
|
|
6
|
+
const zip = new AdmZip(docx);
|
|
7
|
+
const xml = zip.getEntry('word/document.xml')!.getData().toString('utf8');
|
|
8
|
+
|
|
9
|
+
// Build text-position map from <w:t> runs the same way extractCommentAnchors does
|
|
10
|
+
const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
|
|
11
|
+
const textNodes: Array<{ xmlStart: number; xmlEnd: number; textStart: number; textEnd: number; text: string }> = [];
|
|
12
|
+
let textPosition = 0;
|
|
13
|
+
let m;
|
|
14
|
+
function decode(s: string): string {
|
|
15
|
+
return s.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"').replace(/'/g, "'");
|
|
16
|
+
}
|
|
17
|
+
while ((m = textNodePattern.exec(xml)) !== null) {
|
|
18
|
+
const decoded = decode(m[1]);
|
|
19
|
+
textNodes.push({ xmlStart: m.index, xmlEnd: m.index + m[0].length, textStart: textPosition, textEnd: textPosition + decoded.length, text: decoded });
|
|
20
|
+
textPosition += decoded.length;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
function xmlToText(xmlPos: number): number {
|
|
24
|
+
for (const n of textNodes) {
|
|
25
|
+
if (xmlPos >= n.xmlStart && xmlPos < n.xmlEnd) return n.textStart;
|
|
26
|
+
if (xmlPos < n.xmlStart) return n.textStart;
|
|
27
|
+
}
|
|
28
|
+
return textNodes.length ? textNodes[textNodes.length - 1].textEnd : 0;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
// Now walk paragraphs; each <w:p ...>...</w:p>. Inside, find <w:pStyle w:val="..."/> if present, and concatenate text runs.
|
|
32
|
+
const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
|
|
33
|
+
const headings: Array<{ style: string; text: string; xmlStart: number; textPos: number }> = [];
|
|
34
|
+
let pm;
|
|
35
|
+
while ((pm = paraPattern.exec(xml)) !== null) {
|
|
36
|
+
const inner = pm[1];
|
|
37
|
+
const styleMatch = inner.match(/<w:pStyle[^>]*w:val="([^"]+)"/);
|
|
38
|
+
if (!styleMatch) continue;
|
|
39
|
+
const style = styleMatch[1];
|
|
40
|
+
if (!/heading/i.test(style)) continue;
|
|
41
|
+
// Extract text from runs
|
|
42
|
+
const textInRange = /<w:t[^>]*>([^<]*)<\/w:t>/g;
|
|
43
|
+
let txt = '';
|
|
44
|
+
let tm;
|
|
45
|
+
while ((tm = textInRange.exec(inner)) !== null) txt += decode(tm[1]);
|
|
46
|
+
if (!txt.trim()) continue;
|
|
47
|
+
const xmlStart = pm.index;
|
|
48
|
+
const textPos = xmlToText(xmlStart);
|
|
49
|
+
headings.push({ style, text: txt.trim(), xmlStart, textPos });
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
console.log(`Found ${headings.length} heading paragraphs:`);
|
|
53
|
+
for (const h of headings) {
|
|
54
|
+
console.log(` [${h.style.padEnd(8)}] textPos=${String(h.textPos).padStart(6)} "${h.text.slice(0, 60)}"`);
|
|
55
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { extractCommentAnchors } from '../lib/import.js';
|
|
2
|
+
|
|
3
|
+
const docx = 'C:/GillesC/tmp/docrev-stress/reviewed.docx';
|
|
4
|
+
const { fullDocText } = await extractCommentAnchors(docx);
|
|
5
|
+
|
|
6
|
+
const keywords = ['Abstract', 'Introduction', 'Methods', 'Results', 'Discussion', 'Conclusion', 'References'];
|
|
7
|
+
for (const kw of keywords) {
|
|
8
|
+
const lower = fullDocText.toLowerCase();
|
|
9
|
+
const needle = kw.toLowerCase();
|
|
10
|
+
const occ: Array<{ idx: number; ctx: string; afterChar: string }> = [];
|
|
11
|
+
let idx = 0;
|
|
12
|
+
while ((idx = lower.indexOf(needle, idx)) !== -1) {
|
|
13
|
+
const ctx = fullDocText.slice(Math.max(0, idx - 20), idx + needle.length + 20).replace(/\s+/g, ' ');
|
|
14
|
+
const afterChar = fullDocText.slice(idx + needle.length, idx + needle.length + 1);
|
|
15
|
+
occ.push({ idx, ctx, afterChar });
|
|
16
|
+
idx++;
|
|
17
|
+
}
|
|
18
|
+
console.log(`\n--- ${kw} (${occ.length} occurrences) ---`);
|
|
19
|
+
for (const o of occ.slice(0, 6)) {
|
|
20
|
+
console.log(` @${o.idx} after="${o.afterChar}" ctx: ...${o.ctx}...`);
|
|
21
|
+
}
|
|
22
|
+
if (occ.length > 6) console.log(` ... and ${occ.length - 6} more`);
|
|
23
|
+
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import * as fs from 'fs';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
|
|
4
|
+
const ROOT = 'C:/GillesC/tmp/docrev-stress/project';
|
|
5
|
+
fs.mkdirSync(ROOT, { recursive: true });
|
|
6
|
+
|
|
7
|
+
const full = fs.readFileSync('C:/GillesC/tmp/docrev-stress/full.md', 'utf-8');
|
|
8
|
+
const lines = full.split(/\r?\n/);
|
|
9
|
+
|
|
10
|
+
interface Section { file: string; header: string; lines: string[]; }
|
|
11
|
+
|
|
12
|
+
const sections: Section[] = [];
|
|
13
|
+
|
|
14
|
+
// Map H1 heading text to section file
|
|
15
|
+
const headerToFile: Record<string, string> = {
|
|
16
|
+
'Abstract': 'abstract.md',
|
|
17
|
+
'Introduction': 'introduction.md',
|
|
18
|
+
'Methods': 'methods.md',
|
|
19
|
+
'Results': 'results.md',
|
|
20
|
+
'Discussion': 'discussion.md',
|
|
21
|
+
'Conclusion': 'conclusion.md',
|
|
22
|
+
'References': 'references.md',
|
|
23
|
+
'Supplementary Materials': 'supplement.md',
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
let current: Section | null = null;
|
|
27
|
+
const preambleLines: string[] = [];
|
|
28
|
+
|
|
29
|
+
for (const line of lines) {
|
|
30
|
+
const m = line.match(/^# (.+)$/);
|
|
31
|
+
if (m && headerToFile[m[1].trim()]) {
|
|
32
|
+
if (current) sections.push(current);
|
|
33
|
+
current = { file: headerToFile[m[1].trim()], header: m[1].trim(), lines: [line] };
|
|
34
|
+
} else if (current) {
|
|
35
|
+
current.lines.push(line);
|
|
36
|
+
} else {
|
|
37
|
+
preambleLines.push(line);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
if (current) sections.push(current);
|
|
41
|
+
|
|
42
|
+
// Write sections
|
|
43
|
+
for (const s of sections) {
|
|
44
|
+
const out = path.join(ROOT, s.file);
|
|
45
|
+
fs.writeFileSync(out, s.lines.join('\n'), 'utf-8');
|
|
46
|
+
console.log(`wrote ${out} (${s.lines.length} lines)`);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// Write sections.yaml
|
|
50
|
+
const yaml: string[] = ['version: 1', 'sections:'];
|
|
51
|
+
for (const s of sections) {
|
|
52
|
+
yaml.push(` ${s.file}:`);
|
|
53
|
+
yaml.push(` header: "${s.header}"`);
|
|
54
|
+
yaml.push(` aliases: []`);
|
|
55
|
+
}
|
|
56
|
+
fs.writeFileSync(path.join(ROOT, 'sections.yaml'), yaml.join('\n') + '\n', 'utf-8');
|
|
57
|
+
console.log(`wrote sections.yaml`);
|
|
58
|
+
|
|
59
|
+
// Minimal rev.yaml so commands that look for project root don't choke
|
|
60
|
+
const revYaml = `title: "Stress Test"
|
|
61
|
+
authors:
|
|
62
|
+
- name: Gilles Colling
|
|
63
|
+
output:
|
|
64
|
+
docx: {}
|
|
65
|
+
pdf: {}
|
|
66
|
+
`;
|
|
67
|
+
fs.writeFileSync(path.join(ROOT, 'rev.yaml'), revYaml, 'utf-8');
|
|
68
|
+
|
|
69
|
+
console.log();
|
|
70
|
+
console.log(`section files: ${sections.length}`);
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { extractCommentAnchors } from '../lib/import.js';
|
|
2
|
+
|
|
3
|
+
const docx = 'C:/GillesC/tmp/docrev-stress/reviewed.docx';
|
|
4
|
+
const { fullDocText } = await extractCommentAnchors(docx);
|
|
5
|
+
|
|
6
|
+
const needle = 'methods';
|
|
7
|
+
const lower = fullDocText.toLowerCase();
|
|
8
|
+
let idx = 0;
|
|
9
|
+
const trace: Array<{idx:number, after:string, skipped:boolean}> = [];
|
|
10
|
+
while ((idx = lower.indexOf(needle, idx)) !== -1 && trace.length < 10) {
|
|
11
|
+
const after = fullDocText.slice(idx + needle.length, idx + needle.length + 5);
|
|
12
|
+
const skip = after.startsWith(':') || after.startsWith(' :');
|
|
13
|
+
trace.push({ idx, after, skipped: skip });
|
|
14
|
+
if (!skip) break;
|
|
15
|
+
idx++;
|
|
16
|
+
}
|
|
17
|
+
for (const t of trace) console.log(`idx=${t.idx} after="${t.after}" skipped=${t.skipped}`);
|
|
18
|
+
console.log();
|
|
19
|
+
console.log(`first non-:-prefixed Methods idx = ${trace.find(t => !t.skipped)?.idx ?? -1}`);
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// Verify sync --comments-only didn't touch existing prose, only inserted comments.
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import { stripCriticMarkup } from '../lib/anchor-match.js';
|
|
4
|
+
|
|
5
|
+
const original = 'C:/GillesC/tmp/docrev-stress/project';
|
|
6
|
+
const synced = 'C:/GillesC/tmp/docrev-stress/project-sync';
|
|
7
|
+
const files = ['abstract.md', 'introduction.md', 'methods.md', 'results.md', 'discussion.md', 'conclusion.md', 'references.md', 'supplement.md'];
|
|
8
|
+
|
|
9
|
+
let totalComments = 0;
|
|
10
|
+
let mismatches = 0;
|
|
11
|
+
for (const f of files) {
|
|
12
|
+
const orig = fs.readFileSync(`${original}/${f}`, 'utf-8');
|
|
13
|
+
const synd = fs.readFileSync(`${synced}/${f}`, 'utf-8');
|
|
14
|
+
|
|
15
|
+
// Count comments inserted
|
|
16
|
+
const commentMatches = synd.match(/\{>>.*?<<\}/gs) || [];
|
|
17
|
+
totalComments += commentMatches.length;
|
|
18
|
+
|
|
19
|
+
// The originals may already contain CriticMarkup (Word highlights -> [X]{.mark}),
|
|
20
|
+
// so strip both sides to compare just the underlying prose.
|
|
21
|
+
const synStripped = stripCriticMarkup(synd);
|
|
22
|
+
const origNorm = stripCriticMarkup(orig);
|
|
23
|
+
|
|
24
|
+
if (synStripped !== origNorm) {
|
|
25
|
+
// Find first difference
|
|
26
|
+
let i = 0;
|
|
27
|
+
while (i < origNorm.length && i < synStripped.length && origNorm[i] === synStripped[i]) i++;
|
|
28
|
+
const ctxOrig = origNorm.slice(Math.max(0, i - 30), i + 50).replace(/\n/g, '\\n');
|
|
29
|
+
const ctxSyn = synStripped.slice(Math.max(0, i - 30), i + 50).replace(/\n/g, '\\n');
|
|
30
|
+
console.log(`✗ ${f}: prose differs at byte ${i} (lengths orig=${origNorm.length} stripped=${synStripped.length})`);
|
|
31
|
+
console.log(` orig: ${ctxOrig}`);
|
|
32
|
+
console.log(` syn: ${ctxSyn}`);
|
|
33
|
+
mismatches++;
|
|
34
|
+
} else {
|
|
35
|
+
console.log(`✓ ${f}: prose preserved (${commentMatches.length} comments inserted)`);
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
console.log();
|
|
39
|
+
console.log(`total comments inserted: ${totalComments}`);
|
|
40
|
+
console.log(`prose mismatches: ${mismatches}/${files.length}`);
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anchor matching primitives shared between sync (insertion) and
|
|
3
|
+
* verify-anchors (drift reporting). The functions are pure: given an
|
|
4
|
+
* anchor string and surrounding context, locate candidate positions in
|
|
5
|
+
* a target text using progressively looser strategies.
|
|
6
|
+
*/
|
|
7
|
+
export type AnchorStrategy = 'direct' | 'normalized' | 'stripped' | 'partial-start' | 'partial-start-stripped' | 'context-both' | 'context-before' | 'context-after' | 'split-match' | 'empty-anchor' | 'failed';
|
|
8
|
+
export interface AnchorSearchResult {
|
|
9
|
+
occurrences: number[];
|
|
10
|
+
matchedAnchor: string | null;
|
|
11
|
+
strategy: AnchorStrategy;
|
|
12
|
+
stripped?: boolean;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Strip CriticMarkup so the matcher sees plain prose instead of
|
|
16
|
+
* `{++inserted++}`/`{--deleted--}`/etc. Used when an anchor lives
|
|
17
|
+
* underneath previously imported track changes.
|
|
18
|
+
*/
|
|
19
|
+
export declare function stripCriticMarkup(text: string): string;
|
|
20
|
+
/**
|
|
21
|
+
* Return every starting index where `needle` occurs in `haystack`.
|
|
22
|
+
* Empty needles return no occurrences (empty matches are not useful
|
|
23
|
+
* for anchor placement).
|
|
24
|
+
*/
|
|
25
|
+
/**
|
|
26
|
+
* Score how well the docx-side `before` / `after` context matches the
|
|
27
|
+
* surroundings of a candidate position in the target text. Used by
|
|
28
|
+
* `verify-anchors` to tell apart "multiple hits but context picks one
|
|
29
|
+
* cleanly" (sync will place it correctly) from "multiple hits, context
|
|
30
|
+
* doesn't help" (truly ambiguous, needs human placement).
|
|
31
|
+
*
|
|
32
|
+
* Returns 0 if no context was provided.
|
|
33
|
+
*/
|
|
34
|
+
export declare function scoreContextAt(pos: number, text: string, before: string, after: string, anchorLen: number): number;
|
|
35
|
+
export declare function findAllOccurrences(haystack: string, needle: string): number[];
|
|
36
|
+
/**
|
|
37
|
+
* Find candidate positions for `anchor` in `text`, falling back through
|
|
38
|
+
* progressively looser strategies (whitespace normalization, stripped
|
|
39
|
+
* CriticMarkup, partial-prefix, surrounding context, word splitting).
|
|
40
|
+
*
|
|
41
|
+
* The returned `strategy` lets callers distinguish a clean direct hit
|
|
42
|
+
* from a fuzzy approximation — useful for drift reporting.
|
|
43
|
+
*/
|
|
44
|
+
export declare function findAnchorInText(anchor: string, text: string, before?: string, after?: string): AnchorSearchResult;
|
|
45
|
+
/**
|
|
46
|
+
* Classify a strategy as a clean hit, a fuzzy/drifted hit, or no hit.
|
|
47
|
+
* Used by `verify-anchors` to summarize per-comment match quality.
|
|
48
|
+
*/
|
|
49
|
+
export type AnchorMatchQuality = 'clean' | 'drift' | 'context-only' | 'unmatched';
|
|
50
|
+
export declare function classifyStrategy(strategy: AnchorStrategy, occurrences: number): AnchorMatchQuality;
|
|
51
|
+
//# sourceMappingURL=anchor-match.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"anchor-match.d.ts","sourceRoot":"","sources":["../../lib/anchor-match.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,MAAM,MAAM,cAAc,GACtB,QAAQ,GACR,YAAY,GACZ,UAAU,GACV,eAAe,GACf,wBAAwB,GACxB,cAAc,GACd,gBAAgB,GAChB,eAAe,GACf,aAAa,GACb,cAAc,GACd,QAAQ,CAAC;AAEb,MAAM,WAAW,kBAAkB;IACjC,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,QAAQ,EAAE,cAAc,CAAC;IACzB,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAOtD;AAED;;;;GAIG;AACH;;;;;;;;GAQG;AACH,wBAAgB,cAAc,CAC5B,GAAG,EAAE,MAAM,EACX,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,MAAM,GAChB,MAAM,CAqBR;AAED,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAS7E;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,MAAM,EACd,IAAI,EAAE,MAAM,EACZ,MAAM,GAAE,MAAW,EACnB,KAAK,GAAE,MAAW,GACjB,kBAAkB,CA2IpB;AAED;;;GAGG;AACH,MAAM,MAAM,kBAAkB,GAAG,OAAO,GAAG,OAAO,GAAG,cAAc,GAAG,WAAW,CAAC;AAElF,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,GAAG,kBAAkB,CAoBlG"}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anchor matching primitives shared between sync (insertion) and
|
|
3
|
+
* verify-anchors (drift reporting). The functions are pure: given an
|
|
4
|
+
* anchor string and surrounding context, locate candidate positions in
|
|
5
|
+
* a target text using progressively looser strategies.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Strip CriticMarkup so the matcher sees plain prose instead of
|
|
9
|
+
* `{++inserted++}`/`{--deleted--}`/etc. Used when an anchor lives
|
|
10
|
+
* underneath previously imported track changes.
|
|
11
|
+
*/
|
|
12
|
+
export function stripCriticMarkup(text) {
|
|
13
|
+
return text
|
|
14
|
+
.replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep new text
|
|
15
|
+
.replace(/\{--([^-]*)--\}/g, '') // deletions: remove old text
|
|
16
|
+
.replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
|
|
17
|
+
.replace(/\{>>[\s\S]*?<<\}/g, '') // comments: remove (non-greedy; comment text may contain '<')
|
|
18
|
+
.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Return every starting index where `needle` occurs in `haystack`.
|
|
22
|
+
* Empty needles return no occurrences (empty matches are not useful
|
|
23
|
+
* for anchor placement).
|
|
24
|
+
*/
|
|
25
|
+
/**
|
|
26
|
+
* Score how well the docx-side `before` / `after` context matches the
|
|
27
|
+
* surroundings of a candidate position in the target text. Used by
|
|
28
|
+
* `verify-anchors` to tell apart "multiple hits but context picks one
|
|
29
|
+
* cleanly" (sync will place it correctly) from "multiple hits, context
|
|
30
|
+
* doesn't help" (truly ambiguous, needs human placement).
|
|
31
|
+
*
|
|
32
|
+
* Returns 0 if no context was provided.
|
|
33
|
+
*/
|
|
34
|
+
export function scoreContextAt(pos, text, before, after, anchorLen) {
|
|
35
|
+
let score = 0;
|
|
36
|
+
if (before) {
|
|
37
|
+
const contextBefore = text.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
|
|
38
|
+
const beforeLower = before.toLowerCase();
|
|
39
|
+
const beforeWords = beforeLower.split(/\s+/).filter(w => w.length > 3);
|
|
40
|
+
for (const word of beforeWords) {
|
|
41
|
+
if (contextBefore.includes(word))
|
|
42
|
+
score += 2;
|
|
43
|
+
}
|
|
44
|
+
if (contextBefore.includes(beforeLower.slice(-30)))
|
|
45
|
+
score += 5;
|
|
46
|
+
}
|
|
47
|
+
if (after) {
|
|
48
|
+
const contextAfter = text.slice(pos + anchorLen, pos + anchorLen + after.length + 20).toLowerCase();
|
|
49
|
+
const afterLower = after.toLowerCase();
|
|
50
|
+
const afterWords = afterLower.split(/\s+/).filter(w => w.length > 3);
|
|
51
|
+
for (const word of afterWords) {
|
|
52
|
+
if (contextAfter.includes(word))
|
|
53
|
+
score += 2;
|
|
54
|
+
}
|
|
55
|
+
if (contextAfter.includes(afterLower.slice(0, 30)))
|
|
56
|
+
score += 5;
|
|
57
|
+
}
|
|
58
|
+
return score;
|
|
59
|
+
}
|
|
60
|
+
export function findAllOccurrences(haystack, needle) {
|
|
61
|
+
if (!needle || needle.length === 0)
|
|
62
|
+
return [];
|
|
63
|
+
const occurrences = [];
|
|
64
|
+
let idx = 0;
|
|
65
|
+
while ((idx = haystack.indexOf(needle, idx)) !== -1) {
|
|
66
|
+
occurrences.push(idx);
|
|
67
|
+
idx += 1;
|
|
68
|
+
}
|
|
69
|
+
return occurrences;
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* Find candidate positions for `anchor` in `text`, falling back through
|
|
73
|
+
* progressively looser strategies (whitespace normalization, stripped
|
|
74
|
+
* CriticMarkup, partial-prefix, surrounding context, word splitting).
|
|
75
|
+
*
|
|
76
|
+
* The returned `strategy` lets callers distinguish a clean direct hit
|
|
77
|
+
* from a fuzzy approximation — useful for drift reporting.
|
|
78
|
+
*/
|
|
79
|
+
/**
 * Attempt to locate the position(s) occupied by `anchor` inside `text`
 * using only the surrounding context strings. Used both when the anchor is
 * empty and as a mid-priority fallback strategy.
 *
 * Matching rules (all case-insensitive):
 *  - both contexts: find the last 50 chars of `before`, then the first
 *    50 chars of `after` within a 500-char window after it; the gap start
 *    is the match position ('context-both').
 *  - before only: position just past the last occurrence of the final
 *    30 chars of `before` ('context-before').
 *  - after only: position of the first 30 chars of `after` ('context-after').
 *
 * @param {string} textLower lowercased document text to search in
 * @param {string} before    text expected immediately before the anchor (may be '')
 * @param {string} after     text expected immediately after the anchor (may be '')
 * @returns {{occurrences: number[], matchedAnchor: null, strategy: string}|null}
 *          a match result, or null when no context placement was found
 */
function matchByContext(textLower, before, after) {
    const beforeLower = (before || '').toLowerCase();
    const afterLower = (after || '').toLowerCase();
    if (before && after) {
        const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
        if (beforeIdx !== -1) {
            const searchStart = beforeIdx + beforeLower.slice(-50).length;
            const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
            // Only accept when the after-context begins within 500 chars of
            // the before-context — otherwise the two hits are unrelated.
            if (afterIdx !== -1 && afterIdx - searchStart < 500) {
                return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
            }
        }
    }
    if (before) {
        const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
        if (beforeIdx !== -1) {
            return {
                occurrences: [beforeIdx + beforeLower.slice(-30).length],
                matchedAnchor: null,
                strategy: 'context-before',
            };
        }
    }
    if (after) {
        const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
        if (afterIdx !== -1) {
            return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
        }
    }
    return null;
}
/**
 * Find where `anchor` occurs in `text`, trying progressively fuzzier
 * strategies until one matches (all comparisons case-insensitive):
 *
 *  1. 'direct'                  — exact substring match.
 *  2. 'normalized'              — match after collapsing runs of whitespace.
 *  3. 'stripped'                — match in a CriticMarkup-stripped copy of `text`.
 *  4. 'partial-start[-stripped]'— first 3–6 words of the anchor (>= 15 chars).
 *  5. 'context-both/before/after' — placement from `before`/`after` context only.
 *  6. 'split-match'             — an anchor fragment split on ' ', ', ', '. ',
 *                                 ' - ', ' – ' (fragment >= 4 chars, < 5 hits).
 *
 * @param {string} anchor text to locate (may be empty — then only context is used)
 * @param {string} text   document text to search
 * @param {string} before context expected before the anchor (default '')
 * @param {string} after  context expected after the anchor (default '')
 * @returns {{occurrences: number[], matchedAnchor: string|null, strategy: string, stripped?: boolean}}
 *          occurrences are offsets; strategy is 'empty-anchor' or 'failed' when nothing matched.
 */
export function findAnchorInText(anchor, text, before = '', after = '') {
    // Empty anchor: skip directly to context-based matching.
    if (!anchor || anchor.trim().length === 0) {
        if (before || after) {
            const contextMatch = matchByContext(text.toLowerCase(), before, after);
            if (contextMatch) {
                return contextMatch;
            }
        }
        return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
    }
    const anchorLower = anchor.toLowerCase();
    const textLower = text.toLowerCase();
    // Strategy 1: direct match.
    let occurrences = findAllOccurrences(textLower, anchorLower);
    if (occurrences.length > 0) {
        return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
    }
    // Strategy 2: normalized whitespace.
    // NOTE(review): `idx` is an offset into the whitespace-collapsed copy of
    // `text`, not into `text` itself — callers of the 'normalized' strategy
    // appear to need to account for that; confirm against call sites.
    const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
    const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
    const idx = normalizedText.indexOf(normalizedAnchor);
    if (idx !== -1) {
        return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
    }
    // Strategy 3: match in stripped CriticMarkup version.
    const strippedText = stripCriticMarkup(text);
    const strippedLower = strippedText.toLowerCase();
    occurrences = findAllOccurrences(strippedLower, anchorLower);
    if (occurrences.length > 0) {
        return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
    }
    // Strategy 4: first N words of anchor (long anchors). Try the longest
    // prefix first (6 words) down to 3, requiring at least 15 characters so
    // the prefix stays distinctive.
    const words = anchor.split(/\s+/);
    if (words.length > 3) {
        for (let n = Math.min(6, words.length); n >= 3; n--) {
            const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
            if (partialAnchor.length >= 15) {
                occurrences = findAllOccurrences(textLower, partialAnchor);
                if (occurrences.length > 0) {
                    return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
                }
                occurrences = findAllOccurrences(strippedLower, partialAnchor);
                if (occurrences.length > 0) {
                    return {
                        occurrences,
                        matchedAnchor: words.slice(0, n).join(' '),
                        strategy: 'partial-start-stripped',
                        stripped: true,
                    };
                }
            }
        }
    }
    // Strategy 5: context (before/after) only.
    if (before || after) {
        const contextMatch = matchByContext(textLower, before, after);
        if (contextMatch) {
            return contextMatch;
        }
    }
    // Strategy 6: split anchor on transition characters and accept a
    // fragment that matches a small number of times (< 5, to avoid
    // ambiguous placements).
    const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
    for (const sep of splitPatterns) {
        if (anchor.includes(sep)) {
            const parts = anchor.split(sep).filter(p => p.length >= 4);
            for (const part of parts) {
                const partLower = part.toLowerCase();
                occurrences = findAllOccurrences(textLower, partLower);
                if (occurrences.length > 0 && occurrences.length < 5) {
                    return { occurrences, matchedAnchor: part, strategy: 'split-match' };
                }
            }
        }
    }
    return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
}
|
|
205
|
+
/**
 * Collapse a matching strategy name plus its occurrence count into one of
 * four coarse quality buckets: 'clean', 'drift', 'context-only', or
 * 'unmatched'. Zero occurrences is always 'unmatched', whatever the
 * strategy; an unrecognized strategy name also falls back to 'unmatched'.
 *
 * @param {string} strategy    strategy label produced by findAnchorInText
 * @param {number} occurrences number of positions that strategy matched
 * @returns {string} one of 'clean' | 'drift' | 'context-only' | 'unmatched'
 */
export function classifyStrategy(strategy, occurrences) {
    // Nothing matched at all — strategy is irrelevant.
    if (occurrences === 0) {
        return 'unmatched';
    }
    const exactStrategies = ['direct', 'normalized'];
    const driftStrategies = ['stripped', 'partial-start', 'partial-start-stripped', 'split-match'];
    const contextStrategies = ['context-both', 'context-before', 'context-after'];
    if (exactStrategies.includes(strategy)) {
        return 'clean';
    }
    if (driftStrategies.includes(strategy)) {
        return 'drift';
    }
    if (contextStrategies.includes(strategy)) {
        return 'context-only';
    }
    // 'empty-anchor', 'failed', and anything unknown.
    return 'unmatched';
}
|
|
227
|
+
//# sourceMappingURL=anchor-match.js.map
|