docrev 0.9.11 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +50 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +80 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/import.d.ts.map +1 -1
  43. package/dist/lib/import.js +146 -24
  44. package/dist/lib/import.js.map +1 -1
  45. package/dist/lib/pdf-comments.js +44 -44
  46. package/dist/lib/plugins.js +57 -57
  47. package/dist/lib/pptx-themes.js +115 -115
  48. package/dist/lib/spelling.js +2 -2
  49. package/dist/lib/templates.js +387 -387
  50. package/dist/lib/themes.js +51 -51
  51. package/dist/lib/types.d.ts +20 -0
  52. package/dist/lib/types.d.ts.map +1 -1
  53. package/dist/lib/word-extraction.d.ts +6 -0
  54. package/dist/lib/word-extraction.d.ts.map +1 -1
  55. package/dist/lib/word-extraction.js +46 -3
  56. package/dist/lib/word-extraction.js.map +1 -1
  57. package/dist/lib/wordcomments.d.ts.map +1 -1
  58. package/dist/lib/wordcomments.js +23 -5
  59. package/dist/lib/wordcomments.js.map +1 -1
  60. package/eslint.config.js +27 -27
  61. package/lib/anchor-match.ts +276 -276
  62. package/lib/annotations.ts +644 -644
  63. package/lib/build.ts +1300 -1227
  64. package/lib/citations.ts +160 -160
  65. package/lib/commands/build.ts +833 -801
  66. package/lib/commands/citations.ts +515 -515
  67. package/lib/commands/comments.ts +1050 -1050
  68. package/lib/commands/context.ts +174 -174
  69. package/lib/commands/core.ts +309 -309
  70. package/lib/commands/doi.ts +435 -435
  71. package/lib/commands/file-ops.ts +372 -372
  72. package/lib/commands/history.ts +320 -320
  73. package/lib/commands/index.ts +87 -87
  74. package/lib/commands/init.ts +259 -259
  75. package/lib/commands/merge-resolve.ts +378 -378
  76. package/lib/commands/preview.ts +178 -178
  77. package/lib/commands/project-info.ts +244 -244
  78. package/lib/commands/quality.ts +517 -517
  79. package/lib/commands/response.ts +454 -454
  80. package/lib/commands/section-boundaries.ts +82 -82
  81. package/lib/commands/sections.ts +451 -451
  82. package/lib/commands/sync.ts +706 -706
  83. package/lib/commands/text-ops.ts +449 -449
  84. package/lib/commands/utilities.ts +448 -448
  85. package/lib/commands/verify-anchors.ts +272 -272
  86. package/lib/commands/word-tools.ts +340 -340
  87. package/lib/comment-realign.ts +517 -517
  88. package/lib/config.ts +84 -84
  89. package/lib/crossref.ts +781 -781
  90. package/lib/csl.ts +191 -191
  91. package/lib/dependencies.ts +98 -98
  92. package/lib/diff-engine.ts +465 -465
  93. package/lib/doi-cache.ts +115 -115
  94. package/lib/doi.ts +897 -897
  95. package/lib/equations.ts +506 -506
  96. package/lib/errors.ts +346 -346
  97. package/lib/format.ts +541 -541
  98. package/lib/git.ts +326 -326
  99. package/lib/grammar.ts +303 -303
  100. package/lib/image-registry.ts +180 -180
  101. package/lib/import.ts +911 -792
  102. package/lib/journals.ts +543 -543
  103. package/lib/merge.ts +633 -633
  104. package/lib/orcid.ts +144 -144
  105. package/lib/pdf-comments.ts +263 -263
  106. package/lib/pdf-import.ts +524 -524
  107. package/lib/plugins.ts +362 -362
  108. package/lib/postprocess.ts +188 -188
  109. package/lib/pptx-color-filter.lua +37 -37
  110. package/lib/pptx-template.ts +469 -469
  111. package/lib/pptx-themes.ts +483 -483
  112. package/lib/protect-restore.ts +520 -520
  113. package/lib/rate-limiter.ts +94 -94
  114. package/lib/response.ts +197 -197
  115. package/lib/restore-references.ts +240 -240
  116. package/lib/review.ts +327 -327
  117. package/lib/schema.ts +417 -417
  118. package/lib/scientific-words.ts +73 -73
  119. package/lib/sections.ts +335 -335
  120. package/lib/slides.ts +756 -756
  121. package/lib/spelling.ts +334 -334
  122. package/lib/templates.ts +526 -526
  123. package/lib/themes.ts +742 -742
  124. package/lib/trackchanges.ts +247 -247
  125. package/lib/tui.ts +450 -450
  126. package/lib/types.ts +550 -530
  127. package/lib/undo.ts +250 -250
  128. package/lib/utils.ts +69 -69
  129. package/lib/variables.ts +179 -179
  130. package/lib/word-extraction.ts +806 -759
  131. package/lib/word.ts +643 -643
  132. package/lib/wordcomments.ts +817 -798
  133. package/package.json +137 -137
  134. package/scripts/postbuild.js +28 -28
  135. package/skill/REFERENCE.md +431 -431
  136. package/skill/SKILL.md +258 -258
  137. package/tsconfig.json +26 -26
  138. package/types/index.d.ts +525 -525
package/lib/pdf-import.ts CHANGED
@@ -1,524 +1,524 @@
1
- /**
2
- * PDF comment extraction for docrev
3
- *
4
- * Extracts annotations (comments, highlights, sticky notes) from PDF files
5
- * and converts them to CriticMarkup format for insertion into markdown.
6
- * Also extracts the actual text content under highlights using pdfjs-dist.
7
- */
8
-
9
- import * as fs from 'fs';
10
- import { PDFDocument } from 'pdf-lib';
11
-
12
- /**
13
- * Annotation types we care about
14
- */
15
- const COMMENT_TYPES = [
16
- 'Text', // Sticky notes
17
- 'FreeText', // Text boxes
18
- 'Highlight', // Highlighted text with comment
19
- 'Underline', // Underlined text with comment
20
- 'StrikeOut', // Strikethrough (deletion suggestion)
21
- 'Squiggly', // Squiggly underline
22
- 'Popup', // Popup comments (attached to other annotations)
23
- ];
24
-
25
- /**
26
- * Raw PDF annotation extracted from pdf-lib
27
- */
28
- export interface PdfAnnotation {
29
- type: string;
30
- page: number;
31
- contents: string;
32
- author: string;
33
- date: string;
34
- rect: number[];
35
- quadPoints: number[];
36
- }
37
-
38
- /**
39
- * PDF comment converted to CriticMarkup format
40
- */
41
- export interface PdfComment {
42
- author: string;
43
- text: string;
44
- page: number;
45
- type: string;
46
- date?: string;
47
- }
48
-
49
- /**
50
- * PDF annotation with extracted highlighted text
51
- */
52
- export interface PdfAnnotationWithText extends PdfAnnotation {
53
- highlightedText: string;
54
- }
55
-
56
- /**
57
- * Options for PDF extraction
58
- */
59
- export interface ExtractOptions {
60
- timeout?: number;
61
- }
62
-
63
- /**
64
- * Options for markdown insertion
65
- */
66
- export interface InsertOptions {
67
- sectionPerPage?: boolean;
68
- }
69
-
70
- /**
71
- * Statistics about PDF comments
72
- */
73
- export interface PdfCommentStats {
74
- total: number;
75
- byType: Record<string, number>;
76
- byAuthor: Record<string, number>;
77
- byPage: Record<number, number>;
78
- }
79
-
80
- /**
81
- * Text item from pdfjs-dist
82
- */
83
- interface PdfTextItem {
84
- str: string;
85
- x: number;
86
- y: number;
87
- width: number;
88
- height: number;
89
- }
90
-
91
- /**
92
- * Extract raw annotations from a PDF file
93
- * @param pdfPath - Path to PDF file
94
- * @param options - { timeout: number (ms) }
95
- * @returns Array of PDF annotations
96
- */
97
- export async function extractPdfAnnotations(
98
- pdfPath: string,
99
- options: ExtractOptions = {}
100
- ): Promise<PdfAnnotation[]> {
101
- const { timeout = 30000 } = options;
102
-
103
- // Validate file exists
104
- if (!fs.existsSync(pdfPath)) {
105
- throw new Error(`File not found: ${pdfPath}`);
106
- }
107
-
108
- let pdfBytes: Buffer;
109
- try {
110
- pdfBytes = fs.readFileSync(pdfPath);
111
- } catch (err) {
112
- const error = err as Error;
113
- throw new Error(`Cannot read PDF file: ${error.message}`);
114
- }
115
-
116
- // Create a promise that rejects after timeout
117
- const timeoutPromise = new Promise<never>((_, reject) => {
118
- setTimeout(() => reject(new Error(`PDF extraction timed out after ${timeout / 1000}s`)), timeout);
119
- });
120
-
121
- let pdfDoc: PDFDocument;
122
- try {
123
- pdfDoc = await Promise.race([
124
- PDFDocument.load(pdfBytes, { ignoreEncryption: true }),
125
- timeoutPromise,
126
- ]);
127
- } catch (err) {
128
- const error = err as Error;
129
- if (error.message.includes('timed out')) {
130
- throw error;
131
- }
132
- throw new Error(`Invalid or corrupted PDF file: ${error.message}`);
133
- }
134
-
135
- const annotations: PdfAnnotation[] = [];
136
- const pages = pdfDoc.getPages();
137
-
138
- for (let pageNum = 0; pageNum < pages.length; pageNum++) {
139
- const page = pages[pageNum];
140
- const annots = page.node.Annots();
141
-
142
- if (!annots) continue;
143
-
144
- const annotRefs = annots.asArray();
145
-
146
- for (const annotRef of annotRefs) {
147
- try {
148
- const annot = (annotRef as any).dict || annotRef;
149
- if (!annot) continue;
150
-
151
- // Get annotation type
152
- const subtypeName = annot.get(pdfDoc.context.obj('Subtype'));
153
- const subtype = subtypeName?.toString?.()?.replace('/', '') || '';
154
-
155
- if (!COMMENT_TYPES.includes(subtype)) continue;
156
-
157
- // Extract contents (the comment text)
158
- const contentsObj = annot.get(pdfDoc.context.obj('Contents'));
159
- const contents = contentsObj?.toString?.() || contentsObj?.decodeText?.() || '';
160
-
161
- // Extract author (T field in PDF spec)
162
- const authorObj = annot.get(pdfDoc.context.obj('T'));
163
- const author = authorObj?.toString?.() || authorObj?.decodeText?.() || 'Unknown';
164
-
165
- // Extract modification date
166
- const dateObj = annot.get(pdfDoc.context.obj('M'));
167
- const dateStr = dateObj?.toString?.() || '';
168
- const date = parsePdfDate(dateStr);
169
-
170
- // Extract rectangle (position on page)
171
- const rectObj = annot.get(pdfDoc.context.obj('Rect'));
172
- const rect = rectObj?.asArray?.()?.map((n: any) => n?.asNumber?.() || 0) || [0, 0, 0, 0];
173
-
174
- // Extract QuadPoints for highlights (the actual text bounds)
175
- const quadObj = annot.get(pdfDoc.context.obj('QuadPoints'));
176
- const quadPoints = quadObj?.asArray?.()?.map((n: any) => n?.asNumber?.() || 0) || [];
177
-
178
- // Skip empty annotations
179
- if (!contents.trim() && subtype !== 'StrikeOut') continue;
180
-
181
- annotations.push({
182
- type: subtype,
183
- page: pageNum + 1,
184
- contents: cleanPdfString(contents),
185
- author: cleanPdfString(author),
186
- date,
187
- rect,
188
- quadPoints,
189
- });
190
- } catch (err) {
191
- // Skip malformed annotations
192
- continue;
193
- }
194
- }
195
- }
196
-
197
- // Sort by page, then by vertical position (top to bottom)
198
- annotations.sort((a, b) => {
199
- if (a.page !== b.page) return a.page - b.page;
200
- // Higher Y = higher on page in PDF coords
201
- return (b.rect[1] || 0) - (a.rect[1] || 0);
202
- });
203
-
204
- return annotations;
205
- }
206
-
207
- /**
208
- * Parse PDF date string (D:YYYYMMDDHHmmSS format)
209
- * @param dateStr - PDF date string
210
- * @returns ISO date string
211
- */
212
- function parsePdfDate(dateStr: string): string {
213
- if (!dateStr) return '';
214
-
215
- // Remove D: prefix and timezone info
216
- const clean = dateStr.replace(/^D:/, '').replace(/[Z+-].*$/, '');
217
-
218
- if (clean.length >= 8) {
219
- const year = clean.slice(0, 4);
220
- const month = clean.slice(4, 6);
221
- const day = clean.slice(6, 8);
222
- return `${year}-${month}-${day}`;
223
- }
224
-
225
- return '';
226
- }
227
-
228
- /**
229
- * Clean PDF string (remove parentheses, decode escape sequences)
230
- * @param str - Raw PDF string
231
- * @returns Cleaned string
232
- */
233
- function cleanPdfString(str: string): string {
234
- if (!str) return '';
235
-
236
- return str
237
- .replace(/^\(/, '') // Remove leading paren
238
- .replace(/\)$/, '') // Remove trailing paren
239
- .replace(/\\n/g, '\n') // Newlines
240
- .replace(/\\r/g, '') // Carriage returns
241
- .replace(/\\t/g, ' ') // Tabs
242
- .replace(/\\\(/g, '(') // Escaped parens
243
- .replace(/\\\)/g, ')')
244
- .replace(/\\\\/g, '\\') // Escaped backslash
245
- .trim();
246
- }
247
-
248
- /**
249
- * Convert PDF annotations to CriticMarkup comments
250
- * @param annotations - From extractPdfAnnotations
251
- * @returns Array of PDF comments
252
- */
253
- export function annotationsToComments(annotations: PdfAnnotation[]): PdfComment[] {
254
- return annotations
255
- .filter(a => a.contents.trim())
256
- .map(a => ({
257
- author: a.author || 'Reviewer',
258
- text: a.contents,
259
- page: a.page,
260
- type: a.type,
261
- date: a.date,
262
- }));
263
- }
264
-
265
- /**
266
- * Extract comments from PDF and format for display
267
- * @param pdfPath - Path to PDF file
268
- * @returns Array of PDF comments
269
- */
270
- export async function extractPdfComments(pdfPath: string): Promise<PdfComment[]> {
271
- const annotations = await extractPdfAnnotations(pdfPath);
272
- return annotationsToComments(annotations);
273
- }
274
-
275
- /**
276
- * Insert PDF comments into markdown based on page/position heuristics
277
- * Since PDFs don't have direct text anchors like Word, we use page numbers
278
- * and append comments to the end of corresponding sections
279
- *
280
- * @param markdown - The markdown content
281
- * @param comments - Comments from extractPdfComments
282
- * @param options - { sectionPerPage: boolean }
283
- * @returns Markdown with comments inserted
284
- */
285
- export function insertPdfCommentsIntoMarkdown(
286
- markdown: string,
287
- comments: PdfComment[],
288
- options: InsertOptions = {}
289
- ): string {
290
- if (comments.length === 0) return markdown;
291
-
292
- // Group comments by page
293
- const commentsByPage = new Map<number, PdfComment[]>();
294
- for (const c of comments) {
295
- if (!commentsByPage.has(c.page)) {
296
- commentsByPage.set(c.page, []);
297
- }
298
- commentsByPage.get(c.page)!.push(c);
299
- }
300
-
301
- // Strategy: Append all comments at the end with page references
302
- // This is the safest approach since we can't reliably map PDF positions to markdown
303
- const lines = markdown.split('\n');
304
- const commentBlock: string[] = [];
305
-
306
- commentBlock.push('');
307
- commentBlock.push('<!-- PDF Comments -->');
308
-
309
- for (const [page, pageComments] of Array.from(commentsByPage.entries())) {
310
- for (const c of pageComments) {
311
- const authorPrefix = c.author ? `${c.author}: ` : '';
312
- const pageRef = `[p.${page}]`;
313
- commentBlock.push(`{>>${authorPrefix}${pageRef} ${c.text}<<}`);
314
- }
315
- }
316
-
317
- return lines.join('\n') + commentBlock.join('\n');
318
- }
319
-
320
- /**
321
- * Format PDF comments for CLI display
322
- * @param comments - Array of PDF comments
323
- * @returns Formatted string
324
- */
325
- export function formatPdfComments(comments: PdfComment[]): string {
326
- if (comments.length === 0) {
327
- return 'No comments found in PDF.';
328
- }
329
-
330
- const lines: string[] = [];
331
- let currentPage = 0;
332
-
333
- for (const c of comments) {
334
- if (c.page !== currentPage) {
335
- if (currentPage > 0) lines.push('');
336
- lines.push(`Page ${c.page}:`);
337
- currentPage = c.page;
338
- }
339
-
340
- const typeIcon = getTypeIcon(c.type);
341
- const author = c.author || 'Unknown';
342
- lines.push(` ${typeIcon} [${author}] ${c.text}`);
343
- }
344
-
345
- return lines.join('\n');
346
- }
347
-
348
- /**
349
- * Get icon for annotation type
350
- * @param type - Annotation type
351
- * @returns Icon string
352
- */
353
- function getTypeIcon(type: string): string {
354
- switch (type) {
355
- case 'Text': return '📝'; // Sticky note
356
- case 'FreeText': return '💬'; // Text box
357
- case 'Highlight': return '🖍️'; // Highlight
358
- case 'Underline': return '📍'; // Underline
359
- case 'StrikeOut': return '❌'; // Strikethrough
360
- case 'Squiggly': return '〰️'; // Squiggly
361
- default: return '💬';
362
- }
363
- }
364
-
365
- /**
366
- * Get statistics about PDF comments
367
- * @param comments - Array of PDF comments
368
- * @returns Statistics object
369
- */
370
- export function getPdfCommentStats(comments: PdfComment[]): PdfCommentStats {
371
- const stats: PdfCommentStats = {
372
- total: comments.length,
373
- byType: {},
374
- byAuthor: {},
375
- byPage: {},
376
- };
377
-
378
- for (const c of comments) {
379
- stats.byType[c.type] = (stats.byType[c.type] || 0) + 1;
380
- stats.byAuthor[c.author] = (stats.byAuthor[c.author] || 0) + 1;
381
- stats.byPage[c.page] = (stats.byPage[c.page] || 0) + 1;
382
- }
383
-
384
- return stats;
385
- }
386
-
387
- /**
388
- * Extract text content from a PDF page
389
- * @param page - pdfjs page object
390
- * @returns Array of text items with positions
391
- */
392
- async function getPageTextItems(page: any): Promise<PdfTextItem[]> {
393
- const textContent = await page.getTextContent();
394
- return textContent.items.map((item: any) => ({
395
- str: item.str,
396
- x: item.transform[4],
397
- y: item.transform[5],
398
- width: item.width,
399
- height: item.height,
400
- }));
401
- }
402
-
403
- /**
404
- * Check if a point is inside a quadrilateral defined by QuadPoints
405
- * QuadPoints format: [x1,y1, x2,y2, x3,y3, x4,y4] for each quad
406
- * @param x - X coordinate
407
- * @param y - Y coordinate
408
- * @param quad - 8 numbers defining corners
409
- * @returns True if point is inside quad
410
- */
411
- function isPointInQuad(x: number, y: number, quad: number[]): boolean {
412
- if (quad.length < 8) return false;
413
-
414
- // Get bounding box from quad points
415
- const xs = [quad[0], quad[2], quad[4], quad[6]];
416
- const ys = [quad[1], quad[3], quad[5], quad[7]];
417
- const minX = Math.min(...xs);
418
- const maxX = Math.max(...xs);
419
- const minY = Math.min(...ys);
420
- const maxY = Math.max(...ys);
421
-
422
- return x >= minX && x <= maxX && y >= minY && y <= maxY;
423
- }
424
-
425
- /**
426
- * Extract highlighted text from a PDF using QuadPoints
427
- * @param pdfPath - Path to PDF file
428
- * @param annotations - Annotations with quadPoints from extractPdfAnnotations
429
- * @returns Annotations with highlighted text extracted
430
- */
431
- export async function extractHighlightedText(
432
- pdfPath: string,
433
- annotations: PdfAnnotation[]
434
- ): Promise<PdfAnnotationWithText[]> {
435
- const pdfBytes = fs.readFileSync(pdfPath);
436
- const data = new Uint8Array(pdfBytes);
437
-
438
- // Load pdfjs-dist dynamically (requires DOMMatrix, not available in Node 18)
439
- const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs');
440
- const loadingTask = getDocument({ data, useSystemFonts: true });
441
- const pdfDoc = await loadingTask.promise;
442
-
443
- const results: PdfAnnotationWithText[] = [];
444
-
445
- for (const annot of annotations) {
446
- // Only process text markup annotations (Highlight, Underline, StrikeOut, Squiggly)
447
- if (!['Highlight', 'Underline', 'StrikeOut', 'Squiggly'].includes(annot.type)) {
448
- results.push({ ...annot, highlightedText: '' });
449
- continue;
450
- }
451
-
452
- if (!annot.quadPoints || annot.quadPoints.length < 8) {
453
- results.push({ ...annot, highlightedText: '' });
454
- continue;
455
- }
456
-
457
- try {
458
- const page = await pdfDoc.getPage(annot.page);
459
- const textItems = await getPageTextItems(page);
460
-
461
- // Split quadPoints into individual quads (8 numbers each)
462
- const quads: number[][] = [];
463
- for (let i = 0; i < annot.quadPoints.length; i += 8) {
464
- quads.push(annot.quadPoints.slice(i, i + 8));
465
- }
466
-
467
- // Find text items that fall within any of the quads
468
- const matchedText: string[] = [];
469
- for (const item of textItems) {
470
- // Check if text item center is in any quad
471
- const centerX = item.x + (item.width || 0) / 2;
472
- const centerY = item.y + (item.height || 0) / 2;
473
-
474
- for (const quad of quads) {
475
- if (isPointInQuad(centerX, centerY, quad) || isPointInQuad(item.x, item.y, quad)) {
476
- matchedText.push(item.str);
477
- break;
478
- }
479
- }
480
- }
481
-
482
- results.push({
483
- ...annot,
484
- highlightedText: matchedText.join(' ').trim(),
485
- });
486
- } catch (err) {
487
- // If text extraction fails, just return empty
488
- results.push({ ...annot, highlightedText: '' });
489
- }
490
- }
491
-
492
- return results;
493
- }
494
-
495
- /**
496
- * Extract annotations with highlighted text in one call
497
- * @param pdfPath - Path to PDF file
498
- * @returns Annotations with highlighted text
499
- */
500
- export async function extractPdfAnnotationsWithText(pdfPath: string): Promise<PdfAnnotationWithText[]> {
501
- const annotations = await extractPdfAnnotations(pdfPath);
502
- return extractHighlightedText(pdfPath, annotations);
503
- }
504
-
505
- /**
506
- * Format annotation with highlighted text for display
507
- * @param annot - Annotation with highlightedText
508
- * @returns Formatted string
509
- */
510
- export function formatAnnotationWithText(annot: PdfAnnotationWithText): string {
511
- const typeIcon = getTypeIcon(annot.type);
512
- const author = annot.author || 'Unknown';
513
- const parts: string[] = [`${typeIcon} [${author}]`];
514
-
515
- if (annot.highlightedText) {
516
- parts.push(`"${annot.highlightedText}"`);
517
- }
518
-
519
- if (annot.contents) {
520
- parts.push(`→ ${annot.contents}`);
521
- }
522
-
523
- return parts.join(' ');
524
- }
1
+ /**
2
+ * PDF comment extraction for docrev
3
+ *
4
+ * Extracts annotations (comments, highlights, sticky notes) from PDF files
5
+ * and converts them to CriticMarkup format for insertion into markdown.
6
+ * Also extracts the actual text content under highlights using pdfjs-dist.
7
+ */
8
+
9
+ import * as fs from 'fs';
10
+ import { PDFDocument } from 'pdf-lib';
11
+
12
+ /**
13
+ * Annotation types we care about
14
+ */
15
+ const COMMENT_TYPES = [
16
+ 'Text', // Sticky notes
17
+ 'FreeText', // Text boxes
18
+ 'Highlight', // Highlighted text with comment
19
+ 'Underline', // Underlined text with comment
20
+ 'StrikeOut', // Strikethrough (deletion suggestion)
21
+ 'Squiggly', // Squiggly underline
22
+ 'Popup', // Popup comments (attached to other annotations)
23
+ ];
24
+
25
+ /**
26
+ * Raw PDF annotation extracted from pdf-lib
27
+ */
28
+ export interface PdfAnnotation {
29
+ type: string;
30
+ page: number;
31
+ contents: string;
32
+ author: string;
33
+ date: string;
34
+ rect: number[];
35
+ quadPoints: number[];
36
+ }
37
+
38
+ /**
39
+ * PDF comment converted to CriticMarkup format
40
+ */
41
+ export interface PdfComment {
42
+ author: string;
43
+ text: string;
44
+ page: number;
45
+ type: string;
46
+ date?: string;
47
+ }
48
+
49
+ /**
50
+ * PDF annotation with extracted highlighted text
51
+ */
52
+ export interface PdfAnnotationWithText extends PdfAnnotation {
53
+ highlightedText: string;
54
+ }
55
+
56
+ /**
57
+ * Options for PDF extraction
58
+ */
59
+ export interface ExtractOptions {
60
+ timeout?: number;
61
+ }
62
+
63
+ /**
64
+ * Options for markdown insertion
65
+ */
66
+ export interface InsertOptions {
67
+ sectionPerPage?: boolean;
68
+ }
69
+
70
+ /**
71
+ * Statistics about PDF comments
72
+ */
73
+ export interface PdfCommentStats {
74
+ total: number;
75
+ byType: Record<string, number>;
76
+ byAuthor: Record<string, number>;
77
+ byPage: Record<number, number>;
78
+ }
79
+
80
+ /**
81
+ * Text item from pdfjs-dist
82
+ */
83
+ interface PdfTextItem {
84
+ str: string;
85
+ x: number;
86
+ y: number;
87
+ width: number;
88
+ height: number;
89
+ }
90
+
91
+ /**
92
+ * Extract raw annotations from a PDF file
93
+ * @param pdfPath - Path to PDF file
94
+ * @param options - { timeout: number (ms) }
95
+ * @returns Array of PDF annotations
96
+ */
97
+ export async function extractPdfAnnotations(
98
+ pdfPath: string,
99
+ options: ExtractOptions = {}
100
+ ): Promise<PdfAnnotation[]> {
101
+ const { timeout = 30000 } = options;
102
+
103
+ // Validate file exists
104
+ if (!fs.existsSync(pdfPath)) {
105
+ throw new Error(`File not found: ${pdfPath}`);
106
+ }
107
+
108
+ let pdfBytes: Buffer;
109
+ try {
110
+ pdfBytes = fs.readFileSync(pdfPath);
111
+ } catch (err) {
112
+ const error = err as Error;
113
+ throw new Error(`Cannot read PDF file: ${error.message}`);
114
+ }
115
+
116
+ // Create a promise that rejects after timeout
117
+ const timeoutPromise = new Promise<never>((_, reject) => {
118
+ setTimeout(() => reject(new Error(`PDF extraction timed out after ${timeout / 1000}s`)), timeout);
119
+ });
120
+
121
+ let pdfDoc: PDFDocument;
122
+ try {
123
+ pdfDoc = await Promise.race([
124
+ PDFDocument.load(pdfBytes, { ignoreEncryption: true }),
125
+ timeoutPromise,
126
+ ]);
127
+ } catch (err) {
128
+ const error = err as Error;
129
+ if (error.message.includes('timed out')) {
130
+ throw error;
131
+ }
132
+ throw new Error(`Invalid or corrupted PDF file: ${error.message}`);
133
+ }
134
+
135
+ const annotations: PdfAnnotation[] = [];
136
+ const pages = pdfDoc.getPages();
137
+
138
+ for (let pageNum = 0; pageNum < pages.length; pageNum++) {
139
+ const page = pages[pageNum];
140
+ const annots = page.node.Annots();
141
+
142
+ if (!annots) continue;
143
+
144
+ const annotRefs = annots.asArray();
145
+
146
+ for (const annotRef of annotRefs) {
147
+ try {
148
+ const annot = (annotRef as any).dict || annotRef;
149
+ if (!annot) continue;
150
+
151
+ // Get annotation type
152
+ const subtypeName = annot.get(pdfDoc.context.obj('Subtype'));
153
+ const subtype = subtypeName?.toString?.()?.replace('/', '') || '';
154
+
155
+ if (!COMMENT_TYPES.includes(subtype)) continue;
156
+
157
+ // Extract contents (the comment text)
158
+ const contentsObj = annot.get(pdfDoc.context.obj('Contents'));
159
+ const contents = contentsObj?.toString?.() || contentsObj?.decodeText?.() || '';
160
+
161
+ // Extract author (T field in PDF spec)
162
+ const authorObj = annot.get(pdfDoc.context.obj('T'));
163
+ const author = authorObj?.toString?.() || authorObj?.decodeText?.() || 'Unknown';
164
+
165
+ // Extract modification date
166
+ const dateObj = annot.get(pdfDoc.context.obj('M'));
167
+ const dateStr = dateObj?.toString?.() || '';
168
+ const date = parsePdfDate(dateStr);
169
+
170
+ // Extract rectangle (position on page)
171
+ const rectObj = annot.get(pdfDoc.context.obj('Rect'));
172
+ const rect = rectObj?.asArray?.()?.map((n: any) => n?.asNumber?.() || 0) || [0, 0, 0, 0];
173
+
174
+ // Extract QuadPoints for highlights (the actual text bounds)
175
+ const quadObj = annot.get(pdfDoc.context.obj('QuadPoints'));
176
+ const quadPoints = quadObj?.asArray?.()?.map((n: any) => n?.asNumber?.() || 0) || [];
177
+
178
+ // Skip empty annotations
179
+ if (!contents.trim() && subtype !== 'StrikeOut') continue;
180
+
181
+ annotations.push({
182
+ type: subtype,
183
+ page: pageNum + 1,
184
+ contents: cleanPdfString(contents),
185
+ author: cleanPdfString(author),
186
+ date,
187
+ rect,
188
+ quadPoints,
189
+ });
190
+ } catch (err) {
191
+ // Skip malformed annotations
192
+ continue;
193
+ }
194
+ }
195
+ }
196
+
197
+ // Sort by page, then by vertical position (top to bottom)
198
+ annotations.sort((a, b) => {
199
+ if (a.page !== b.page) return a.page - b.page;
200
+ // Higher Y = higher on page in PDF coords
201
+ return (b.rect[1] || 0) - (a.rect[1] || 0);
202
+ });
203
+
204
+ return annotations;
205
+ }
206
+
207
+ /**
208
+ * Parse PDF date string (D:YYYYMMDDHHmmSS format)
209
+ * @param dateStr - PDF date string
210
+ * @returns ISO date string
211
+ */
212
+ function parsePdfDate(dateStr: string): string {
213
+ if (!dateStr) return '';
214
+
215
+ // Remove D: prefix and timezone info
216
+ const clean = dateStr.replace(/^D:/, '').replace(/[Z+-].*$/, '');
217
+
218
+ if (clean.length >= 8) {
219
+ const year = clean.slice(0, 4);
220
+ const month = clean.slice(4, 6);
221
+ const day = clean.slice(6, 8);
222
+ return `${year}-${month}-${day}`;
223
+ }
224
+
225
+ return '';
226
+ }
227
+
228
+ /**
229
+ * Clean PDF string (remove parentheses, decode escape sequences)
230
+ * @param str - Raw PDF string
231
+ * @returns Cleaned string
232
+ */
233
+ function cleanPdfString(str: string): string {
234
+ if (!str) return '';
235
+
236
+ return str
237
+ .replace(/^\(/, '') // Remove leading paren
238
+ .replace(/\)$/, '') // Remove trailing paren
239
+ .replace(/\\n/g, '\n') // Newlines
240
+ .replace(/\\r/g, '') // Carriage returns
241
+ .replace(/\\t/g, ' ') // Tabs
242
+ .replace(/\\\(/g, '(') // Escaped parens
243
+ .replace(/\\\)/g, ')')
244
+ .replace(/\\\\/g, '\\') // Escaped backslash
245
+ .trim();
246
+ }
247
+
248
+ /**
249
+ * Convert PDF annotations to CriticMarkup comments
250
+ * @param annotations - From extractPdfAnnotations
251
+ * @returns Array of PDF comments
252
+ */
253
+ export function annotationsToComments(annotations: PdfAnnotation[]): PdfComment[] {
254
+ return annotations
255
+ .filter(a => a.contents.trim())
256
+ .map(a => ({
257
+ author: a.author || 'Reviewer',
258
+ text: a.contents,
259
+ page: a.page,
260
+ type: a.type,
261
+ date: a.date,
262
+ }));
263
+ }
264
+
265
+ /**
266
+ * Extract comments from PDF and format for display
267
+ * @param pdfPath - Path to PDF file
268
+ * @returns Array of PDF comments
269
+ */
270
+ export async function extractPdfComments(pdfPath: string): Promise<PdfComment[]> {
271
+ const annotations = await extractPdfAnnotations(pdfPath);
272
+ return annotationsToComments(annotations);
273
+ }
274
+
275
/**
 * Append PDF comments to a markdown document as CriticMarkup comments.
 *
 * PDFs don't have direct text anchors like Word, so no attempt is made to
 * place a comment next to the text it refers to. Instead every comment is
 * appended after the existing markdown, below a `<!-- PDF Comments -->`
 * marker, as `{>>Author: [p.N] text<<}`. Comments are listed in ascending
 * page order; comments on the same page keep their input order.
 *
 * @param markdown - The markdown content
 * @param comments - Comments from extractPdfComments
 * @param options - Accepted for interface compatibility; not consulted by this implementation
 * @returns The markdown unchanged when there are no comments, otherwise the
 *          markdown followed by the comment block
 */
export function insertPdfCommentsIntoMarkdown(
  markdown: string,
  comments: PdfComment[],
  options: InsertOptions = {}
): string {
  if (comments.length === 0) return markdown;

  // Group comments by page
  const commentsByPage = new Map<number, PdfComment[]>();
  for (const comment of comments) {
    const bucket = commentsByPage.get(comment.page);
    if (bucket) {
      bucket.push(comment);
    } else {
      commentsByPage.set(comment.page, [comment]);
    }
  }

  // Map iteration follows insertion order, so sort the page numbers explicitly;
  // otherwise input that is not already page-ordered yields a shuffled listing.
  const pages = Array.from(commentsByPage.keys()).sort((a, b) => a - b);

  // The leading empty entry makes the marker start on its own line.
  const commentBlock: string[] = ['', '<!-- PDF Comments -->'];
  for (const page of pages) {
    for (const c of commentsByPage.get(page) ?? []) {
      const authorPrefix = c.author ? `${c.author}: ` : '';
      const pageRef = `[p.${page}]`;
      commentBlock.push(`{>>${authorPrefix}${pageRef} ${c.text}<<}`);
    }
  }

  return markdown + commentBlock.join('\n');
}
319
+
320
/**
 * Render PDF comments as plain text for the CLI.
 *
 * A `Page N:` heading is emitted whenever a comment's page differs from the
 * previous comment's page, with a blank line separating page groups. Each
 * comment line shows the annotation-type icon, the author (or `Unknown`
 * when the author is falsy) and the comment text.
 *
 * @param comments - Array of PDF comments
 * @returns Formatted string, or a fixed notice when there are no comments
 */
export function formatPdfComments(comments: PdfComment[]): string {
  if (comments.length === 0) return 'No comments found in PDF.';

  const output: string[] = [];
  let lastPage = 0;

  comments.forEach((comment) => {
    const startsNewPage = comment.page !== lastPage;
    if (startsNewPage) {
      // Separate page groups with a blank line, but not before the first one.
      if (lastPage > 0) output.push('');
      output.push(`Page ${comment.page}:`);
      lastPage = comment.page;
    }

    const icon = getTypeIcon(comment.type);
    const who = comment.author || 'Unknown';
    output.push(` ${icon} [${who}] ${comment.text}`);
  });

  return output.join('\n');
}
347
+
348
/**
 * Look up the display icon for a PDF annotation type.
 *
 * @param type - Annotation type
 * @returns The icon for a known type; the speech-balloon icon for anything else
 */
function getTypeIcon(type: string): string {
  // Known annotation types and their icons; matched by exact name.
  const icons: ReadonlyArray<readonly [string, string]> = [
    ['Text', '📝'], // Sticky note
    ['FreeText', '💬'], // Text box
    ['Highlight', '🖍️'], // Highlight
    ['Underline', '📍'], // Underline
    ['StrikeOut', '❌'], // Strikethrough
    ['Squiggly', '〰️'], // Squiggly
  ];

  const known = icons.find(([name]) => name === type);
  return known ? known[1] : '💬';
}
364
+
365
/**
 * Get statistics about PDF comments.
 *
 * Counts the comments in total and per annotation type, per author and per
 * page. Keys are taken verbatim from the comments.
 *
 * @param comments - Array of PDF comments
 * @returns Statistics object with `total`, `byType`, `byAuthor` and `byPage`
 */
export function getPdfCommentStats(comments: PdfComment[]): PdfCommentStats {
  const stats: PdfCommentStats = {
    total: comments.length,
    byType: {},
    byAuthor: {},
    byPage: {},
  };

  // Author and type names come from the PDF, so they may collide with
  // properties every object inherits ("constructor", "toString", ...) or be
  // "__proto__". Reading and writing through own-property descriptors keeps
  // those names from being mistaken for an existing count or silently dropped.
  const bump = (counts: object, key: string | number): void => {
    const previous: unknown = Object.getOwnPropertyDescriptor(counts, key)?.value;
    Object.defineProperty(counts, key, {
      value: (typeof previous === 'number' ? previous : 0) + 1,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  };

  for (const c of comments) {
    bump(stats.byType, c.type);
    bump(stats.byAuthor, c.author);
    bump(stats.byPage, c.page);
  }

  return stats;
}
386
+
387
/**
 * Collect the text items of one PDF page together with their positions.
 *
 * Each item's `x` and `y` are read from indices 4 and 5 of its `transform`
 * array (presumably the translation part of the text matrix — confirm
 * against the pdfjs text-content documentation); `str`, `width` and `height`
 * are copied through unchanged.
 *
 * @param page - pdfjs page object
 * @returns Array of text items with positions
 */
async function getPageTextItems(page: any): Promise<PdfTextItem[]> {
  const { items } = await page.getTextContent();
  const positioned: PdfTextItem[] = [];

  for (const entry of items) {
    const { str, transform, width, height } = entry;
    positioned.push({ str, x: transform[4], y: transform[5], width, height });
  }

  return positioned;
}
402
+
403
/**
 * Test whether a point lies within the bounding box of a quadrilateral.
 *
 * QuadPoints format: [x1,y1, x2,y2, x3,y3, x4,y4] for each quad. Only the
 * axis-aligned bounding box of the four corners is checked (edges included),
 * not the exact shape of the quad. Entries beyond the first eight are ignored.
 *
 * @param x - X coordinate
 * @param y - Y coordinate
 * @param quad - 8 numbers defining corners
 * @returns True if the point is inside the bounding box; false when fewer
 *          than 8 numbers are supplied
 */
function isPointInQuad(x: number, y: number, quad: number[]): boolean {
  if (quad.length < 8) return false;

  // De-interleave the four corner coordinates.
  const cornerXs: number[] = [];
  const cornerYs: number[] = [];
  for (let i = 0; i < 8; i += 2) {
    cornerXs.push(quad[i]);
    cornerYs.push(quad[i + 1]);
  }

  const withinX = Math.min(...cornerXs) <= x && x <= Math.max(...cornerXs);
  const withinY = Math.min(...cornerYs) <= y && y <= Math.max(...cornerYs);
  return withinX && withinY;
}
424
+
425
/**
 * Extract highlighted text from a PDF using QuadPoints.
 *
 * For every text-markup annotation (Highlight, Underline, StrikeOut,
 * Squiggly) that carries at least one quad, the text items of the
 * annotation's page are tested against the quads and the strings of the
 * matching items are joined with spaces. All other annotations — and any
 * annotation whose page text cannot be read — get an empty `highlightedText`.
 * Result order matches the input order.
 *
 * Page text is extracted at most once per page and reused for every
 * annotation on that page, and the pdfjs loading task is destroyed before
 * returning so the document's resources are released.
 *
 * @param pdfPath - Path to PDF file
 * @param annotations - Annotations with quadPoints from extractPdfAnnotations
 * @returns Annotations with highlighted text extracted
 */
export async function extractHighlightedText(
  pdfPath: string,
  annotations: PdfAnnotation[]
): Promise<PdfAnnotationWithText[]> {
  const pdfBytes = fs.readFileSync(pdfPath);
  const data = new Uint8Array(pdfBytes);

  // Load pdfjs-dist dynamically (requires DOMMatrix, not available in Node 18)
  const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs');
  const loadingTask = getDocument({ data, useSystemFonts: true });

  // Only these annotation types mark up existing page text.
  const markupTypes = ['Highlight', 'Underline', 'StrikeOut', 'Squiggly'];
  // Text items already extracted, keyed by page number. Failed extractions are
  // not cached, so a later annotation on the same page retries.
  const textItemsByPage = new Map<number, PdfTextItem[]>();
  const results: PdfAnnotationWithText[] = [];

  try {
    const pdfDoc = await loadingTask.promise;

    for (const annot of annotations) {
      const hasQuads = !!annot.quadPoints && annot.quadPoints.length >= 8;
      if (!markupTypes.includes(annot.type) || !hasQuads) {
        results.push({ ...annot, highlightedText: '' });
        continue;
      }

      try {
        let textItems = textItemsByPage.get(annot.page);
        if (!textItems) {
          const page = await pdfDoc.getPage(annot.page);
          textItems = await getPageTextItems(page);
          textItemsByPage.set(annot.page, textItems);
        }

        // Split quadPoints into individual quads (8 numbers each)
        const quads: number[][] = [];
        for (let i = 0; i < annot.quadPoints.length; i += 8) {
          quads.push(annot.quadPoints.slice(i, i + 8));
        }

        // Keep text items whose center or origin falls within any of the quads
        const matchedText: string[] = [];
        for (const item of textItems) {
          const centerX = item.x + (item.width || 0) / 2;
          const centerY = item.y + (item.height || 0) / 2;

          const isCovered = quads.some(
            (quad) => isPointInQuad(centerX, centerY, quad) || isPointInQuad(item.x, item.y, quad)
          );
          if (isCovered) matchedText.push(item.str);
        }

        results.push({
          ...annot,
          highlightedText: matchedText.join(' ').trim(),
        });
      } catch {
        // Best effort: if text extraction fails for this annotation, report
        // it with no highlighted text rather than failing the whole call.
        results.push({ ...annot, highlightedText: '' });
      }
    }
  } finally {
    // Release the document and its worker. A cleanup failure must not discard
    // the results that were already extracted, so it is deliberately ignored.
    await loadingTask.destroy().catch(() => undefined);
  }

  return results;
}
494
+
495
/**
 * Read a PDF's annotations and attach the text each one marks up.
 *
 * Convenience wrapper: runs extractPdfAnnotations on the file and passes
 * the result, together with the same path, to extractHighlightedText.
 *
 * @param pdfPath - Path to PDF file
 * @returns Annotations with highlighted text
 */
export async function extractPdfAnnotationsWithText(pdfPath: string): Promise<PdfAnnotationWithText[]> {
  return extractHighlightedText(pdfPath, await extractPdfAnnotations(pdfPath));
}
504
+
505
/**
 * Render one annotation as a single display line.
 *
 * The line always starts with the type icon and the author in brackets
 * (`Unknown` when the author is falsy). The highlighted text, in double
 * quotes, and the annotation contents, after an arrow, follow only when
 * they are non-empty. Segments are separated by single spaces.
 *
 * @param annot - Annotation with highlightedText
 * @returns Formatted string
 */
export function formatAnnotationWithText(annot: PdfAnnotationWithText): string {
  const header = `${getTypeIcon(annot.type)} [${annot.author || 'Unknown'}]`;

  return [
    header,
    ...(annot.highlightedText ? [`"${annot.highlightedText}"`] : []),
    ...(annot.contents ? [`→ ${annot.contents}`] : []),
  ].join(' ');
}