docrev 0.6.13 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,446 @@
1
+ /**
2
+ * PDF comment extraction for docrev
3
+ *
4
+ * Extracts annotations (comments, highlights, sticky notes) from PDF files
5
+ * and converts them to CriticMarkup format for insertion into markdown.
6
+ * Also extracts the actual text content under highlights using pdfjs-dist.
7
+ */
8
+
9
+ import * as fs from 'fs';
10
+ import { PDFDocument } from 'pdf-lib';
11
+
12
/**
 * PDF annotation subtypes that are treated as reviewer comments.
 * Annotations whose subtype is not in this list are skipped during extraction.
 */
const COMMENT_TYPES = [
  // Free-standing notes
  'Text', // sticky note
  'FreeText', // text box

  // Text markup, optionally carrying a comment
  'Highlight',
  'Underline',
  'StrikeOut', // deletion suggestion
  'Squiggly', // squiggly underline

  // Popup comments attached to other annotations
  'Popup',
];
24
+
25
/**
 * Decode a pdf-lib string object (PDFString / PDFHexString) into plain text.
 *
 * Prefers the object's own `decodeText()` so hex-encoded / UTF-16 strings and
 * escape sequences are decoded by the library; falls back to cleaning the raw
 * `toString()` form when `decodeText` is unavailable or throws.
 * @param {object|undefined} obj - Value read from an annotation dictionary
 * @returns {string} Trimmed text, or '' when nothing usable is present
 */
function decodePdfTextObject(obj) {
  if (!obj) return '';

  if (typeof obj.decodeText === 'function') {
    try {
      // Normalise CR / CRLF line breaks to '\n' so multi-line comments keep their lines.
      return obj.decodeText().replace(/\r\n?/g, '\n').trim();
    } catch {
      // Malformed encoding: fall back to the raw serialized form below.
    }
  }

  return cleanPdfString(obj.toString?.() || '');
}

/**
 * Extract raw annotations from a PDF file.
 *
 * Only annotations whose subtype is listed in COMMENT_TYPES are returned.
 * Annotations with blank contents are skipped, except StrikeOut annotations.
 * Malformed annotations are skipped rather than failing the whole extraction.
 * The result is sorted by page, then top-to-bottom by the rectangle's Y value.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Maximum time in ms to wait for the PDF to load
 * @returns {Promise<Array<{type: string, page: number, contents: string, author: string, date: string, rect: number[], quadPoints: number[]}>>}
 * @throws {Error} If the file does not exist, cannot be read, cannot be parsed,
 *   or loading exceeds the timeout
 */
export async function extractPdfAnnotations(pdfPath, options = {}) {
  const { timeout = 30000 } = options;

  if (!fs.existsSync(pdfPath)) {
    throw new Error(`File not found: ${pdfPath}`);
  }

  let pdfBytes;
  try {
    pdfBytes = fs.readFileSync(pdfPath);
  } catch (err) {
    throw new Error(`Cannot read PDF file: ${err.message}`);
  }

  // Sentinel error object: lets the catch below recognise a timeout by identity
  // instead of by matching message text.
  const timeoutError = new Error(`PDF extraction timed out after ${timeout / 1000}s`);
  let timer;
  const timeoutPromise = new Promise((_, reject) => {
    timer = setTimeout(() => reject(timeoutError), timeout);
  });

  let pdfDoc;
  try {
    pdfDoc = await Promise.race([
      PDFDocument.load(pdfBytes, { ignoreEncryption: true }),
      timeoutPromise,
    ]);
  } catch (err) {
    if (err === timeoutError) {
      throw err;
    }
    throw new Error(`Invalid or corrupted PDF file: ${err?.message ?? err}`, { cause: err });
  } finally {
    // Always cancel the timer; a pending timer would otherwise keep the
    // process alive until the timeout elapses even after a successful load.
    clearTimeout(timer);
  }

  const { context } = pdfDoc;
  // Read a dictionary entry by name, resolving it if it is an indirect reference.
  const readEntry = (dict, name) => context.lookup(dict.get(context.obj(name)));
  // Convert a PDF array of numbers into a plain number array (undefined if not an array).
  const toNumbers = (obj) => obj?.asArray?.()?.map((n) => n?.asNumber?.() || 0);

  const annotations = [];

  for (const [pageIndex, page] of pdfDoc.getPages().entries()) {
    const annots = page.node.Annots();
    if (!annots) continue;

    for (const annotRef of annots.asArray()) {
      try {
        // Entries of the annotation array may be indirect references, which
        // carry no dictionary API of their own; resolve them first.
        const resolved = context.lookup(annotRef);
        const annot = resolved?.dict || resolved;
        if (!annot) continue;

        const subtype = readEntry(annot, 'Subtype')?.toString?.()?.replace('/', '') || '';
        if (!COMMENT_TYPES.includes(subtype)) continue;

        // Comment text, author (T field) and modification date (M field)
        const contents = decodePdfTextObject(readEntry(annot, 'Contents'));
        const author = decodePdfTextObject(readEntry(annot, 'T')) || 'Unknown';
        const date = parsePdfDate(decodePdfTextObject(readEntry(annot, 'M')));

        // Position on page, and the text bounds for markup annotations
        const rect = toNumbers(readEntry(annot, 'Rect')) || [0, 0, 0, 0];
        const quadPoints = toNumbers(readEntry(annot, 'QuadPoints')) || [];

        // Skip empty annotations (a StrikeOut is meaningful without a comment)
        if (!contents && subtype !== 'StrikeOut') continue;

        annotations.push({
          type: subtype,
          page: pageIndex + 1,
          contents,
          author,
          date,
          rect,
          quadPoints,
        });
      } catch {
        // Best effort: skip malformed annotations and keep processing the rest.
        continue;
      }
    }
  }

  // Sort by page, then by vertical position (higher Y = higher on page in PDF coords)
  annotations.sort((a, b) => {
    if (a.page !== b.page) return a.page - b.page;
    return (b.rect[1] || 0) - (a.rect[1] || 0);
  });

  return annotations;
}
135
+
136
/**
 * Parse a PDF date string (D:YYYYMMDDHHmmSS format) into a calendar date.
 *
 * Accepts the value with or without the "D:" prefix, and with or without the
 * surrounding parenthesis of a serialized PDF literal string, e.g.
 * "(D:20240115103000Z)". Time and timezone parts are ignored.
 *
 * @param {string} dateStr
 * @returns {string} Date as YYYY-MM-DD, or '' when the input does not start
 *   with a full eight-digit date
 */
function parsePdfDate(dateStr) {
  if (!dateStr) return '';

  // Require real digits so arbitrary text is never sliced into a bogus date.
  const match = /^\s*\(?\s*(?:D:)?(\d{4})(\d{2})(\d{2})/.exec(String(dateStr));
  if (!match) return '';

  const [, year, month, day] = match;
  return `${year}-${month}-${day}`;
}
156
+
157
/**
 * Clean a serialized PDF literal string: remove the wrapping parentheses and
 * decode backslash escape sequences.
 *
 * Escapes are decoded in a single pass so that an escaped backslash followed
 * by a letter (e.g. `\\n`) is not mistaken for a different escape.
 * Mapping: `\n` -> newline, `\r` -> removed, `\t` -> space, `\(` `\)` `\\` ->
 * the literal character, `\ddd` (octal) -> the character with that code.
 *
 * @param {string} str
 * @returns {string} Decoded, trimmed text ('' for empty input)
 */
function cleanPdfString(str) {
  if (!str) return '';

  const simpleEscapes = new Map([
    ['n', '\n'],
    ['r', ''],
    ['t', ' '],
    ['(', '('],
    [')', ')'],
    ['\\', '\\'],
  ]);

  return str
    .replace(/^\(/, '') // Remove leading paren
    .replace(/\)$/, '') // Remove trailing paren
    .replace(/\\([nrt()\\]|[0-7]{1,3})/g, (_, esc) => {
      const simple = simpleEscapes.get(esc);
      if (simple !== undefined) return simple;
      // Octal character code; only the low byte is significant.
      return String.fromCharCode(Number.parseInt(esc, 8) & 0xff);
    })
    .trim();
}
176
+
177
/**
 * Reduce raw PDF annotations to comment records.
 * Annotations whose contents are blank are dropped; a missing author
 * is reported as 'Reviewer'.
 * @param {Array} annotations - From extractPdfAnnotations
 * @returns {Array<{author: string, text: string, page: number, type: string, date: string}>}
 */
export function annotationsToComments(annotations) {
  const comments = [];

  for (const { author, contents, page, type, date } of annotations) {
    if (!contents.trim()) continue;

    comments.push({
      author: author || 'Reviewer',
      text: contents,
      page,
      type,
      date,
    });
  }

  return comments;
}
193
+
194
/**
 * Extract comments from a PDF and format them for display.
 * Runs extractPdfAnnotations and converts the result with annotationsToComments.
 * @param {string} pdfPath
 * @param {object} [options] - Forwarded unchanged to extractPdfAnnotations
 *   (e.g. `{ timeout: number (ms) }`); defaults to `{}`
 * @returns {Promise<Array<{author: string, text: string, page: number, type: string, date: string}>>}
 */
export async function extractPdfComments(pdfPath, options = {}) {
  const annotations = await extractPdfAnnotations(pdfPath, options);
  return annotationsToComments(annotations);
}
203
+
204
/**
 * Append PDF comments to markdown as CriticMarkup comments.
 *
 * PDF positions cannot be mapped reliably onto markdown, so every comment is
 * appended after the document under a `<!-- PDF Comments -->` marker, tagged
 * with its page number. Comments are grouped by page, in order of each page's
 * first appearance in `comments`.
 *
 * @param {string} markdown - The markdown content
 * @param {Array} comments - Comments from extractPdfComments
 * @param {object} options - { sectionPerPage: boolean } (not read by this function)
 * @returns {string} Markdown with comments appended; unchanged when there are no comments
 */
export function insertPdfCommentsIntoMarkdown(markdown, comments, options = {}) {
  if (comments.length === 0) return markdown;

  // Bucket comments per page; Map iteration keeps first-seen page order.
  const buckets = new Map();
  for (const comment of comments) {
    const bucket = buckets.get(comment.page);
    if (bucket) {
      bucket.push(comment);
    } else {
      buckets.set(comment.page, [comment]);
    }
  }

  const rendered = [...buckets].flatMap(([page, bucket]) =>
    bucket.map(({ author, text }) => {
      const who = author ? `${author}: ` : '';
      return `{>>${who}[p.${page}] ${text}<<}`;
    }),
  );

  return [markdown, '<!-- PDF Comments -->', ...rendered].join('\n');
}
244
+
245
/**
 * Render PDF comments as plain text for the CLI.
 * A `Page N:` heading is emitted whenever the page changes from the previous
 * comment, with a blank line between page groups.
 * @param {Array} comments
 * @returns {string}
 */
export function formatPdfComments(comments) {
  if (comments.length === 0) return 'No comments found in PDF.';

  const output = [];
  let lastPage = 0;

  for (const { page, type, author, text } of comments) {
    const startsNewPage = page !== lastPage;
    if (startsNewPage) {
      // Separate page groups with a blank line (not before the first one).
      if (lastPage > 0) output.push('');
      output.push(`Page ${page}:`);
      lastPage = page;
    }

    output.push(` ${getTypeIcon(type)} [${author || 'Unknown'}] ${text}`);
  }

  return output.join('\n');
}
272
+
273
/** Display icon for each known annotation type. */
const ANNOTATION_TYPE_ICONS = new Map([
  ['Text', '📝'], // Sticky note
  ['FreeText', '💬'], // Text box
  ['Highlight', '🖍️'], // Highlight
  ['Underline', '📍'], // Underline
  ['StrikeOut', '❌'], // Strikethrough
  ['Squiggly', '〰️'], // Squiggly
]);

/**
 * Look up the display icon for an annotation type.
 * @param {string} type
 * @returns {string} The icon for the type, or '💬' for any unlisted type
 */
function getTypeIcon(type) {
  return ANNOTATION_TYPE_ICONS.get(type) ?? '💬';
}
289
+
290
/**
 * Get statistics about PDF comments.
 *
 * Counts comments in total and per type, author and page. Keys come from PDF
 * content, so counters only ever read and write own properties: a key such as
 * "constructor" or "__proto__" is counted like any other instead of colliding
 * with a member inherited from Object.prototype.
 *
 * @param {Array} comments
 * @returns {{total: number, byType: object, byAuthor: object, byPage: object}}
 */
export function getPdfCommentStats(comments) {
  // Increment counter[key], treating inherited properties as absent.
  const bump = (counter, key) => {
    const previous = Object.prototype.hasOwnProperty.call(counter, key) ? counter[key] : 0;
    // defineProperty (not assignment) so "__proto__" becomes a normal own key.
    Object.defineProperty(counter, key, {
      value: previous + 1,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  };

  const stats = {
    total: comments.length,
    byType: {},
    byAuthor: {},
    byPage: {},
  };

  for (const { type, author, page } of comments) {
    bump(stats.byType, type);
    bump(stats.byAuthor, author);
    bump(stats.byPage, page);
  }

  return stats;
}
311
+
312
/**
 * Read the text items of a PDF page as flat records.
 * `x` and `y` are taken from entries 4 and 5 of each item's `transform`
 * array (presumably the translation part of the transform matrix — confirm
 * against the pdfjs text-content documentation).
 * @param {object} page - pdfjs page object
 * @returns {Promise<Array<{str: string, x: number, y: number, width: number, height: number}>>}
 */
async function getPageTextItems(page) {
  const { items } = await page.getTextContent();

  return items.map(({ str, transform, width, height }) => {
    const x = transform[4];
    const y = transform[5];
    return { str, x, y, width, height };
  });
}
327
+
328
/**
 * Test whether a point lies within the bounding box of a quadrilateral.
 * The quad is given as [x1,y1, x2,y2, x3,y3, x4,y4]; only its axis-aligned
 * bounding box is checked, edges included.
 * @param {number} x
 * @param {number} y
 * @param {number[]} quad - 8 numbers defining corners
 * @returns {boolean} false when fewer than 8 numbers are supplied
 */
function isPointInQuad(x, y, quad) {
  if (quad.length < 8) return false;

  let left = Infinity;
  let right = -Infinity;
  let bottom = Infinity;
  let top = -Infinity;

  // Walk the four (x, y) corner pairs and grow the bounding box.
  for (let i = 0; i < 8; i += 2) {
    left = Math.min(left, quad[i]);
    right = Math.max(right, quad[i]);
    bottom = Math.min(bottom, quad[i + 1]);
    top = Math.max(top, quad[i + 1]);
  }

  const withinX = left <= x && x <= right;
  const withinY = bottom <= y && y <= top;
  return withinX && withinY;
}
349
+
350
/**
 * Extract the text lying under text-markup annotations using their QuadPoints.
 *
 * Every input annotation is returned with an added `highlightedText` field.
 * The field is '' for annotations that are not Highlight / Underline /
 * StrikeOut / Squiggly, that have fewer than 8 quad point numbers, or whose
 * page text could not be read. Text items of a page are read once and reused
 * for all annotations on that page, and the pdfjs document is released when
 * extraction finishes.
 *
 * @param {string} pdfPath - Path to PDF file
 * @param {Array} annotations - Annotations with quadPoints from extractPdfAnnotations
 * @returns {Promise<Array<object>>} Copies of the annotations, each with `highlightedText: string`
 */
export async function extractHighlightedText(pdfPath, annotations) {
  const data = new Uint8Array(fs.readFileSync(pdfPath));

  // Load pdfjs-dist dynamically (requires DOMMatrix, not available in Node 18)
  const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs');
  const loadingTask = getDocument({ data, useSystemFonts: true });

  const markupTypes = ['Highlight', 'Underline', 'StrikeOut', 'Squiggly'];
  const results = [];

  try {
    const pdfDoc = await loadingTask.promise;

    // Page number -> promise of that page's text items, so each page is parsed once.
    const textItemsByPage = new Map();
    const loadTextItems = (pageNumber) => {
      if (!textItemsByPage.has(pageNumber)) {
        textItemsByPage.set(
          pageNumber,
          (async () => getPageTextItems(await pdfDoc.getPage(pageNumber)))(),
        );
      }
      return textItemsByPage.get(pageNumber);
    };

    for (const annot of annotations) {
      const isMarkup = markupTypes.includes(annot.type);
      if (!isMarkup || !annot.quadPoints || annot.quadPoints.length < 8) {
        results.push({ ...annot, highlightedText: '' });
        continue;
      }

      try {
        const textItems = await loadTextItems(annot.page);

        // Split quadPoints into individual quads (8 numbers each)
        const quads = [];
        for (let i = 0; i < annot.quadPoints.length; i += 8) {
          quads.push(annot.quadPoints.slice(i, i + 8));
        }

        // Keep items whose center or origin falls within any of the quads
        const matchedText = textItems
          .filter((item) => {
            const centerX = item.x + (item.width || 0) / 2;
            const centerY = item.y + (item.height || 0) / 2;
            return quads.some(
              (quad) => isPointInQuad(centerX, centerY, quad) || isPointInQuad(item.x, item.y, quad),
            );
          })
          .map((item) => item.str);

        results.push({
          ...annot,
          highlightedText: matchedText.join(' ').trim(),
        });
      } catch {
        // Best effort: if text extraction fails for this annotation, report no text.
        results.push({ ...annot, highlightedText: '' });
      }
    }
  } finally {
    try {
      // Release the pdfjs document and its worker resources.
      await loadingTask.destroy();
    } catch {
      // A cleanup failure must not mask the extraction result or its error.
    }
  }

  return results;
}
416
+
417
/**
 * Extract annotations together with the text under them in one call.
 * Runs extractPdfAnnotations, then extractHighlightedText on the same file.
 * @param {string} pdfPath
 * @param {object} [options] - Forwarded unchanged to extractPdfAnnotations
 *   (e.g. `{ timeout: number (ms) }`); defaults to `{}`
 * @returns {Promise<Array>} Annotations, each with an added `highlightedText` field
 */
export async function extractPdfAnnotationsWithText(pdfPath, options = {}) {
  const annotations = await extractPdfAnnotations(pdfPath, options);
  return extractHighlightedText(pdfPath, annotations);
}
426
+
427
/**
 * Render one annotation as a single display line:
 * icon and author, then the quoted highlighted text (if any), then the
 * comment contents after an arrow (if any), separated by single spaces.
 * @param {object} annot - Annotation with highlightedText
 * @returns {string}
 */
export function formatAnnotationWithText(annot) {
  const { type, author, highlightedText, contents } = annot;

  const header = `${getTypeIcon(type)} [${author || 'Unknown'}]`;
  const quoted = highlightedText ? `"${highlightedText}"` : '';
  const remark = contents ? `→ ${contents}` : '';

  // Drop the optional segments that are absent before joining.
  return [header, quoted, remark].filter(Boolean).join(' ');
}