@sylphx/pdf-reader-mcp 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -4
- package/dist/index.js +1964 -1669
- package/package.json +8 -3
package/dist/index.js
CHANGED
|
@@ -1,10 +1,24 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
+
import { createRequire as createRequire2 } from "node:module";
|
|
4
5
|
import { createServer, http, stdio } from "@sylphx/mcp-server-sdk";
|
|
5
6
|
|
|
6
|
-
// src/handlers/
|
|
7
|
-
import {
|
|
7
|
+
// src/handlers/inspectPdf.ts
|
|
8
|
+
import { text, tool, toolError } from "@sylphx/mcp-server-sdk";
|
|
9
|
+
|
|
10
|
+
// src/pdf/inspector.ts
|
|
11
|
+
import { OPS as OPS2 } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
12
|
+
|
|
13
|
+
// src/utils/errors.ts
|
|
14
|
+
/**
 * Error raised for PDF-processing failures, tagged with a caller-supplied code.
 */
class PdfError extends Error {
  /** Code passed to the constructor, stored verbatim. */
  code;

  /**
   * @param {*} code - Identifier stored on `this.code`.
   * @param {string} message - Human-readable description.
   * @param {{ cause?: unknown }} [options] - Optional settings; `cause` is forwarded to `Error`.
   */
  constructor(code, message, options) {
    // Forward the cause whenever one was supplied. A truthiness check would
    // silently discard legitimate falsy causes such as 0, "" or false.
    super(message, options?.cause !== undefined ? { cause: options.cause } : undefined);
    this.code = code;
    this.name = "PdfError";
  }
}
|
|
8
22
|
|
|
9
23
|
// src/utils/logger.ts
|
|
10
24
|
class Logger {
|
|
@@ -83,15 +97,30 @@ var createLogger = (component, minLevel) => {
|
|
|
83
97
|
};
|
|
84
98
|
var logger = new Logger("", 2 /* WARN */);
|
|
85
99
|
|
|
86
|
-
// src/pdf/
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
var
|
|
90
|
-
var
|
|
91
|
-
var
|
|
92
|
-
var
|
|
100
|
+
// src/pdf/extractor.ts
|
|
101
|
+
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
102
|
+
import { PNG } from "pngjs";
|
|
103
|
+
// Logger used by the extractor helpers for non-fatal warnings.
var logger2 = createLogger("Extractor");

// splitTextPartsIntoSegments starts a new segment when the horizontal distance
// between a part and the right edge seen so far exceeds this value.
var TEXT_SEGMENT_GAP_THRESHOLD = 48;

// findVerticalColumnCut: absolute lower bound for the gap accepted as a column gutter.
var COLUMN_CUT_MIN_GAP = 48;

// findVerticalColumnCut: the gutter must also be at least this fraction of the content width.
var COLUMN_CUT_MIN_WIDTH_RATIO = 0.12;

// findVerticalColumnCut: items at least this fraction of the content width are
// left out of the column analysis.
var SPANNING_WIDTH_RATIO = 0.72;
|
|
108
|
+
/**
 * Computes the smallest box that encloses every defined box in `boxes`.
 *
 * Runs in a single pass. Spreading mapped arrays into `Math.min`/`Math.max`
 * throws a RangeError once the input exceeds the engine's argument limit and
 * allocates four temporary arrays.
 *
 * @param {Array<{left: number, bottom: number, right: number, top: number} | undefined | null>} boxes
 * @returns {{left: number, bottom: number, right: number, top: number} | undefined}
 *   The enclosing box, or `undefined` when no usable box is present.
 */
var mergeBoundingBoxes = (boxes) => {
  let found = false;
  let left = Number.POSITIVE_INFINITY;
  let bottom = Number.POSITIVE_INFINITY;
  let right = Number.NEGATIVE_INFINITY;
  let top = Number.NEGATIVE_INFINITY;
  for (const box of boxes) {
    if (box === undefined || box === null) {
      continue;
    }
    found = true;
    // Pairwise Math.min/Math.max keeps the coercion and NaN propagation of the variadic form.
    left = Math.min(left, box.left);
    bottom = Math.min(bottom, box.bottom);
    right = Math.max(right, box.right);
    top = Math.max(top, box.top);
  }
  return found ? { left, bottom, right, top } : undefined;
};
|
|
93
119
|
var buildBoundingBox = (x, y, width, height) => {
|
|
94
|
-
if (
|
|
120
|
+
if (x === undefined || y === undefined || width === undefined || height === undefined) {
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
123
|
+
if (![x, y, width, height].every(Number.isFinite)) {
|
|
95
124
|
return;
|
|
96
125
|
}
|
|
97
126
|
return {
|
|
@@ -101,1858 +130,2122 @@ var buildBoundingBox = (x, y, width, height) => {
|
|
|
101
130
|
top: y + Math.max(0, height)
|
|
102
131
|
};
|
|
103
132
|
};
|
|
104
|
-
var
|
|
105
|
-
if (
|
|
133
|
+
/**
 * Turns a rectangle given as `[x1, y1, x2, y2]` into a normalized bounding box.
 *
 * The two corners may arrive in either order; the result always has
 * `left <= right` and `bottom <= top`.
 *
 * @param {ArrayLike<number> | undefined | null} rect - Rectangle coordinates; only the first four are used.
 * @returns {{left: number, bottom: number, right: number, top: number} | undefined}
 *   `undefined` when `rect` is missing, has fewer than four entries, or holds a non-finite coordinate.
 */
var buildRectBoundingBox = (rect) => {
  if (!rect || rect.length < 4) {
    return undefined;
  }
  const [xStart, yStart, xEnd, yEnd] = rect;
  const coordinates = [xStart, yStart, xEnd, yEnd];
  const isUsable = (coordinate) => coordinate !== undefined && Number.isFinite(coordinate);
  if (!coordinates.every(isUsable)) {
    return undefined;
  }
  const horizontal = [xStart, xEnd];
  const vertical = [yStart, yEnd];
  return {
    left: Math.min(...horizontal),
    bottom: Math.min(...vertical),
    right: Math.max(...horizontal),
    top: Math.max(...vertical),
  };
};
|
|
114
|
-
var
|
|
115
|
-
|
|
116
|
-
const
|
|
117
|
-
|
|
118
|
-
const textItem = item;
|
|
119
|
-
if (!textItem.str.trim())
|
|
120
|
-
continue;
|
|
121
|
-
if (!textItem.transform || textItem.transform.length < 6)
|
|
122
|
-
continue;
|
|
123
|
-
const x = textItem.transform[4];
|
|
124
|
-
const y = textItem.transform[5];
|
|
125
|
-
if (x === undefined || y === undefined)
|
|
126
|
-
continue;
|
|
127
|
-
const height = textItem.height ?? Math.abs(textItem.transform[3] ?? 0);
|
|
128
|
-
items.push({
|
|
129
|
-
text: textItem.str,
|
|
130
|
-
x,
|
|
131
|
-
y,
|
|
132
|
-
width: textItem.width ?? textItem.str.length * 6,
|
|
133
|
-
...height > 0 ? { height } : {},
|
|
134
|
-
...height > 0 ? {
|
|
135
|
-
bounding_box: buildBoundingBox(x, y, textItem.width ?? textItem.str.length * 6, height)
|
|
136
|
-
} : {}
|
|
137
|
-
});
|
|
138
|
-
}
|
|
139
|
-
return items;
|
|
147
|
+
/**
 * Reports whether `value` is a number other than NaN and ±Infinity.
 * `Number.isFinite` does not coerce, so non-number values are rejected.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
var finiteNumber = (value) => Number.isFinite(value);
|
|
148
|
+
/**
 * Picks the text of an annotation field that may be supplied either directly
 * or wrapped in an object with a `str` property.
 *
 * The direct value wins when it is a non-blank string; otherwise
 * `objectValue.str` is used when that is a non-blank string. Previously a blank
 * direct string hid a usable `objectValue.str` (because `??` only skips
 * null/undefined), and a non-string value threw on `.trim()`.
 *
 * @param {string | null | undefined} direct - Plain string form of the field.
 * @param {{ str?: string } | null | undefined} objectValue - Object form of the field.
 * @returns {string | undefined} The chosen text, untrimmed, or `undefined` when neither form has visible text.
 */
var textFromAnnotationField = (direct, objectValue) => {
  const candidates = [direct, objectValue?.str];
  return candidates.find((candidate) => typeof candidate === "string" && candidate.trim().length > 0);
};
|
|
141
|
-
var
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
const
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
152
|
+
/**
 * Produces a trimmed-down copy of an outline tree.
 *
 * Entries whose title is missing or blank are dropped together with their
 * children. Kept entries carry the trimmed title plus whichever of `bold`,
 * `italic`, `color`, `url`, `dest` and non-empty `items` are present.
 *
 * @param {Array<object>} items - Outline entries; nested entries live under `items`.
 * @returns {Array<object>} Sanitized entries in the original order.
 */
var sanitizeOutlineItems = (items) => items.flatMap((entry) => {
  const title = entry.title?.trim();
  if (!title) {
    return [];
  }
  const nested = entry.items ? sanitizeOutlineItems(entry.items) : undefined;
  const node = { title };
  if (entry.bold !== undefined) {
    node.bold = entry.bold;
  }
  if (entry.italic !== undefined) {
    node.italic = entry.italic;
  }
  if (entry.color) {
    // Copy into a plain array so typed-array colors become ordinary arrays.
    node.color = Array.from(entry.color);
  }
  if (entry.url) {
    node.url = entry.url;
  }
  if (entry.dest !== undefined) {
    node.dest = entry.dest;
  }
  if (nested && nested.length > 0) {
    node.items = nested;
  }
  return [node];
});
|
|
167
|
+
// Lookup from numeric permission value to the label reported in tool output.
var PDF_PERMISSION_LABELS = new Map()
  .set(4, "print")
  .set(8, "modify")
  .set(16, "copy")
  .set(32, "annotate")
  .set(256, "fill_forms")
  .set(512, "copy_for_accessibility")
  .set(1024, "assemble")
  .set(2048, "print_high_quality");

/**
 * Maps numeric permission values to their labels.
 *
 * @param {number[]} permissions - Permission values to translate.
 * @returns {string[]} One label per input value; unmapped values become `unknown:<value>`.
 */
var permissionLabels = (permissions) => permissions.map((flag) => {
  const label = PDF_PERMISSION_LABELS.get(flag);
  return label ?? `unknown:${String(flag)}`;
});
|
|
178
|
+
/**
 * Determines the size of an attachment payload.
 *
 * A numeric `byteLength` is preferred; a numeric `length` is the fallback.
 *
 * @param {object | null | undefined} content - Attachment payload (e.g. a typed array or array).
 * @returns {number | undefined} The size, or `undefined` when `content` is falsy or exposes neither property as a number.
 */
var attachmentSize = (content) => {
  if (!content) {
    return undefined;
  }
  for (const property of ["byteLength", "length"]) {
    if (property in content && typeof content[property] === "number") {
      return content[property];
    }
  }
  return undefined;
};
|
|
168
|
-
var
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
const current = allXPositions[i];
|
|
186
|
-
const previous = allXPositions[i - 1];
|
|
187
|
-
if (current === undefined || previous === undefined)
|
|
188
|
-
continue;
|
|
189
|
-
const gap = current - previous;
|
|
190
|
-
if (gap >= gapThreshold) {
|
|
191
|
-
boundaries.push(current);
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
return boundaries;
|
|
189
|
+
/**
 * Collapses one run of text parts on a line into a single text content item.
 *
 * Geometry comes from the merged bounding box of the parts when one exists.
 * Otherwise width is the sum of part widths and height is the tallest part.
 * Parts without a numeric `width`/`height` count as 0 in that fallback, so the
 * result is never NaN (previously `Math.max(undefined, 0)` produced NaN).
 *
 * @param {number} y - Vertical position stored as `yPosition`.
 * @param {Array<{text: string, x: number, width?: number, height?: number, bounding_box?: object}>} segment
 *   Parts in reading order.
 * @returns {object | null} The content item, or `null` when the joined text is blank.
 */
var textSegmentToContentItem = (y, segment) => {
  const textContent = segment.map((part) => part.text).join("");
  if (!textContent.trim()) {
    return null;
  }
  const boundingBox = mergeBoundingBoxes(segment.map((part) => part.bounding_box));
  const xPosition = boundingBox?.left ?? segment[0]?.x;
  const width = boundingBox !== undefined
    ? boundingBox.right - boundingBox.left
    : segment.reduce((sum, part) => sum + (part.width ?? 0), 0);
  const height = boundingBox !== undefined
    ? boundingBox.top - boundingBox.bottom
    : segment.reduce((tallest, part) => Math.max(tallest, part.height ?? 0), 0);
  return {
    type: "text",
    yPosition: y,
    xPosition,
    width,
    height,
    bounding_box: boundingBox,
    textContent,
  };
};
|
|
196
|
-
var
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
207
|
+
/**
 * Groups the text parts of one line into horizontally contiguous segments.
 *
 * Parts are ordered by `x`. A part opens a new segment when its `x` lies more
 * than TEXT_SEGMENT_GAP_THRESHOLD beyond the furthest right edge seen so far.
 * The input array is not modified.
 *
 * @param {Array<{x: number, width: number}>} parts
 * @returns {Array<Array<object>>} Segments from left to right; empty when `parts` is empty.
 */
var splitTextPartsIntoSegments = (parts) => {
  const ordered = [...parts].sort((first, second) => first.x - second.x);
  const groups = [];
  let rightEdge;
  for (const piece of ordered) {
    const beyondGap = rightEdge !== undefined && piece.x - rightEdge > TEXT_SEGMENT_GAP_THRESHOLD;
    if (groups.length === 0 || beyondGap) {
      groups.push([piece]);
    } else {
      groups[groups.length - 1].push(piece);
    }
    // Track the furthest right edge so an overlapping short part cannot pull it back.
    rightEdge = Math.max(rightEdge ?? piece.x, piece.x + piece.width);
  }
  return groups;
};
|
|
205
|
-
var
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
227
|
+
/**
 * Returns a sorted copy of `items`: larger `yPosition` first, ties broken by
 * ascending `xPosition` (a missing `xPosition` counts as 0).
 *
 * @param {Array<{yPosition: number, xPosition?: number}>} items
 * @returns {Array<object>} New array; the input is left untouched.
 */
var sortByYThenX = (items) => {
  const compare = (first, second) => {
    const verticalOrder = second.yPosition - first.yPosition;
    return verticalOrder || (first.xPosition ?? 0) - (second.xPosition ?? 0);
  };
  return [...items].sort(compare);
};
|
|
228
|
+
/**
 * Looks for a vertical gutter that splits the boxed items into two columns.
 *
 * Steps: measure the horizontal extent of all boxed items; keep only items
 * narrower than SPANNING_WIDTH_RATIO of that extent; sweep them left to right
 * and remember the widest uncovered gap; accept the gap's midpoint when the gap
 * is at least max(COLUMN_CUT_MIN_GAP, extent * COLUMN_CUT_MIN_WIDTH_RATIO) and
 * at least two narrow items are centred on each side.
 *
 * @param {Array<{bounding_box?: {left: number, right: number}}>} items
 * @returns {number | undefined} The x coordinate of the cut, or `undefined` when no cut qualifies.
 */
var findVerticalColumnCut = (items) => {
  const withBoxes = items.filter((candidate) => candidate.bounding_box !== undefined);
  if (withBoxes.length < 4) {
    return undefined;
  }

  const leftEdges = withBoxes.map((candidate) => candidate.bounding_box?.left ?? 0);
  const rightEdges = withBoxes.map((candidate) => candidate.bounding_box?.right ?? 0);
  const contentWidth = Math.max(...rightEdges) - Math.min(...leftEdges);
  if (contentWidth <= 0) {
    return undefined;
  }

  const spanningLimit = contentWidth * SPANNING_WIDTH_RATIO;
  const narrow = withBoxes.filter(({ bounding_box: box }) => Boolean(box) && box.right - box.left < spanningLimit);
  if (narrow.length < 4) {
    return undefined;
  }

  const leftOf = (candidate) => candidate.bounding_box?.left ?? 0;
  const byLeftEdge = [...narrow].sort((first, second) => leftOf(first) - leftOf(second));
  let coveredUntil = byLeftEdge[0]?.bounding_box?.right;
  if (coveredUntil === undefined) {
    return undefined;
  }

  let widestGap = 0;
  let cut;
  for (const { bounding_box: box } of byLeftEdge.slice(1)) {
    if (!box) {
      continue;
    }
    const gap = box.left - coveredUntil;
    if (box.left > coveredUntil && gap > widestGap) {
      widestGap = gap;
      cut = (box.left + coveredUntil) / 2;
    }
    // Extend the covered range; an item nested inside a wider one must not shrink it.
    coveredUntil = Math.max(coveredUntil, box.right);
  }
  if (cut === undefined) {
    return undefined;
  }

  const requiredGap = Math.max(COLUMN_CUT_MIN_GAP, contentWidth * COLUMN_CUT_MIN_WIDTH_RATIO);
  if (widestGap < requiredGap) {
    return undefined;
  }

  let leftCount = 0;
  for (const { bounding_box: box } of narrow) {
    if (box && (box.left + box.right) / 2 < cut) {
      leftCount += 1;
    }
  }
  const rightCount = narrow.length - leftCount;
  return leftCount >= 2 && rightCount >= 2 ? cut : undefined;
};
|
|
231
|
-
var
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
278
|
+
/**
 * Orders page content items for reading, aware of a two-column layout.
 *
 * Without a column cut the items are simply sorted by sortByYThenX. With a cut,
 * items are split into left column, right column and "spanning" (no bounding
 * box, or a box that crosses the cut). Output order: spanning items whose top
 * is at or above the highest column item, then the left column, then the right
 * column, then the remaining spanning items — each group sorted by sortByYThenX.
 *
 * @param {Array<{yPosition: number, xPosition?: number, bounding_box?: object}>} items
 * @returns {Array<object>} Items in reading order.
 */
var sortPageContentItems = (items) => {
  const cut = findVerticalColumnCut(items);
  if (cut === undefined) {
    return sortByYThenX(items);
  }

  const columns = { left: [], right: [] };
  const spanning = [];
  for (const item of items) {
    const box = item.bounding_box;
    const crossesCut = box ? box.left < cut && box.right > cut : true;
    if (crossesCut) {
      spanning.push(item);
      continue;
    }
    const midpoint = (box.left + box.right) / 2;
    (midpoint < cut ? columns.left : columns.right).push(item);
  }

  const topOf = (item) => item.bounding_box?.top ?? item.yPosition;
  const boxedColumnItems = [...columns.left, ...columns.right].filter((item) => item.bounding_box);
  const highestColumnTop = boxedColumnItems.length > 0
    ? Math.max(...boxedColumnItems.map(topOf))
    : Number.POSITIVE_INFINITY;
  const aboveColumns = spanning.filter((item) => topOf(item) >= highestColumnTop);
  const belowColumnTop = spanning.filter((item) => topOf(item) < highestColumnTop);

  return [
    ...sortByYThenX(aboveColumns),
    ...sortByYThenX(columns.left),
    ...sortByYThenX(columns.right),
    ...sortByYThenX(belowColumnTop),
  ];
};
|
|
313
|
+
/**
 * Encodes raw pixel data as a base64 PNG.
 *
 * - 4 channels: the data is copied as-is into the PNG's RGBA buffer.
 * - 3 channels: each RGB triple is written with alpha 255.
 * - 1 channel: each value is replicated to R, G and B with alpha 255.
 * Missing source bytes are read as 0. Any other channel count leaves the
 * buffer created by `new PNG(...)` untouched.
 *
 * @param {ArrayLike<number>} pixelData - Source pixels, one byte per channel.
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {number} channels - Channels per pixel in `pixelData`.
 * @returns {string} Base64-encoded PNG file contents.
 */
var encodePixelsToPNG = (pixelData, width, height, channels) => {
  const png = new PNG({ width, height });
  const pixelCount = width * height;
  switch (channels) {
    case 4:
      png.data = Buffer.from(pixelData);
      break;
    case 3:
      for (let pixel = 0, src = 0, dst = 0; pixel < pixelCount; pixel++, src += 3, dst += 4) {
        png.data[dst] = pixelData[src] ?? 0;
        png.data[dst + 1] = pixelData[src + 1] ?? 0;
        png.data[dst + 2] = pixelData[src + 2] ?? 0;
        png.data[dst + 3] = 255;
      }
      break;
    case 1:
      for (let pixel = 0, dst = 0; pixel < pixelCount; pixel++, dst += 4) {
        const level = pixelData[pixel] ?? 0;
        png.data[dst] = level;
        png.data[dst + 1] = level;
        png.data[dst + 2] = level;
        png.data[dst + 3] = 255;
      }
      break;
    default:
      break;
  }
  return PNG.sync.write(png).toString("base64");
};
|
|
271
|
-
var
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
if (candidateRows.length < MIN_ROWS) {
|
|
275
|
-
return regions;
|
|
339
|
+
/**
 * Expands bit-packed 1-bit grayscale rows into one byte per pixel.
 *
 * Each row occupies `ceil(width / 8)` bytes, most significant bit first; a set
 * bit becomes 255 and a clear bit becomes 0. Missing source bytes read as 0.
 *
 * @param {ArrayLike<number>} packed - Bit-packed pixel rows.
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @returns {Uint8Array} `width * height` grayscale bytes.
 */
var expandOneBitGrayscale = (packed, width, height) => {
  const bytesPerRow = (width + 7) >> 3;
  const expanded = new Uint8Array(width * height);
  for (let row = 0; row < height; row++) {
    for (let column = 0; column < width; column++) {
      const packedByte = packed[row * bytesPerRow + (column >> 3)] ?? 0;
      const bitMask = 0x80 >> (column & 7);
      expanded[row * width + column] = (packedByte & bitMask) !== 0 ? 255 : 0;
    }
  }
  return expanded;
};

/**
 * Converts a decoded image object into the image record returned to callers.
 *
 * `kind` selects the layout: 1 → "grayscale", 3 → "rgba", anything else →
 * "rgb". Kind 1 data shorter than `width * height` bytes is treated as
 * bit-packed (pdf.js `ImageKind.GRAYSCALE_1BPP`) and expanded before encoding.
 * Previously the packed bytes were encoded as if each were one pixel, which
 * produced a corrupted image. Kind 1 data that already holds at least one byte
 * per pixel is passed through unchanged.
 * NOTE(review): a packed image exactly one pixel wide cannot be told apart from
 * 8-bit data by length and still takes the pass-through path.
 *
 * @param {unknown} imageData - Object with `data`, `width`, `height` and `kind`.
 * @param {number} pageNum - Page number stored as `page`.
 * @param {number} arrayIndex - Position stored as `index`.
 * @returns {{page: number, index: number, width: number, height: number, format: string, data: string} | null}
 *   The record with base64 PNG `data`, or `null` when `imageData` is not an object or lacks data/width/height.
 */
var processImageData = (imageData, pageNum, arrayIndex) => {
  if (!imageData || typeof imageData !== "object") {
    return null;
  }
  const img = imageData;
  if (!img.data || !img.width || !img.height) {
    return null;
  }
  const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
  const format = img.kind === 1 ? "grayscale" : img.kind === 3 ? "rgba" : "rgb";
  const looksBitPacked = channels === 1 && img.data.length < img.width * img.height;
  const pixels = looksBitPacked ? expandOneBitGrayscale(img.data, img.width, img.height) : img.data;
  const pngBase64 = encodePixelsToPNG(pixels, img.width, img.height, channels);
  return {
    page: pageNum,
    index: arrayIndex,
    width: img.width,
    height: img.height,
    format,
    data: pngBase64,
  };
};
|
|
359
|
+
/**
 * Fetches the decoded data of a named image object from a page.
 *
 * Lookup order:
 *  1. names starting with "g_": `page.commonObjs.get(name)`, used when truthy;
 *  2. `page.objs.get(name)` synchronously, used when not `undefined`;
 *  3. `page.objs.get(name, callback)`, bounded by a 10 second timeout.
 * Failures in any step are logged as warnings rather than thrown.
 *
 * @param {object} page - Page exposing `commonObjs.get` and `objs.get`.
 * @param {string} imageName - Object name to look up.
 * @param {number} pageNum - Page number, used only in the timeout warning.
 * @returns {Promise<unknown>} The image data, or `null` on timeout or when the callback lookup throws.
 */
var retrieveImageData = async (page, imageName, pageNum) => {
  const describe = (error) => (error instanceof Error ? error.message : String(error));

  if (imageName.startsWith("g_")) {
    try {
      const shared = page.commonObjs.get(imageName);
      if (shared) {
        return shared;
      }
    } catch (error) {
      logger2.warn("Error getting image from commonObjs", { imageName, error: describe(error) });
    }
  }

  try {
    const ready = page.objs.get(imageName);
    if (ready !== undefined) {
      return ready;
    }
  } catch (error) {
    logger2.warn("Sync image get failed, trying async", { imageName, error: describe(error) });
  }

  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    // Single exit point: the first caller wins, the timer is always cleared,
    // and the optional report runs before the promise resolves.
    const settle = (value, report) => {
      if (settled) {
        return;
      }
      settled = true;
      if (timer !== null) {
        clearTimeout(timer);
        timer = null;
      }
      report?.();
      resolve(value);
    };

    timer = setTimeout(() => {
      settle(null, () => logger2.warn("Image extraction timeout", { imageName, pageNum }));
    }, 1e4);

    try {
      page.objs.get(imageName, (imageData) => settle(imageData));
    } catch (error) {
      settle(null, () => logger2.warn("Error in async image get", { imageName, error: describe(error) }));
    }
  });
};
|
|
376
|
-
var
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
if (
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
416
|
+
/**
 * Collects the page count and/or metadata of a loaded PDF document.
 *
 * Metadata failures are logged as warnings and leave the metadata fields unset.
 *
 * @param {object} pdfDocument - Document exposing `numPages` and `getMetadata()`.
 * @param {boolean} includeMetadata - Whether to add `info` and `metadata`.
 * @param {boolean} includePageCount - Whether to add `num_pages`.
 * @returns {Promise<{num_pages?: number, info?: unknown, metadata?: object}>}
 */
var extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
  const output = {};
  if (includePageCount) {
    output.num_pages = pdfDocument.numPages;
  }
  if (!includeMetadata) {
    return output;
  }

  try {
    const { info, metadata } = await pdfDocument.getMetadata();
    if (info !== undefined) {
      output.info = info;
    }
    if (metadata && typeof metadata.getAll === "function") {
      output.metadata = metadata.getAll();
    } else if (metadata && typeof metadata === "object") {
      // No getAll(): copy the object's own enumerable properties instead.
      const record = {};
      for (const key of Object.keys(metadata)) {
        record[key] = metadata[key];
      }
      output.metadata = record;
    }
  } catch (metaError) {
    const message = metaError instanceof Error ? metaError.message : String(metaError);
    logger2.warn("Error extracting metadata", { error: message });
  }
  return output;
};
|
|
398
|
-
var
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
447
|
+
/**
 * Gathers optional document-level structure from a loaded PDF document.
 *
 * Each section is attempted only when its option flag is set and the document
 * exposes the matching method. A failing section is logged as a warning and
 * skipped; it never aborts the other sections. Empty results are omitted.
 *
 * @param {object} pdfDocument - Document that may expose `getOutline`, `getPageLabels`,
 *   `getPermissions`, `getMarkInfo`, `getFieldObjects` and `getAttachments`.
 * @param {{includeOutline?: boolean, includePageLabels?: boolean, includePermissions?: boolean,
 *   includeFormFields?: boolean, includeAttachments?: boolean}} options
 * @returns {Promise<{outline?: object[], page_labels?: unknown[], permissions?: string[],
 *   mark_info?: object, form_fields?: object[], attachments?: object[]}>}
 */
var extractDocumentStructure = async (pdfDocument, options) => {
  const source = pdfDocument;
  const output = {};

  // Runs one section: call the document method, hand its result to `handle`,
  // and downgrade any failure to a warning.
  const attempt = async (enabled, methodName, failureMessage, handle) => {
    if (!enabled || typeof source[methodName] !== "function") {
      return;
    }
    try {
      handle(await source[methodName]());
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger2.warn(failureMessage, { error: message });
    }
  };

  await attempt(options.includeOutline, "getOutline", "Error extracting outline", (outline) => {
    if (outline && outline.length > 0) {
      output.outline = sanitizeOutlineItems(outline);
    }
  });

  await attempt(options.includePageLabels, "getPageLabels", "Error extracting page labels", (pageLabels) => {
    if (pageLabels && pageLabels.length > 0) {
      output.page_labels = pageLabels;
    }
  });

  await attempt(options.includePermissions, "getPermissions", "Error extracting permissions", (permissions) => {
    if (permissions && permissions.length > 0) {
      output.permissions = permissionLabels(permissions);
    }
  });

  // NOTE(review): mark info is gated by includePermissions, not by a flag of its own — confirm this is intended.
  await attempt(options.includePermissions, "getMarkInfo", "Error extracting mark info", (markInfo) => {
    if (markInfo && Object.keys(markInfo).length > 0) {
      output.mark_info = markInfo;
    }
  });

  await attempt(options.includeFormFields, "getFieldObjects", "Error extracting form fields", (fieldObjects) => {
    if (!fieldObjects) {
      return;
    }
    const fields = [];
    for (const [name, fieldOrFields] of Object.entries(fieldObjects)) {
      // A name may map to a single field or to a list of fields.
      const fieldList = Array.isArray(fieldOrFields) ? fieldOrFields : [fieldOrFields];
      fieldList.forEach((field) => {
        const normalized = normalizeFormField(name, field);
        if (normalized !== undefined) {
          fields.push(normalized);
        }
      });
    }
    if (fields.length > 0) {
      output.form_fields = fields;
    }
  });

  await attempt(options.includeAttachments, "getAttachments", "Error extracting attachments", (attachments) => {
    if (!attachments) {
      return;
    }
    const summaries = Object.entries(attachments).map(([name, attachment]) => {
      const size = attachmentSize(attachment.content);
      const summary = { name };
      if (attachment.filename) {
        summary.filename = attachment.filename;
      }
      if (attachment.description) {
        summary.description = attachment.description;
      }
      if (size !== undefined) {
        summary.size_bytes = size;
      }
      return summary;
    });
    if (summaries.length > 0) {
      output.attachments = summaries;
    }
  });

  return output;
};
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
const
|
|
418
|
-
|
|
535
|
+
/**
 * Normalizes one form field object into the shape reported to callers.
 *
 * The name is taken from `field.name`, then `field.fieldName`, then
 * `fallbackName`, and trimmed. The page is `field.page` when defined, otherwise
 * `field.pageIndex + 1`. Optional properties are included only when present.
 *
 * @param {string} fallbackName - Name used when the field carries none.
 * @param {object} field - Raw field object.
 * @returns {object | undefined} The normalized field, or `undefined` when the resolved name is blank.
 */
var normalizeFormField = (fallbackName, field) => {
  const name = (field.name ?? field.fieldName ?? fallbackName).trim();
  if (!name) {
    return undefined;
  }

  let page;
  if (field.page !== undefined) {
    page = field.page;
  } else if (field.pageIndex !== undefined) {
    // pageIndex is zero-based; the reported page is one-based.
    page = field.pageIndex + 1;
  }
  const fieldType = field.type ?? field.fieldType;
  const boundingBox = buildRectBoundingBox(field.rect);

  const normalized = { name };
  if (fieldType) {
    normalized.type = fieldType;
  }
  if (field.value !== undefined) {
    normalized.value = field.value;
  }
  if (field.defaultValue !== undefined) {
    normalized.default_value = field.defaultValue;
  }
  if (page !== undefined) {
    normalized.page = page;
  }
  if (field.id) {
    normalized.id = field.id;
  }
  if (field.editable !== undefined) {
    normalized.editable = field.editable;
  }
  if (field.required !== undefined) {
    normalized.required = field.required;
  }
  if (boundingBox) {
    normalized.bounding_box = boundingBox;
  }
  return normalized;
};
|
|
420
|
-
var
|
|
421
|
-
const
|
|
422
|
-
|
|
423
|
-
|
|
554
|
+
/**
 * Normalizes one page annotation into the shape reported to callers.
 *
 * An annotation carrying none of id, subtype, contents, title, url or dest is
 * discarded even if it has a rectangle. Optional properties are included only
 * when present; `url` falls back to `unsafeUrl`.
 *
 * @param {object} annotation - Raw annotation object.
 * @param {number} pageNum - Page number stored as `page`.
 * @returns {object | undefined} The normalized annotation, or `undefined` when it has no identifying content.
 */
var normalizeAnnotation = (annotation, pageNum) => {
  const contents = textFromAnnotationField(annotation.contents, annotation.contentsObj);
  const title = textFromAnnotationField(annotation.title, annotation.titleObj);
  const boundingBox = buildRectBoundingBox(annotation.rect);
  const subtype = annotation.subtype?.trim();
  const url = annotation.url ?? annotation.unsafeUrl;

  const hasIdentifyingContent =
    [annotation.id, subtype, contents, title, url].some(Boolean) || annotation.dest !== undefined;
  if (!hasIdentifyingContent) {
    return undefined;
  }

  const normalized = { page: pageNum };
  if (annotation.id) {
    normalized.id = annotation.id;
  }
  if (subtype) {
    normalized.subtype = subtype;
  }
  if (contents) {
    normalized.contents = contents;
  }
  if (title) {
    normalized.title = title;
  }
  if (url) {
    normalized.url = url;
  }
  if (annotation.dest !== undefined) {
    normalized.dest = annotation.dest;
  }
  if (boundingBox) {
    normalized.bounding_box = boundingBox;
  }
  return normalized;
};
|
|
433
|
-
var
|
|
434
|
-
|
|
574
|
+
/**
 * Reports whether `value` is a non-null object (arrays included).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
var isRecord = (value) => value !== null && typeof value === "object";
|
|
575
|
+
/**
 * Normalizes a content (leaf) entry of a structure tree.
 *
 * Only string `type` and `id` values are considered, both trimmed. When `type`
 * is blank but `id` is present, the type defaults to "content".
 *
 * @param {{type?: unknown, id?: unknown}} rawContent - Raw leaf entry.
 * @returns {{type: string, id?: string} | undefined} `undefined` when both type and id are blank.
 */
var normalizeStructureTreeContent = (rawContent) => {
  const trimmedString = (candidate) => (typeof candidate === "string" ? candidate.trim() : "");
  const type = trimmedString(rawContent.type);
  const id = trimmedString(rawContent.id);
  if (type === "" && id === "") {
    return undefined;
  }
  const content = { type: type === "" ? "content" : type };
  if (id !== "") {
    content.id = id;
  }
  return content;
};
|
|
464
|
-
var
|
|
465
|
-
if (
|
|
466
|
-
return
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
page,
|
|
470
|
-
content: item.textContent,
|
|
471
|
-
bounding_box: item.bounding_box,
|
|
472
|
-
provenance: {
|
|
473
|
-
engine: "pdfjs",
|
|
474
|
-
source: "text-content"
|
|
475
|
-
},
|
|
476
|
-
...semanticHint ? { semantic_hint: semanticHint } : {}
|
|
477
|
-
};
|
|
478
|
-
}
|
|
479
|
-
if (item.type === "image" && item.imageData) {
|
|
480
|
-
return {
|
|
481
|
-
id: buildElementId(page, "image", index),
|
|
482
|
-
type: "image",
|
|
483
|
-
page,
|
|
484
|
-
image: imageElementMetadata(item.imageData),
|
|
485
|
-
bounding_box: item.bounding_box,
|
|
486
|
-
provenance: {
|
|
487
|
-
engine: "pdfjs",
|
|
488
|
-
source: "image-xobject"
|
|
489
|
-
}
|
|
490
|
-
};
|
|
585
|
+
/**
 * Normalizes one child of a structure tree node.
 *
 * Objects that have a `role` or `children` property are treated as nested
 * nodes; any other object is treated as a content entry.
 *
 * @param {unknown} rawChild - Raw child value.
 * @returns {object | undefined} The normalized child, or `undefined` when `rawChild` is not an object
 *   or the content entry is empty.
 */
var normalizeStructureTreeChild = (rawChild) => {
  if (!isRecord(rawChild)) {
    return undefined;
  }
  const looksLikeNode = "role" in rawChild || "children" in rawChild;
  return looksLikeNode ? normalizeStructureTreeNode(rawChild) : normalizeStructureTreeContent(rawChild);
};
|
|
494
|
-
var
|
|
495
|
-
const
|
|
496
|
-
const
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
tablesByPage.set(table.page, pageTables);
|
|
501
|
-
}
|
|
502
|
-
const appendTableElement = (table) => {
|
|
503
|
-
elements.push({
|
|
504
|
-
id: buildElementId(table.page, "table", table.tableIndex + 1),
|
|
505
|
-
type: "table",
|
|
506
|
-
page: table.page,
|
|
507
|
-
table: {
|
|
508
|
-
rows: table.rows,
|
|
509
|
-
...table.cells ? { cells: table.cells } : {},
|
|
510
|
-
...table.bounding_box ? { bounding_box: table.bounding_box } : {},
|
|
511
|
-
rowCount: table.rowCount,
|
|
512
|
-
colCount: table.colCount,
|
|
513
|
-
confidence: table.confidence
|
|
514
|
-
},
|
|
515
|
-
bounding_box: table.bounding_box,
|
|
516
|
-
confidence: table.confidence,
|
|
517
|
-
provenance: {
|
|
518
|
-
engine: "pdfjs",
|
|
519
|
-
source: "table-detector"
|
|
520
|
-
}
|
|
521
|
-
});
|
|
593
|
+
var normalizeStructureTreeNode = (rawNode) => {
|
|
594
|
+
const role = typeof rawNode.role === "string" && rawNode.role.trim() ? rawNode.role.trim() : "Unknown";
|
|
595
|
+
const children = Array.isArray(rawNode.children) ? rawNode.children.map((child) => normalizeStructureTreeChild(child)).filter((child) => child !== undefined) : [];
|
|
596
|
+
return {
|
|
597
|
+
role,
|
|
598
|
+
...children.length > 0 ? { children } : {}
|
|
522
599
|
};
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
const
|
|
529
|
-
if (
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
if (pageTables) {
|
|
536
|
-
for (const table of pageTables.sort((a, b) => a.tableIndex - b.tableIndex)) {
|
|
537
|
-
appendTableElement(table);
|
|
600
|
+
};
|
|
601
|
+
var extractAnnotations = async (pdfDocument, pagesToProcess) => {
|
|
602
|
+
const pageAnnotations = [];
|
|
603
|
+
for (const pageNum of pagesToProcess) {
|
|
604
|
+
try {
|
|
605
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
606
|
+
if (typeof page.getAnnotations !== "function")
|
|
607
|
+
continue;
|
|
608
|
+
const annotations = await page.getAnnotations({ intent: "display" });
|
|
609
|
+
const normalized = annotations.map((annotation) => normalizeAnnotation(annotation, pageNum)).filter((annotation) => annotation !== undefined);
|
|
610
|
+
if (normalized.length > 0) {
|
|
611
|
+
pageAnnotations.push({ page: pageNum, annotations: normalized });
|
|
538
612
|
}
|
|
539
|
-
|
|
613
|
+
} catch (error) {
|
|
614
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
615
|
+
logger2.warn("Error extracting annotations from page", { pageNum, error: message });
|
|
540
616
|
}
|
|
541
617
|
}
|
|
542
|
-
|
|
543
|
-
for (const table of remainingTables) {
|
|
544
|
-
appendTableElement(table);
|
|
545
|
-
}
|
|
546
|
-
return elements;
|
|
618
|
+
return pageAnnotations;
|
|
547
619
|
};
|
|
548
|
-
var
|
|
549
|
-
const
|
|
550
|
-
for (const
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
if (
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
620
|
+
var extractStructureTrees = async (pdfDocument, pagesToProcess) => {
|
|
621
|
+
const pageStructureTrees = [];
|
|
622
|
+
for (const pageNum of pagesToProcess) {
|
|
623
|
+
try {
|
|
624
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
625
|
+
if (typeof page.getStructTree !== "function")
|
|
626
|
+
continue;
|
|
627
|
+
const rawTree = await page.getStructTree();
|
|
628
|
+
if (!rawTree)
|
|
629
|
+
continue;
|
|
630
|
+
pageStructureTrees.push({
|
|
631
|
+
page: pageNum,
|
|
632
|
+
tree: normalizeStructureTreeNode(rawTree)
|
|
633
|
+
});
|
|
634
|
+
} catch (error) {
|
|
635
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
636
|
+
logger2.warn("Error extracting structure tree", { pageNum, error: message });
|
|
558
637
|
}
|
|
559
|
-
sections.push(pageLines.join(`
|
|
560
|
-
`).trimEnd());
|
|
561
638
|
}
|
|
562
|
-
|
|
563
|
-
sections.push(tablesToMarkdown(tables));
|
|
564
|
-
}
|
|
565
|
-
return sections.join(`
|
|
566
|
-
|
|
567
|
-
`).trim();
|
|
568
|
-
};
|
|
569
|
-
/**
 * Escape the five HTML-significant characters in `value` so it can be placed
 * inside element content or a quoted attribute.
 *
 * `&` is replaced first so that the entities emitted by the later steps are
 * not escaped a second time.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
var escapeHtml = (value) => value.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
|
|
570
|
-
var renderTablesToHtml = (tables) => {
|
|
571
|
-
if (!tables || tables.length === 0)
|
|
572
|
-
return [];
|
|
573
|
-
return tables.map((table) => {
|
|
574
|
-
const rows = table.rows.map((row) => {
|
|
575
|
-
const cells = row.map((cell) => `<td>${escapeHtml(cell)}</td>`).join("");
|
|
576
|
-
return `<tr>${cells}</tr>`;
|
|
577
|
-
}).join(`
|
|
578
|
-
`);
|
|
579
|
-
return [
|
|
580
|
-
`<table data-page="${String(table.page)}" data-table-index="${String(table.tableIndex)}">`,
|
|
581
|
-
"<tbody>",
|
|
582
|
-
rows,
|
|
583
|
-
"</tbody>",
|
|
584
|
-
"</table>"
|
|
585
|
-
].join(`
|
|
586
|
-
`);
|
|
587
|
-
});
|
|
639
|
+
return pageStructureTrees;
|
|
588
640
|
};
|
|
589
|
-
var
|
|
590
|
-
const
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
`<figcaption>Image ${String(item.imageData.index + 1)}: ${String(item.imageData.width)}x${String(item.imageData.height)} ${escapeHtml(item.imageData.format)}</figcaption>`,
|
|
602
|
-
"</figure>"
|
|
603
|
-
].join(`
|
|
604
|
-
`));
|
|
641
|
+
var extractPageGeometry = async (pdfDocument, pagesToProcess) => {
|
|
642
|
+
const pageGeometry = [];
|
|
643
|
+
for (const pageNum of pagesToProcess) {
|
|
644
|
+
try {
|
|
645
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
646
|
+
const viewBox = buildRectBoundingBox(page.view);
|
|
647
|
+
const viewport = page.getViewport({ scale: 1 });
|
|
648
|
+
const width = finiteNumber(viewport.width) ? viewport.width : viewBox ? viewBox.right - viewBox.left : undefined;
|
|
649
|
+
const height = finiteNumber(viewport.height) ? viewport.height : viewBox ? viewBox.top - viewBox.bottom : undefined;
|
|
650
|
+
if (!finiteNumber(width) || !finiteNumber(height)) {
|
|
651
|
+
logger2.warn("Skipping page geometry with invalid dimensions", { pageNum });
|
|
652
|
+
continue;
|
|
605
653
|
}
|
|
654
|
+
pageGeometry.push({
|
|
655
|
+
page: pageNum,
|
|
656
|
+
width,
|
|
657
|
+
height,
|
|
658
|
+
rotation: finiteNumber(page.rotate) ? page.rotate : 0,
|
|
659
|
+
...finiteNumber(page.userUnit) ? { user_unit: page.userUnit } : {},
|
|
660
|
+
...viewBox ? { view_box: viewBox } : {}
|
|
661
|
+
});
|
|
662
|
+
} catch (error) {
|
|
663
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
664
|
+
logger2.warn("Error extracting page geometry", { pageNum, error: message });
|
|
606
665
|
}
|
|
607
|
-
body.push("</section>");
|
|
608
|
-
return body.join(`
|
|
609
|
-
`);
|
|
610
|
-
});
|
|
611
|
-
return [...sections, ...renderTablesToHtml(tables)].join(`
|
|
612
|
-
|
|
613
|
-
`).trim();
|
|
614
|
-
};
|
|
615
|
-
var elementText = (element) => {
|
|
616
|
-
if (element.type === "text")
|
|
617
|
-
return element.content.trim();
|
|
618
|
-
if (element.type === "table") {
|
|
619
|
-
const tableText = element.table.rows.map((row) => row.join(" | ")).join(`
|
|
620
|
-
`).trim();
|
|
621
|
-
return tableText.length > 0 ? tableText : undefined;
|
|
622
666
|
}
|
|
623
|
-
return;
|
|
667
|
+
return pageGeometry;
|
|
624
668
|
};
|
|
625
|
-
var
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
pageStart: element.page,
|
|
629
|
-
pageEnd: element.page,
|
|
630
|
-
textParts: [],
|
|
631
|
-
elementIds: [],
|
|
632
|
-
boundingBoxes: [],
|
|
633
|
-
strategy,
|
|
634
|
-
heading
|
|
635
|
-
});
|
|
636
|
-
var addElementToChunk = (draft, element, textValue) => {
|
|
637
|
-
draft.pageEnd = Math.max(draft.pageEnd, element.page);
|
|
638
|
-
draft.textParts.push(textValue);
|
|
639
|
-
draft.elementIds.push(element.id);
|
|
640
|
-
if (element.bounding_box) {
|
|
641
|
-
draft.boundingBoxes.push(element.bounding_box);
|
|
669
|
+
/**
 * Build the warning list for requested page numbers that lie beyond the end
 * of the document.
 *
 * @param {number[]} invalidPages - Requested page numbers that were rejected.
 * @param {number} totalPages - Page count used in the message.
 * @returns {string[]} Empty when nothing was rejected, otherwise one message.
 */
var buildWarnings = (invalidPages, totalPages) =>
  invalidPages.length === 0
    ? []
    : [`Requested page numbers ${invalidPages.join(", ")} exceed total pages (${String(totalPages)}).`];
|
|
644
|
-
var
|
|
645
|
-
const
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
strategy: draft.strategy,
|
|
656
|
-
...draft.heading ? { heading: draft.heading } : {},
|
|
657
|
-
...draft.boundingBoxes.length > 0 ? { bounding_boxes: draft.boundingBoxes } : {}
|
|
658
|
-
};
|
|
659
|
-
};
|
|
660
|
-
var buildCitationChunks = (elements, options) => {
|
|
661
|
-
const maxChars = options.maxChars ?? DEFAULT_CHUNK_MAX_CHARS;
|
|
662
|
-
const chunks = [];
|
|
663
|
-
let current;
|
|
664
|
-
const pushCurrent = () => {
|
|
665
|
-
if (!current)
|
|
666
|
-
return;
|
|
667
|
-
const chunk = finalizeChunk(current, chunks.length + 1);
|
|
668
|
-
if (chunk)
|
|
669
|
-
chunks.push(chunk);
|
|
670
|
-
current = undefined;
|
|
671
|
-
};
|
|
672
|
-
for (const element of elements) {
|
|
673
|
-
const textValue = elementText(element);
|
|
674
|
-
if (!textValue)
|
|
675
|
-
continue;
|
|
676
|
-
const role = elementRole(element);
|
|
677
|
-
const shouldStartSemanticChunk = options.useSemanticBoundaries && role === "heading";
|
|
678
|
-
const shouldStartTableChunk = element.type === "table";
|
|
679
|
-
const exceedsSize = current !== undefined && current.elementIds.length > 0 && chunkTextLength(current) + textValue.length > maxChars;
|
|
680
|
-
const crossesPage = current !== undefined && current.pageEnd !== element.page;
|
|
681
|
-
if (shouldStartSemanticChunk || shouldStartTableChunk || exceedsSize || crossesPage) {
|
|
682
|
-
pushCurrent();
|
|
683
|
-
}
|
|
684
|
-
if (!current) {
|
|
685
|
-
const strategy = shouldStartSemanticChunk ? "semantic" : exceedsSize ? "size" : "page";
|
|
686
|
-
const heading = shouldStartSemanticChunk && element.type === "text" ? element.content.trim() : undefined;
|
|
687
|
-
current = createChunkDraft(element, strategy, heading);
|
|
688
|
-
}
|
|
689
|
-
if (element.type === "table" && current.elementIds.length === 0) {
|
|
690
|
-
current.strategy = "table";
|
|
691
|
-
}
|
|
692
|
-
addElementToChunk(current, element, textValue);
|
|
693
|
-
if (element.type === "table") {
|
|
694
|
-
pushCurrent();
|
|
695
|
-
}
|
|
696
|
-
}
|
|
697
|
-
pushCurrent();
|
|
698
|
-
return chunks;
|
|
699
|
-
};
|
|
700
|
-
var PROMPT_INJECTION_PATTERNS = [
|
|
701
|
-
/\bignore (all )?(previous|prior|above) instructions\b/i,
|
|
702
|
-
/\bdisregard (previous|prior|above) instructions\b/i,
|
|
703
|
-
/\bsystem prompt\b/i,
|
|
704
|
-
/\bdeveloper (message|instruction)s?\b/i,
|
|
705
|
-
/\bdo not (follow|obey) .*instructions\b/i
|
|
706
|
-
];
|
|
707
|
-
/**
 * Produce a short single-line preview of `value`.
 *
 * Whitespace runs are collapsed to one space and the ends trimmed; results
 * longer than 160 characters are cut to 157 characters plus "...".
 *
 * @param {string} value - Source text.
 * @returns {string} Preview of at most 160 characters.
 */
var snippetFromText = (value) => {
  const collapsed = value.replace(/\s+/g, " ").trim();
  if (collapsed.length <= 160) {
    return collapsed;
  }
  return `${collapsed.slice(0, 157)}...`;
};
|
|
711
|
-
/**
 * Report whether `box` lies entirely outside `viewBox`, allowing one unit of
 * slack on every side.
 *
 * @param {{left:number,right:number,top:number,bottom:number}|undefined} box
 * @param {{left:number,right:number,top:number,bottom:number}|undefined} viewBox
 * @returns {boolean} `false` when either rectangle is missing.
 */
var isOutsideViewBox = (box, viewBox) => {
  if (!box || !viewBox) {
    return false;
  }
  const tolerance = 1;
  // Plain "<" / ">" comparisons are kept (no negated forms) so NaN
  // coordinates still evaluate to "not outside".
  const isLeftOfView = box.right < viewBox.left - tolerance;
  const isRightOfView = box.left > viewBox.right + tolerance;
  const isBelowView = box.top < viewBox.bottom - tolerance;
  const isAboveView = box.bottom > viewBox.top + tolerance;
  return isLeftOfView || isRightOfView || isBelowView || isAboveView;
};
|
|
717
|
-
var buildSafetyFindings = (pageContents, pageGeometry) => {
|
|
718
|
-
const findings = [];
|
|
719
|
-
const geometryByPage = new Map(pageGeometry?.map((geometry) => [geometry.page, geometry]));
|
|
720
|
-
for (const pageContent of pageContents) {
|
|
721
|
-
let elementIndex = 1;
|
|
722
|
-
const geometry = geometryByPage.get(pageContent.page);
|
|
723
|
-
for (const item of pageContent.items) {
|
|
724
|
-
const element = contentItemToElement(item, pageContent.page, elementIndex);
|
|
725
|
-
if (!element) {
|
|
677
|
+
var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
|
|
678
|
+
const contentItems = [];
|
|
679
|
+
try {
|
|
680
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
681
|
+
const textContent = await page.getTextContent();
|
|
682
|
+
const textByY = new Map;
|
|
683
|
+
for (const item of textContent.items) {
|
|
684
|
+
const textItem = item;
|
|
685
|
+
const xCoord = textItem.transform?.[4];
|
|
686
|
+
const yCoord = textItem.transform?.[5];
|
|
687
|
+
if (yCoord === undefined)
|
|
726
688
|
continue;
|
|
689
|
+
const y = Math.round(yCoord);
|
|
690
|
+
const width = textItem.width ?? textItem.str.length * 6;
|
|
691
|
+
const height = textItem.height ?? Math.abs(textItem.transform?.[3] ?? 0);
|
|
692
|
+
const boundingBox = buildBoundingBox(xCoord, yCoord, width, height);
|
|
693
|
+
if (!textByY.has(y)) {
|
|
694
|
+
textByY.set(y, []);
|
|
727
695
|
}
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
}
|
|
742
|
-
if (item.height !== undefined && item.height > 0 && item.height < 2) {
|
|
743
|
-
findings.push({
|
|
744
|
-
type: "tiny_text",
|
|
745
|
-
severity: "medium",
|
|
746
|
-
page: pageContent.page,
|
|
747
|
-
element_id: element.id,
|
|
748
|
-
message: "Text is unusually small and may be hidden, decorative, or extraction noise.",
|
|
749
|
-
snippet,
|
|
750
|
-
...element.bounding_box ? { bounding_box: element.bounding_box } : {}
|
|
751
|
-
});
|
|
696
|
+
textByY.get(y)?.push({
|
|
697
|
+
text: textItem.str,
|
|
698
|
+
x: xCoord ?? 0,
|
|
699
|
+
width,
|
|
700
|
+
height,
|
|
701
|
+
bounding_box: boundingBox
|
|
702
|
+
});
|
|
703
|
+
}
|
|
704
|
+
for (const [y, textParts] of textByY.entries()) {
|
|
705
|
+
for (const segment of splitTextPartsIntoSegments(textParts)) {
|
|
706
|
+
const contentItem = textSegmentToContentItem(y, segment);
|
|
707
|
+
if (contentItem) {
|
|
708
|
+
contentItems.push(contentItem);
|
|
752
709
|
}
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
});
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
if (includeImages) {
|
|
713
|
+
const operatorList = await page.getOperatorList();
|
|
714
|
+
const imageIndices = [];
|
|
715
|
+
for (let i = 0;i < operatorList.fnArray.length; i++) {
|
|
716
|
+
const op = operatorList.fnArray[i];
|
|
717
|
+
if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
|
|
718
|
+
imageIndices.push(i);
|
|
763
719
|
}
|
|
764
720
|
}
|
|
765
|
-
|
|
721
|
+
const imagePromises = imageIndices.map(async (imgIndex, arrayIndex) => {
|
|
722
|
+
const argsArray = operatorList.argsArray[imgIndex];
|
|
723
|
+
if (!argsArray || argsArray.length === 0) {
|
|
724
|
+
return null;
|
|
725
|
+
}
|
|
726
|
+
const imageName = argsArray[0];
|
|
727
|
+
let xPosition;
|
|
728
|
+
let yPosition;
|
|
729
|
+
if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
|
|
730
|
+
const transform = argsArray[1];
|
|
731
|
+
const xCoord = transform[4];
|
|
732
|
+
const yCoord = transform[5];
|
|
733
|
+
if (xCoord !== undefined) {
|
|
734
|
+
xPosition = Math.round(xCoord);
|
|
735
|
+
}
|
|
736
|
+
if (yCoord !== undefined) {
|
|
737
|
+
yPosition = Math.round(yCoord);
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
const imageData = await retrieveImageData(page, imageName, pageNum);
|
|
741
|
+
const extractedImage = processImageData(imageData, pageNum, arrayIndex);
|
|
742
|
+
if (extractedImage) {
|
|
743
|
+
const imageBox = buildBoundingBox(xPosition, yPosition, extractedImage.width, extractedImage.height);
|
|
744
|
+
extractedImage.bounding_box = imageBox;
|
|
745
|
+
return {
|
|
746
|
+
type: "image",
|
|
747
|
+
yPosition: imageBox?.top ?? yPosition ?? 0,
|
|
748
|
+
xPosition,
|
|
749
|
+
width: extractedImage.width,
|
|
750
|
+
height: extractedImage.height,
|
|
751
|
+
bounding_box: imageBox,
|
|
752
|
+
imageData: extractedImage
|
|
753
|
+
};
|
|
754
|
+
}
|
|
755
|
+
return null;
|
|
756
|
+
});
|
|
757
|
+
const resolvedImages = await Promise.all(imagePromises);
|
|
758
|
+
const validImages = resolvedImages.filter((item) => item !== null);
|
|
759
|
+
contentItems.push(...validImages);
|
|
766
760
|
}
|
|
761
|
+
} catch (error) {
|
|
762
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
763
|
+
logger2.warn("Error extracting page content", {
|
|
764
|
+
pageNum,
|
|
765
|
+
sourceDescription,
|
|
766
|
+
error: message
|
|
767
|
+
});
|
|
768
|
+
return [
|
|
769
|
+
{
|
|
770
|
+
type: "text",
|
|
771
|
+
yPosition: 0,
|
|
772
|
+
textContent: `[Error processing page ${String(pageNum)}]`
|
|
773
|
+
}
|
|
774
|
+
];
|
|
767
775
|
}
|
|
768
|
-
return
|
|
776
|
+
return sortPageContentItems(contentItems);
|
|
769
777
|
};
|
|
770
778
|
|
|
771
|
-
// src/pdf/
|
|
772
|
-
import
|
|
773
|
-
import {
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
779
|
+
// src/pdf/loader.ts
|
|
780
|
+
import fs3 from "node:fs/promises";
|
|
781
|
+
import { createRequire } from "node:module";
|
|
782
|
+
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
783
|
+
|
|
784
|
+
// src/utils/config.ts
|
|
785
|
+
import dns from "node:dns";
|
|
786
|
+
import fs from "node:fs";
|
|
787
|
+
import net from "node:net";
|
|
788
|
+
import path from "node:path";
|
|
789
|
+
/**
 * Split `value` on `separators`, trim every piece and drop the empty ones.
 *
 * @param {string} value - Delimited list.
 * @param {string|RegExp} separators - Passed straight to `String.prototype.split`.
 * @returns {string[]} Non-empty, trimmed entries in their original order.
 */
var splitList = (value, separators) => {
  const entries = [];
  for (const piece of value.split(separators)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }
  return entries;
};
|
|
790
|
+
/**
 * Resolve symlinks in `p` via `fs.realpathSync`.
 *
 * When the path (or part of it) does not exist (ENOENT / ENOTDIR), the parent
 * directory is canonicalized recursively and the missing base name is joined
 * back on. Any other error is rethrown unchanged.
 *
 * @param {string} p - Path to canonicalize.
 * @returns {string} Canonical path; `p` itself once the filesystem root is reached.
 */
var canonicalizeDir = (p) => {
  let realPath;
  try {
    realPath = fs.realpathSync(p);
  } catch (err) {
    const isMissing = typeof err === "object" && err !== null && "code" in err && (err.code === "ENOENT" || err.code === "ENOTDIR");
    if (!isMissing) {
      throw err;
    }
    const parent = path.dirname(p);
    // dirname(p) === p means we hit the root; nothing further to resolve.
    return parent === p ? p : path.join(canonicalizeDir(parent), path.basename(p));
  }
  return realPath;
};
|
|
803
|
+
/**
 * Turn configured directory strings into canonical absolute paths.
 *
 * Each entry is normalized, resolved to an absolute path and then passed
 * through `canonicalizeDir`.
 *
 * @param {string[]} values - Directory paths as supplied by the user.
 * @returns {string[]} Canonical absolute directories, in input order.
 */
var parseDirs = (values) => {
  const canonicalDirs = [];
  for (const dir of values) {
    const absolute = path.resolve(path.normalize(dir));
    canonicalDirs.push(canonicalizeDir(absolute));
  }
  return canonicalDirs;
};
|
|
804
|
+
/**
 * Parse a boolean-like string.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 * "false"/"0"/"no"/"off" yield `false`; "true"/"1"/"yes"/"on" yield `true`.
 *
 * @param {string|undefined} value - Raw setting.
 * @param {boolean} fallback - Returned for `undefined` or unrecognized input.
 * @returns {boolean}
 */
var parseBool = (value, fallback) => {
  if (value === undefined) {
    return fallback;
  }
  switch (value.trim().toLowerCase()) {
    case "false":
    case "0":
    case "no":
    case "off":
      return false;
    case "true":
    case "1":
    case "yes":
    case "on":
      return true;
    default:
      return fallback;
  }
};
|
|
814
|
+
/**
 * Extract the security-related flags from a command-line argument list.
 *
 * Recognized arguments: `--allow-dir=<path>` (repeatable),
 * `--allow-host=<host>` (repeatable, lower-cased), `--no-http` and
 * `--allow-private-ips`. Anything else is ignored.
 *
 * @param {string[]} argv - Arguments to scan.
 * @returns {{dirs: string[], hosts: string[], noHttp: boolean, allowPrivateIps: boolean}}
 */
var parseCliFlags = (argv) => {
  const dirPrefix = "--allow-dir=";
  const hostPrefix = "--allow-host=";
  const flags = { dirs: [], hosts: [], noHttp: false, allowPrivateIps: false };
  for (const arg of argv) {
    if (arg.startsWith(dirPrefix)) {
      flags.dirs.push(arg.slice(dirPrefix.length));
      continue;
    }
    if (arg.startsWith(hostPrefix)) {
      flags.hosts.push(arg.slice(hostPrefix.length).toLowerCase());
      continue;
    }
    if (arg === "--no-http") {
      flags.noHttp = true;
      continue;
    }
    if (arg === "--allow-private-ips") {
      flags.allowPrivateIps = true;
    }
  }
  return flags;
};
|
|
832
|
+
/**
 * Read a delimited list from an environment-variable value.
 *
 * @param {string|undefined} raw - Variable value; any falsy value yields `[]`.
 * @param {string|RegExp} separators - Delimiters handed to `splitList`.
 * @param {Function} [transform] - Mapper applied to each entry (identity by default).
 * @returns {Array} Transformed entries.
 */
var envList = (raw, separators, transform = (v) => v) => {
  if (!raw) {
    return [];
  }
  return splitList(raw, separators).map(transform);
};
|
|
833
|
+
/**
 * Build the security configuration from CLI flags and environment variables.
 *
 * - `allowedDirs`: `--allow-dir=` flags followed by `MCP_PDF_ALLOWED_DIRS`,
 *   canonicalized via `parseDirs`; `null` when none are given.
 * - `allowHttp`: `false` when `--no-http` is present, otherwise
 *   `MCP_PDF_ALLOW_HTTP` (default `true`).
 * - `allowedHosts`: `--allow-host=` flags followed by `MCP_PDF_ALLOWED_HOSTS`
 *   (comma-separated, lower-cased); `null` when none are given.
 * - `allowPrivateIps`: `true` when `--allow-private-ips` is present, otherwise
 *   `MCP_PDF_ALLOW_PRIVATE_IPS` (default `false`).
 *
 * @param {string[]} [argv] - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @param {Object} [env] - Environment map; defaults to `process.env`.
 * @returns {{allowedDirs: string[]|null, allowHttp: boolean, allowedHosts: string[]|null, allowPrivateIps: boolean}}
 */
var readSecurityConfig = (argv = process.argv.slice(2), env = process.env) => {
  const cli = parseCliFlags(argv);
  // Splitting on ":" would tear Windows drive letters apart ("C:\pdfs" ->
  // "C", "\pdfs"), so use the platform's PATH-style delimiter (";" on
  // Windows, ":" elsewhere) in addition to ",". POSIX behavior is unchanged.
  const dirSeparators = path.delimiter === ";" ? /[;,]/ : /[:,]/;
  const envDirs = envList(env["MCP_PDF_ALLOWED_DIRS"], dirSeparators);
  const envHosts = envList(env["MCP_PDF_ALLOWED_HOSTS"], /,/, (h) => h.toLowerCase());
  const mergedDirs = [...cli.dirs, ...envDirs];
  const mergedHosts = [...cli.hosts, ...envHosts];
  return {
    allowedDirs: mergedDirs.length > 0 ? parseDirs(mergedDirs) : null,
    allowHttp: cli.noHttp ? false : parseBool(env["MCP_PDF_ALLOW_HTTP"], true),
    allowedHosts: mergedHosts.length > 0 ? mergedHosts : null,
    allowPrivateIps: cli.allowPrivateIps || parseBool(env["MCP_PDF_ALLOW_PRIVATE_IPS"], false)
  };
};
|
|
804
|
-
var
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
if (x1 === undefined || y1 === undefined || x2 === undefined || y2 === undefined || ![x1, y1, x2, y2].every(Number.isFinite)) {
|
|
809
|
-
return;
|
|
846
|
+
var cached = null;
|
|
847
|
+
var getSecurityConfig = () => {
|
|
848
|
+
if (cached === null) {
|
|
849
|
+
cached = readSecurityConfig();
|
|
810
850
|
}
|
|
811
|
-
return
|
|
812
|
-
left: Math.min(x1, x2),
|
|
813
|
-
bottom: Math.min(y1, y2),
|
|
814
|
-
right: Math.max(x1, x2),
|
|
815
|
-
top: Math.max(y1, y2)
|
|
816
|
-
};
|
|
851
|
+
return cached;
|
|
817
852
|
};
|
|
818
|
-
var
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
853
|
+
/**
 * Decide whether `absPath` is inside one of the allowed directories.
 *
 * @param {string} absPath - Path to check; resolved with `path.resolve` first.
 * @param {string[]|null} allowedDirs - `null` means "no restriction"; an empty
 *   array denies everything.
 * @returns {boolean} `true` when the path equals or is nested under an allowed directory.
 */
var isPathAllowed = (absPath, allowedDirs) => {
  if (allowedDirs === null) {
    return true;
  }
  if (allowedDirs.length === 0) {
    return false;
  }
  const normalized = path.resolve(absPath);
  return allowedDirs.some((dir) => {
    const rel = path.relative(dir, normalized);
    if (rel === "") {
      return true;
    }
    // Only a leading ".." *segment* escapes the directory. A plain
    // startsWith("..") would also reject legitimate children whose name merely
    // begins with two dots (e.g. "..drafts/a.pdf").
    if (rel === ".." || rel.startsWith(`..${path.sep}`)) {
      return false;
    }
    // An absolute result means there is no relative route from `dir`
    // (e.g. a different drive on Windows).
    return !path.isAbsolute(rel);
  });
};
|
|
823
|
-
var
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
...item.italic !== undefined ? { italic: item.italic } : {},
|
|
832
|
-
...item.color ? { color: Array.from(item.color) } : {},
|
|
833
|
-
...item.url ? { url: item.url } : {},
|
|
834
|
-
...item.dest !== undefined ? { dest: item.dest } : {},
|
|
835
|
-
...children && children.length > 0 ? { items: children } : {}
|
|
836
|
-
};
|
|
837
|
-
}).filter((item) => item !== undefined);
|
|
838
|
-
var PDF_PERMISSION_LABELS = new Map([
|
|
839
|
-
[4, "print"],
|
|
840
|
-
[8, "modify"],
|
|
841
|
-
[16, "copy"],
|
|
842
|
-
[32, "annotate"],
|
|
843
|
-
[256, "fill_forms"],
|
|
844
|
-
[512, "copy_for_accessibility"],
|
|
845
|
-
[1024, "assemble"],
|
|
846
|
-
[2048, "print_high_quality"]
|
|
847
|
-
]);
|
|
848
|
-
var permissionLabels = (permissions) => permissions.map((permission) => PDF_PERMISSION_LABELS.get(permission) ?? `unknown:${String(permission)}`);
|
|
849
|
-
var attachmentSize = (content) => {
|
|
850
|
-
if (!content)
|
|
851
|
-
return;
|
|
852
|
-
if ("byteLength" in content && typeof content.byteLength === "number") {
|
|
853
|
-
return content.byteLength;
|
|
870
|
+
/**
 * Check a URL against the HTTP policy in `config`.
 *
 * Rejected when HTTP access is disabled, when the string does not parse as a
 * URL, or when the scheme is not http/https. If `config.allowedHosts` is
 * `null` every host passes; otherwise the lower-cased hostname must appear in
 * the list.
 *
 * @param {string} urlString - URL to test.
 * @param {{allowHttp: boolean, allowedHosts: string[]|null}} config
 * @returns {boolean}
 */
var isUrlAllowed = (urlString, config) => {
  if (!config.allowHttp) {
    return false;
  }
  let protocol;
  let hostname;
  try {
    ({ protocol, hostname } = new URL(urlString));
  } catch {
    return false;
  }
  const isWebScheme = protocol === "http:" || protocol === "https:";
  if (!isWebScheme) {
    return false;
  }
  const { allowedHosts } = config;
  return allowedHosts === null || allowedHosts.includes(hostname.toLowerCase());
};
|
|
885
|
+
/**
 * Predicates over the first two octets (a, b) of an IPv4 address. An address
 * is treated as non-public when any predicate matches.
 */
var PRIVATE_IPV4_PREDICATES = [
  (a) => a === 10, // 10.0.0.0/8
  (a, b) => a === 172 && b >= 16 && b <= 31, // 172.16.0.0/12
  (a, b) => a === 192 && b === 168, // 192.168.0.0/16
  (a) => a === 127, // 127.0.0.0/8 (loopback)
  (a, b) => a === 169 && b === 254, // 169.254.0.0/16 (link-local)
  (a) => a === 0, // 0.0.0.0/8
  (a, b) => a === 100 && b >= 64 && b <= 127, // 100.64.0.0/10
  (a) => a >= 224 // 224.0.0.0 and above
];

/**
 * Report whether a dotted-quad IPv4 address is non-public.
 *
 * Fails closed: anything that is not exactly four decimal octets in 0..255 is
 * reported as private. (Previously non-numeric octets parsed to NaN, matched
 * no predicate and were reported as public.)
 *
 * @param {string} ip - Dotted-quad IPv4 address.
 * @returns {boolean} `true` when the address must not be fetched.
 */
var isPrivateIpv4 = (ip) => {
  const octets = ip.split(".");
  if (octets.length !== 4 || !octets.every((octet) => /^\d{1,3}$/.test(octet))) {
    return true;
  }
  const values = octets.map((octet) => Number.parseInt(octet, 10));
  if (values.some((value) => value > 255)) {
    return true;
  }
  const [a, b] = values;
  return PRIVATE_IPV4_PREDICATES.some((pred) => pred(a, b));
};

/**
 * Report whether an IPv6 address is non-public.
 *
 * Covers `::1` and `::`, the `fc`/`fd` prefixes, link-local `fe80::/10`, the
 * `ff` prefix, and IPv4-mapped addresses (`::ffff:a.b.c.d` as well as the hex
 * spelling `::ffff:hhhh:hhhh`), which are judged by their embedded IPv4
 * address. The hex spelling matters because WHATWG `URL` serializes
 * `[::ffff:127.0.0.1]` as `[::ffff:7f00:1]`.
 *
 * @param {string} ip - IPv6 address (without brackets).
 * @returns {boolean} `true` when the address must not be fetched.
 */
var isPrivateIpv6 = (ip) => {
  const lower = ip.toLowerCase();
  if (lower === "::1" || lower === "::") {
    return true;
  }
  if (lower.startsWith("fc") || lower.startsWith("fd")) {
    return true;
  }
  // fe80::/10 spans first groups fe80 through febf, not just "fe80".
  if (/^fe[89ab][0-9a-f]/.test(lower)) {
    return true;
  }
  if (lower.startsWith("ff")) {
    return true;
  }
  if (lower.startsWith("::ffff:")) {
    const tail = lower.slice("::ffff:".length);
    if (net.isIPv4(tail)) {
      return isPrivateIpv4(tail);
    }
    // Hex spelling of a mapped address: two 16-bit groups carrying the four octets.
    const hexGroups = /^([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(tail);
    if (hexGroups) {
      const high = Number.parseInt(hexGroups[1], 16);
      const low = Number.parseInt(hexGroups[2], 16);
      return isPrivateIpv4(`${high >> 8}.${high & 255}.${low >> 8}.${low & 255}`);
    }
  }
  return false;
};
|
|
860
|
-
var
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
const width = boundingBox !== undefined ? boundingBox.right - boundingBox.left : segment.reduce((sum, part) => sum + part.width, 0);
|
|
867
|
-
const height = boundingBox !== undefined ? boundingBox.top - boundingBox.bottom : Math.max(...segment.map((part) => part.height), 0);
|
|
868
|
-
return {
|
|
869
|
-
type: "text",
|
|
870
|
-
yPosition: y,
|
|
871
|
-
xPosition,
|
|
872
|
-
width,
|
|
873
|
-
height,
|
|
874
|
-
bounding_box: boundingBox,
|
|
875
|
-
textContent
|
|
876
|
-
};
|
|
920
|
+
/**
 * Classify an IP literal as non-public.
 *
 * Dispatches on the address family reported by `net.isIP`; input that is not
 * a valid IPv4 or IPv6 literal is treated as private.
 *
 * @param {string} ip - Address to classify.
 * @returns {boolean}
 */
var isPrivateIp = (ip) => {
  switch (net.isIP(ip)) {
    case 4:
      return isPrivateIpv4(ip);
    case 6:
      return isPrivateIpv6(ip);
    default:
      return true;
  }
};
|
|
878
|
-
var
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
let previousRight;
|
|
883
|
-
for (const part of sortedParts) {
|
|
884
|
-
if (previousRight !== undefined && part.x - previousRight > TEXT_SEGMENT_GAP_THRESHOLD) {
|
|
885
|
-
if (currentSegment.length > 0) {
|
|
886
|
-
segments.push(currentSegment);
|
|
887
|
-
}
|
|
888
|
-
currentSegment = [];
|
|
927
|
+
var assertUrlNotPrivate = async (hostname) => {
|
|
928
|
+
if (net.isIP(hostname)) {
|
|
929
|
+
if (isPrivateIp(hostname)) {
|
|
930
|
+
throw new Error(`URL host '${hostname}' resolves to a non-public address (SSRF protection).`);
|
|
889
931
|
}
|
|
890
|
-
|
|
891
|
-
previousRight = Math.max(previousRight ?? part.x, part.x + part.width);
|
|
932
|
+
return;
|
|
892
933
|
}
|
|
893
|
-
|
|
894
|
-
|
|
934
|
+
let addresses;
|
|
935
|
+
try {
|
|
936
|
+
addresses = await dns.promises.lookup(hostname, { all: true });
|
|
937
|
+
} catch {
|
|
938
|
+
throw new Error(`URL host '${hostname}' could not be resolved.`);
|
|
895
939
|
}
|
|
896
|
-
|
|
897
|
-
};
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
return;
|
|
903
|
-
const left = Math.min(...boxedItems.map((item) => item.bounding_box?.left ?? 0));
|
|
904
|
-
const right = Math.max(...boxedItems.map((item) => item.bounding_box?.right ?? 0));
|
|
905
|
-
const pageWidth = right - left;
|
|
906
|
-
if (pageWidth <= 0)
|
|
907
|
-
return;
|
|
908
|
-
const narrowItems = boxedItems.filter((item) => {
|
|
909
|
-
const box = item.bounding_box;
|
|
910
|
-
if (!box)
|
|
911
|
-
return false;
|
|
912
|
-
return box.right - box.left < pageWidth * SPANNING_WIDTH_RATIO;
|
|
913
|
-
});
|
|
914
|
-
if (narrowItems.length < 4)
|
|
915
|
-
return;
|
|
916
|
-
const sorted = [...narrowItems].sort((a, b) => (a.bounding_box?.left ?? 0) - (b.bounding_box?.left ?? 0));
|
|
917
|
-
let currentRight = sorted[0]?.bounding_box?.right;
|
|
918
|
-
if (currentRight === undefined)
|
|
919
|
-
return;
|
|
920
|
-
let largestGap = 0;
|
|
921
|
-
let cutPosition;
|
|
922
|
-
for (let i = 1;i < sorted.length; i++) {
|
|
923
|
-
const box = sorted[i]?.bounding_box;
|
|
924
|
-
if (!box)
|
|
925
|
-
continue;
|
|
926
|
-
if (box.left > currentRight) {
|
|
927
|
-
const gap = box.left - currentRight;
|
|
928
|
-
if (gap > largestGap) {
|
|
929
|
-
largestGap = gap;
|
|
930
|
-
cutPosition = (box.left + currentRight) / 2;
|
|
931
|
-
}
|
|
940
|
+
if (addresses.length === 0) {
|
|
941
|
+
throw new Error(`URL host '${hostname}' resolved to no addresses.`);
|
|
942
|
+
}
|
|
943
|
+
for (const { address } of addresses) {
|
|
944
|
+
if (isPrivateIp(address)) {
|
|
945
|
+
throw new Error(`URL host '${hostname}' resolves to a non-public address (SSRF protection).`);
|
|
932
946
|
}
|
|
933
|
-
currentRight = Math.max(currentRight, box.right);
|
|
934
947
|
}
|
|
935
|
-
if (cutPosition === undefined)
|
|
936
|
-
return;
|
|
937
|
-
const minGap = Math.max(COLUMN_CUT_MIN_GAP, pageWidth * COLUMN_CUT_MIN_WIDTH_RATIO);
|
|
938
|
-
if (largestGap < minGap)
|
|
939
|
-
return;
|
|
940
|
-
const leftCount = narrowItems.filter((item) => {
|
|
941
|
-
const box = item.bounding_box;
|
|
942
|
-
if (!box)
|
|
943
|
-
return false;
|
|
944
|
-
return (box.left + box.right) / 2 < cutPosition;
|
|
945
|
-
}).length;
|
|
946
|
-
const rightCount = narrowItems.length - leftCount;
|
|
947
|
-
return leftCount >= 2 && rightCount >= 2 ? cutPosition : undefined;
|
|
948
948
|
};
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
if (
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
spanning.push(item);
|
|
964
|
-
continue;
|
|
965
|
-
}
|
|
966
|
-
const center = (box.left + box.right) / 2;
|
|
967
|
-
if (center < cutPosition) {
|
|
968
|
-
leftColumn.push(item);
|
|
969
|
-
} else {
|
|
970
|
-
rightColumn.push(item);
|
|
949
|
+
|
|
950
|
+
// src/utils/pathUtils.ts
|
|
951
|
+
import fs2 from "node:fs";
|
|
952
|
+
import path2 from "node:path";
|
|
953
|
+
var PROJECT_ROOT = process.cwd();
|
|
954
|
+
var canonicalize = (p) => {
|
|
955
|
+
try {
|
|
956
|
+
return fs2.realpathSync(p);
|
|
957
|
+
} catch (err) {
|
|
958
|
+
if (typeof err === "object" && err !== null && "code" in err && (err.code === "ENOENT" || err.code === "ENOTDIR")) {
|
|
959
|
+
const parent = path2.dirname(p);
|
|
960
|
+
if (parent === p)
|
|
961
|
+
return p;
|
|
962
|
+
return path2.join(canonicalize(parent), path2.basename(p));
|
|
971
963
|
}
|
|
964
|
+
throw err;
|
|
972
965
|
}
|
|
973
|
-
const columnItems = [...leftColumn, ...rightColumn].filter((item) => item.bounding_box);
|
|
974
|
-
const highestColumnTop = columnItems.length > 0 ? Math.max(...columnItems.map((item) => item.bounding_box?.top ?? item.yPosition)) : Number.POSITIVE_INFINITY;
|
|
975
|
-
const topSpanning = spanning.filter((item) => (item.bounding_box?.top ?? item.yPosition) >= highestColumnTop);
|
|
976
|
-
const remainingSpanning = spanning.filter((item) => (item.bounding_box?.top ?? item.yPosition) < highestColumnTop);
|
|
977
|
-
return [
|
|
978
|
-
...sortByYThenX(topSpanning),
|
|
979
|
-
...sortByYThenX(leftColumn),
|
|
980
|
-
...sortByYThenX(rightColumn),
|
|
981
|
-
...sortByYThenX(remainingSpanning)
|
|
982
|
-
];
|
|
983
966
|
};
|
|
984
|
-
var
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
png.data = Buffer.from(pixelData);
|
|
988
|
-
} else if (channels === 3) {
|
|
989
|
-
for (let i = 0;i < width * height; i++) {
|
|
990
|
-
const srcIdx = i * 3;
|
|
991
|
-
const dstIdx = i * 4;
|
|
992
|
-
png.data[dstIdx] = pixelData[srcIdx] ?? 0;
|
|
993
|
-
png.data[dstIdx + 1] = pixelData[srcIdx + 1] ?? 0;
|
|
994
|
-
png.data[dstIdx + 2] = pixelData[srcIdx + 2] ?? 0;
|
|
995
|
-
png.data[dstIdx + 3] = 255;
|
|
996
|
-
}
|
|
997
|
-
} else if (channels === 1) {
|
|
998
|
-
for (let i = 0;i < width * height; i++) {
|
|
999
|
-
const gray = pixelData[i] ?? 0;
|
|
1000
|
-
const dstIdx = i * 4;
|
|
1001
|
-
png.data[dstIdx] = gray;
|
|
1002
|
-
png.data[dstIdx + 1] = gray;
|
|
1003
|
-
png.data[dstIdx + 2] = gray;
|
|
1004
|
-
png.data[dstIdx + 3] = 255;
|
|
1005
|
-
}
|
|
967
|
+
/**
 * Turn a user-supplied path into a canonical absolute path and enforce the
 * allowed-directory policy.
 *
 * Relative paths are resolved against `PROJECT_ROOT`. The result is
 * canonicalized before it is checked with `isPathAllowed`.
 *
 * @param {string} userPath - Path as given by the caller.
 * @returns {string} Canonical absolute path.
 * @throws {PdfError} -32602 when `userPath` is not a string; -32600 when the
 *   canonical path is outside the allowed directories.
 */
var resolvePath = (userPath) => {
  if (typeof userPath !== "string") {
    throw new PdfError(-32602 /* InvalidParams */, "Path must be a string.");
  }
  const cleaned = path2.normalize(userPath);
  const absolute = path2.isAbsolute(cleaned) ? cleaned : path2.resolve(PROJECT_ROOT, cleaned);
  const canonical = canonicalize(absolute);
  const permitted = isPathAllowed(canonical, getSecurityConfig().allowedDirs);
  if (permitted) {
    return canonical;
  }
  throw new PdfError(-32600 /* InvalidRequest */, `Access denied: path '${userPath}' is outside the allowed directories.`);
};
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
980
|
+
|
|
981
|
+
// src/pdf/loader.ts
|
|
982
|
+
/** Logger for the loader component, built with the shared `createLogger` factory. */
var logger3 = createLogger("Loader");
/** `require`-style resolver bound to this module's URL; used below to locate pdfjs-dist. */
var require2 = createRequire(import.meta.url);
/**
 * Resolved location of pdfjs-dist's package.json with the first "package.json"
 * occurrence removed, i.e. the package directory including its trailing separator.
 */
var PDFJS_ROOT = require2.resolve("pdfjs-dist/package.json").replace("package.json", "");
/** pdfjs-dist asset directories passed to `getDocument` by `loadPdfDocument`. */
var CMAP_URL = `${PDFJS_ROOT}cmaps/`;
var STANDARD_FONT_DATA_URL = `${PDFJS_ROOT}standard_fonts/`;
var WASM_URL = `${PDFJS_ROOT}wasm/`;
var ICC_URL = `${PDFJS_ROOT}iccs/`;
/** Largest accepted PDF, in bytes (100 MiB); applied to local files and remote bodies. */
var MAX_PDF_SIZE = 100 * 1024 * 1024;
/** Time budget, in milliseconds, for one remote fetch including its redirects. */
var URL_FETCH_TIMEOUT_MS = 30 * 1000;
/** Number of redirects a remote fetch may follow before it is rejected. */
var MAX_REDIRECTS = 5;
|
|
992
|
+
/**
 * Format a byte count as a whole number of mebibytes followed by "MB".
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} For example "100MB"; the value is rounded by `toFixed(0)`.
 */
var formatBytes = (bytes) => {
  const mebibytes = bytes / 1024 / 1024;
  return `${mebibytes.toFixed(0)}MB`;
};
|
|
993
|
+
/**
 * Cap a source description at 200 characters for use in logs and error text.
 *
 * @param {string} description - Human-readable label of a PDF source.
 * @returns {string} The input unchanged when it has at most 200 characters,
 *   otherwise its first 197 characters followed by "...".
 */
var sanitizeSourceDescription = (description) => {
  if (description.length <= 200) {
    return description;
  }
  return `${description.slice(0, 197)}...`;
};
|
|
994
|
+
/**
 * Read a local PDF into memory after path and size validation.
 *
 * @param {string} userPath - Path as supplied by the caller; validated by `resolvePath`.
 * @returns {Promise<Uint8Array>} The file contents.
 * @throws {PdfError} -32600 when the file is missing, cannot be stat'ed, is not
 *   a regular file, or is larger than MAX_PDF_SIZE. Errors from `resolvePath`
 *   and `readFile` propagate unchanged.
 */
var loadLocalFile = async (userPath) => {
  const safePath = resolvePath(userPath);
  let stats;
  try {
    stats = await fs3.stat(safePath);
  } catch (err) {
    // Only ENOENT gets the "not found" wording; anything else is a generic access failure.
    const isMissing = typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
    const reason = isMissing ? `File not found at '${userPath}'.` : `Failed to access file at '${userPath}'.`;
    throw new PdfError(-32600 /* InvalidRequest */, reason, {
      cause: err instanceof Error ? err : undefined
    });
  }
  if (!stats.isFile()) {
    throw new PdfError(-32600 /* InvalidRequest */, `Path '${userPath}' is not a regular file.`);
  }
  const { size } = stats;
  if (size > MAX_PDF_SIZE) {
    throw new PdfError(-32600 /* InvalidRequest */, `PDF file exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)}. File size: ${formatBytes(size)}.`);
  }
  return new Uint8Array(await fs3.readFile(safePath));
};
|
|
1030
|
-
var
|
|
1031
|
-
if (
|
|
1018
|
+
/**
 * Check one URL (the initial request or a redirect target) before fetching it.
 *
 * @param {string} urlString - Absolute URL about to be requested.
 * @param {object} config - Security configuration; this function reads
 *   `allowHttp` and `allowPrivateIps` and passes the whole object to `isUrlAllowed`.
 * @returns {Promise<void>} Resolves when the URL may be fetched.
 * @throws {PdfError} -32600 when `isUrlAllowed` rejects the URL, the URL cannot
 *   be parsed, or `assertUrlNotPrivate` rejects its hostname.
 */
var validateUrlHop = async (urlString, config) => {
  if (!isUrlAllowed(urlString, config)) {
    let reason = "HTTP access is disabled";
    if (config.allowHttp) {
      reason = "host is not in the allowed list or scheme is not http(s)";
    }
    throw new PdfError(-32600 /* InvalidRequest */, `Access denied: URL '${urlString}' rejected (${reason}).`);
  }
  if (config.allowPrivateIps) {
    // Private-address screening is switched off by configuration.
    return;
  }
  let hostname;
  try {
    ({ hostname } = new URL(urlString));
  } catch {
    throw new PdfError(-32600 /* InvalidRequest */, `Invalid URL: '${urlString}'.`);
  }
  try {
    await assertUrlNotPrivate(hostname);
  } catch (err) {
    const detail = err instanceof Error ? err.message : "SSRF check failed";
    throw new PdfError(-32600 /* InvalidRequest */, `Access denied: ${detail}`);
  }
};
|
|
1038
|
+
/**
 * Download a remote PDF while following redirects manually, so that every hop
 * is re-validated by `validateUrlHop` before it is requested.
 *
 * The whole operation (all hops plus the body download) shares one abort timer
 * of URL_FETCH_TIMEOUT_MS. The body is limited to MAX_PDF_SIZE, checked against
 * the declared Content-Length when present and again while streaming.
 *
 * Responses whose bodies are not going to be read (redirects, non-2xx statuses,
 * oversized Content-Length) have their bodies cancelled explicitly, so the
 * underlying connection is released instead of lingering until garbage collection.
 *
 * @param {string} url - Initial URL to fetch.
 * @param {object} config - Security configuration forwarded to `validateUrlHop`.
 * @returns {Promise<Uint8Array>} The response body.
 * @throws {PdfError} -32600 for validation failures, HTTP errors, redirects
 *   without a Location header, too many redirects, oversized bodies, timeouts
 *   and any other fetch failure (original error attached as `cause`).
 */
var fetchUrlBody = async (url, config) => {
  // Best-effort release of a response body that will not be consumed.
  const discardBody = async (response) => {
    try {
      await response.body?.cancel();
    } catch {
      // The response is being abandoned anyway; a failed cancel changes nothing.
    }
  };
  let currentUrl = url;
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), URL_FETCH_TIMEOUT_MS);
  try {
    // hop 0 is the initial request, so up to MAX_REDIRECTS redirects are followed.
    for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
      await validateUrlHop(currentUrl, config);
      const response = await fetch(currentUrl, {
        redirect: "manual",
        signal: controller.signal
      });
      if (response.status >= 300 && response.status < 400) {
        await discardBody(response);
        const location = response.headers.get("location");
        if (!location) {
          throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed: redirect without Location header.`);
        }
        // Location may be relative; resolve it against the URL that produced it.
        currentUrl = new URL(location, currentUrl).toString();
        continue;
      }
      if (!response.ok) {
        await discardBody(response);
        throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed with HTTP ${String(response.status)}.`);
      }
      const contentLengthHeader = response.headers.get("content-length");
      if (contentLengthHeader !== null) {
        const declared = Number.parseInt(contentLengthHeader, 10);
        if (Number.isFinite(declared) && declared > MAX_PDF_SIZE) {
          await discardBody(response);
          throw new PdfError(-32600 /* InvalidRequest */, `Remote PDF exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)} (Content-Length: ${formatBytes(declared)}).`);
        }
      }
      if (!response.body) {
        // No stream available: buffer the body and check its size afterwards.
        const ab = await response.arrayBuffer();
        if (ab.byteLength > MAX_PDF_SIZE) {
          throw new PdfError(-32600 /* InvalidRequest */, `Remote PDF exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)}.`);
        }
        return new Uint8Array(ab);
      }
      // Stream the body so the size limit holds even when Content-Length is absent or wrong.
      const reader = response.body.getReader();
      const chunks = [];
      let total = 0;
      while (true) {
        const { done, value } = await reader.read();
        if (done) {
          break;
        }
        if (value) {
          total += value.byteLength;
          if (total > MAX_PDF_SIZE) {
            await reader.cancel().catch(() => {});
            throw new PdfError(-32600 /* InvalidRequest */, `Remote PDF exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)} during streaming.`);
          }
          chunks.push(value);
        }
      }
      const combined = new Uint8Array(total);
      let offset = 0;
      for (const chunk of chunks) {
        combined.set(chunk, offset);
        offset += chunk.byteLength;
      }
      return combined;
    }
    throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed: exceeded redirect limit (${String(MAX_REDIRECTS)}).`);
  } catch (err) {
    if (err instanceof PdfError) {
      throw err;
    }
    if (err instanceof Error && (err.name === "AbortError" || err.name === "TimeoutError")) {
      throw new PdfError(-32600 /* InvalidRequest */, `URL fetch timed out after ${String(URL_FETCH_TIMEOUT_MS / 1000)}s.`, { cause: err });
    }
    const message = err instanceof Error ? err.message : String(err);
    logger3.warn("URL fetch failed", { url, error: message });
    throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed for '${url}'.`, {
      cause: err instanceof Error ? err : undefined
    });
  } finally {
    clearTimeout(timeout);
  }
};
|
|
1114
|
+
/**
 * Obtain the bytes of a PDF source and open them with pdf.js.
 *
 * @param {{path?: string, url?: string}} source - Exactly one of `path` or `url`
 *   is expected; `path` wins when both are set.
 * @param {string} sourceDescription - Label used in logs and error messages
 *   (shortened by `sanitizeSourceDescription`).
 * @returns {Promise<object>} The document produced by the pdf.js loading task.
 * @throws {PdfError} -32602 when neither `path` nor `url` is present; -32600
 *   when the bytes cannot be obtained or pdf.js cannot open them. PdfErrors
 *   raised while obtaining the bytes are rethrown unchanged.
 */
var loadPdfDocument = async (source, sourceDescription) => {
  const safeSource = sanitizeSourceDescription(sourceDescription);
  const describe = (err) => (err instanceof Error ? err.message : String(err));
  const causeOf = (err) => (err instanceof Error ? err : undefined);

  let pdfData;
  try {
    if (source.path) {
      pdfData = await loadLocalFile(source.path);
    } else if (source.url) {
      pdfData = await fetchUrlBody(source.url, getSecurityConfig());
    } else {
      throw new PdfError(-32602 /* InvalidParams */, `Source ${safeSource} missing 'path' or 'url'.`);
    }
  } catch (err) {
    if (err instanceof PdfError) {
      throw err;
    }
    logger3.error("Unexpected error preparing PDF source", {
      sourceDescription: safeSource,
      error: describe(err)
    });
    throw new PdfError(-32600 /* InvalidRequest */, `Failed to prepare PDF source ${safeSource}.`, {
      cause: causeOf(err)
    });
  }

  // Created outside the try block below: a synchronous throw from getDocument is not wrapped.
  const loadingTask = getDocument({
    data: pdfData,
    cMapUrl: CMAP_URL,
    cMapPacked: true,
    standardFontDataUrl: STANDARD_FONT_DATA_URL,
    wasmUrl: WASM_URL,
    iccUrl: ICC_URL
  });
  try {
    return await loadingTask.promise;
  } catch (err) {
    logger3.error("PDF.js loading error", { sourceDescription: safeSource, error: describe(err) });
    throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${safeSource}.`, { cause: causeOf(err) });
  }
};
|
|
1155
|
+
|
|
1156
|
+
// src/pdf/parser.ts
|
|
1157
|
+
/** Logger for the page-range parser, built with the shared `createLogger` factory. */
var logger4 = createLogger("Parser");
/** Cap on how far past its start a single range is expanded by `parseRangePart`. */
var MAX_RANGE_SIZE = 10000;
|
|
1159
|
+
/**
 * Parse one comma-separated piece of a page specification and add the
 * resulting 1-based page numbers to `pages`.
 *
 * Accepted forms: a single page ("7"), a closed range ("2-4") and an
 * open-ended range ("5-"). A range is expanded to at most
 * `start + MAX_RANGE_SIZE`; whenever that cap cuts a range short a warning is
 * logged, for finite ranges as well as open-ended ones.
 *
 * Pieces with more than one hyphen ("1-2-3", "5--") are rejected. They were
 * previously misread as "1-2" and as the open-ended "5-".
 *
 * @param {string} part - One piece of the specification; surrounding whitespace is ignored.
 * @param {Set<number>} pages - Collection that receives the page numbers.
 * @returns {void}
 * @throws {Error} When the piece is not a positive page number or a valid range.
 */
var parseRangePart = (part, pages) => {
  const trimmedPart = part.trim();
  if (!trimmedPart.includes("-")) {
    const page = Number.parseInt(trimmedPart, 10);
    if (Number.isNaN(page) || page <= 0) {
      throw new Error(`Invalid page number: ${trimmedPart}`);
    }
    pages.add(page);
    return;
  }
  const bounds = trimmedPart.split("-");
  if (bounds.length !== 2) {
    // More than one hyphen: refuse rather than guess which bounds were meant.
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }
  const [startStr, endStr] = bounds;
  const start = Number.parseInt(startStr, 10);
  // An empty end ("5-") means "to the end of the document".
  const end = endStr === "" ? Infinity : Number.parseInt(endStr, 10);
  if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }
  const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
  for (let i = start; i <= practicalEnd; i++) {
    pages.add(i);
  }
  if (practicalEnd < end) {
    if (end === Infinity) {
      logger4.warn("Open-ended range truncated", { start, practicalEnd });
    } else {
      logger4.warn("Page range truncated", { start, end, practicalEnd });
    }
  }
};
|
|
1206
|
-
var
|
|
1207
|
-
const
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
...field.value !== undefined ? { value: field.value } : {},
|
|
1217
|
-
...field.defaultValue !== undefined ? { default_value: field.defaultValue } : {},
|
|
1218
|
-
...page !== undefined ? { page } : {},
|
|
1219
|
-
...field.id ? { id: field.id } : {},
|
|
1220
|
-
...field.editable !== undefined ? { editable: field.editable } : {},
|
|
1221
|
-
...field.required !== undefined ? { required: field.required } : {},
|
|
1222
|
-
...boundingBox ? { bounding_box: boundingBox } : {}
|
|
1223
|
-
};
|
|
1185
|
+
/**
 * Expand a comma-separated page specification such as "1-3,7" into page numbers.
 *
 * @param {string} ranges - Specification; each piece is handled by `parseRangePart`.
 * @returns {number[]} Distinct page numbers in ascending order.
 * @throws {Error} When a piece is invalid (raised by `parseRangePart`) or no
 *   page was collected.
 */
var parsePageRanges = (ranges) => {
  const collected = new Set();
  ranges.split(",").forEach((segment) => {
    parseRangePart(segment, collected);
  });
  if (collected.size === 0) {
    throw new Error("Page range string resulted in zero valid pages.");
  }
  return [...collected].sort((lhs, rhs) => lhs - rhs);
};
|
|
1225
|
-
var
|
|
1226
|
-
|
|
1227
|
-
const title = textFromAnnotationField(annotation.title, annotation.titleObj);
|
|
1228
|
-
const boundingBox = buildRectBoundingBox(annotation.rect);
|
|
1229
|
-
const subtype = annotation.subtype?.trim();
|
|
1230
|
-
const url = annotation.url ?? annotation.unsafeUrl;
|
|
1231
|
-
if (!annotation.id && !subtype && !contents && !title && !url && annotation.dest === undefined) {
|
|
1196
|
+
/**
 * Normalise the `pages` option of a source into a sorted list of page numbers.
 *
 * @param {string|number[]|undefined|null} sourcePages - A range string such as
 *   "1-3,7", an array of positive integers, or a falsy value for "no selection".
 * @param {string} sourceDescription - Label of the source, used in error messages.
 * @returns {number[]|undefined} Distinct ascending page numbers, or `undefined`
 *   when `sourcePages` is falsy.
 * @throws {PdfError} -32602 when the specification is invalid. The underlying
 *   error is attached as `cause`, matching the other error wrappers in this file.
 */
var getTargetPages = (sourcePages, sourceDescription) => {
  if (!sourcePages) {
    return;
  }
  try {
    if (typeof sourcePages === "string") {
      return parsePageRanges(sourcePages);
    }
    if (!Array.isArray(sourcePages)) {
      // Report the real problem instead of an internal "some is not a function".
      throw new Error("Pages must be a range string or an array of page numbers.");
    }
    if (sourcePages.some((p) => !Number.isInteger(p) || p <= 0)) {
      throw new Error("Page numbers in array must be positive integers.");
    }
    const uniquePages = [...new Set(sourcePages)].sort((a, b) => a - b);
    if (uniquePages.length === 0) {
      throw new Error("Page specification resulted in an empty set of pages.");
    }
    return uniquePages;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${message}`, {
      cause: error instanceof Error ? error : undefined
    });
  }
};
|
|
1245
|
-
var
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
}
|
|
1217
|
+
/**
 * Decide which pages to read, and which requested pages lie beyond the document.
 *
 * @param {number[]|undefined} targetPages - Explicitly requested pages, if any.
 * @param {number} totalPages - Number of pages in the document.
 * @param {boolean} includeFullText - When no pages were requested, whether to
 *   process every page.
 * @returns {{pagesToProcess: number[], invalidPages: number[]}} Requested pages
 *   split by whether they are `<= totalPages`; or every page / no page when
 *   nothing was requested.
 */
var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
  if (!targetPages) {
    const pagesToProcess = includeFullText ? Array.from({ length: totalPages }, (_, offset) => offset + 1) : [];
    return { pagesToProcess, invalidPages: [] };
  }
  const pagesToProcess = [];
  const invalidPages = [];
  targetPages.forEach((pageNumber) => {
    if (pageNumber <= totalPages) {
      pagesToProcess.push(pageNumber);
    } else if (pageNumber > totalPages) {
      invalidPages.push(pageNumber);
    }
  });
  return { pagesToProcess, invalidPages };
};
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1229
|
+
|
|
1230
|
+
// src/pdf/inspector.ts
|
|
1231
|
+
/** Logger for the inspector component, built with the shared `createLogger` factory. */
var logger5 = createLogger("Inspector");
/** Default number of pages to sample (presumably applied by the tool's option handling — confirm at the call site). */
var DEFAULT_SAMPLE_PAGES = 5;
/** Upper bound enforced by `clampSamplePageCount`. */
var MAX_SAMPLE_PAGES = 20;
/** A page with fewer text characters than this, plus image paints, is flagged `likely_scanned`. */
var LOW_TEXT_CHAR_THRESHOLD = 20;
/** A page with at least this many text characters counts as digital text; below it, `low_text_density` is set. */
var DIGITAL_TEXT_CHAR_THRESHOLD = 80;
/** Divisor used to turn a character count into `estimated_tokens`. */
var APPROX_CHARS_PER_TOKEN = 4;
|
|
1237
|
+
/**
 * Clamp a requested sample size to a whole number in [1, MAX_SAMPLE_PAGES].
 *
 * A value that is not a number (`NaN`, `undefined`, non-numeric input) falls
 * back to DEFAULT_SAMPLE_PAGES. Previously `NaN` was returned for such input,
 * which made the sampling loops downstream select no pages.
 *
 * @param {number} value - Requested number of pages to sample.
 * @returns {number} Integer between 1 and MAX_SAMPLE_PAGES inclusive.
 */
var clampSamplePageCount = (value) => {
  const floored = Math.floor(value);
  const requested = Number.isNaN(floored) ? DEFAULT_SAMPLE_PAGES : floored;
  return Math.min(MAX_SAMPLE_PAGES, Math.max(1, requested));
};
|
|
1238
|
+
/**
 * Copy only the `path`, `url` and `pages` fields of a source, skipping falsy ones.
 *
 * @param {{path?: string, url?: string, pages?: unknown}} source - Source descriptor.
 * @returns {object} New object holding the truthy fields, in the order path, url, pages.
 */
var publicSource = (source) => {
  const exposed = {};
  if (source.path) {
    exposed.path = source.path;
  }
  if (source.url) {
    exposed.url = source.url;
  }
  if (source.pages) {
    exposed.pages = source.pages;
  }
  return exposed;
};
|
|
1243
|
+
/**
 * Pick up to `maxItems` values spread evenly across the sorted distinct inputs.
 *
 * @param {number[]} values - Candidate values; duplicates are ignored.
 * @param {number} maxItems - Maximum number of values to return.
 * @returns {number[]} Ascending selection. All distinct values are returned
 *   when there are at most `maxItems` of them; with `maxItems === 1` the
 *   smallest value is returned.
 */
var selectEvenlySpaced = (values, maxItems) => {
  const ascending = Array.from(new Set(values));
  ascending.sort((lhs, rhs) => lhs - rhs);
  if (ascending.length <= maxItems) {
    return ascending;
  }
  if (maxItems === 1) {
    return [ascending[0]];
  }
  const lastIndex = ascending.length - 1;
  const picked = new Set();
  for (let slot = 0; slot < maxItems; slot += 1) {
    const candidate = ascending[Math.round(slot * lastIndex / (maxItems - 1))];
    if (candidate !== undefined) {
      picked.add(candidate);
    }
  }
  // Top up from the smallest values if the slots above produced fewer than maxItems.
  for (const candidate of ascending) {
    if (picked.size >= maxItems) {
      break;
    }
    picked.add(candidate);
  }
  return Array.from(picked).sort((lhs, rhs) => lhs - rhs);
};
|
|
1263
|
+
/**
 * Choose which pages of a document to inspect.
 *
 * @param {number} totalPages - Number of pages in the document.
 * @param {number[]|undefined} targetPages - Pages the caller restricted the
 *   inspection to; when given, the sample is drawn only from those in range.
 * @param {number} samplePageCount - Requested sample size, clamped by
 *   `clampSamplePageCount`.
 * @returns {number[]} Ascending 1-based page numbers; empty when the document
 *   has no pages.
 */
var selectInspectionSamplePages = (totalPages, targetPages, samplePageCount) => {
  if (totalPages <= 0) {
    return [];
  }
  const maxSamples = clampSamplePageCount(samplePageCount);
  if (targetPages !== undefined) {
    const validTargetPages = targetPages.filter((page) => page >= 1 && page <= totalPages);
    return selectEvenlySpaced(validTargetPages, maxSamples);
  }
  if (totalPages <= maxSamples) {
    return Array.from({ length: totalPages }, (_, index) => index + 1);
  }
  if (maxSamples === 1) {
    // The spacing formula below divides by (maxSamples - 1); with a single
    // sample that is 0 / 0 and produced the page number NaN. Sample page 1,
    // as selectEvenlySpaced does for a single item.
    return [1];
  }
  const sampled = new Set();
  for (let i = 0; i < maxSamples; i++) {
    const page = 1 + Math.round(i * (totalPages - 1) / (maxSamples - 1));
    sampled.add(page);
  }
  return [...sampled].sort((a, b) => a - b);
};
|
|
1281
|
+
/**
 * Summarise sampled page signals into one document profile.
 *
 * @param {Array<{likely_scanned: boolean, text_chars: number}>} pageSignals -
 *   Per-page signals as produced by `inspectPageSignal`.
 * @returns {"unknown"|"scanned_or_image_only"|"mixed_text_and_scan"|"digital_text"|"low_text_or_form"}
 *   "unknown" when nothing was sampled; "scanned_or_image_only" when every page
 *   is flagged scanned; "mixed_text_and_scan" when scanned pages and pages at or
 *   above DIGITAL_TEXT_CHAR_THRESHOLD both occur; "digital_text" when only the
 *   latter occur; "low_text_or_form" otherwise.
 */
var classifyPdfInspectionProfile = (pageSignals) => {
  const sampledCount = pageSignals.length;
  if (sampledCount === 0) {
    return "unknown";
  }
  let scannedCount = 0;
  let digitalTextCount = 0;
  pageSignals.forEach((signal) => {
    if (signal.likely_scanned) {
      scannedCount += 1;
    }
    if (signal.text_chars >= DIGITAL_TEXT_CHAR_THRESHOLD) {
      digitalTextCount += 1;
    }
  });
  if (scannedCount === sampledCount) {
    return "scanned_or_image_only";
  }
  if (digitalTextCount === 0) {
    return "low_text_or_form";
  }
  return scannedCount > 0 ? "mixed_text_and_scan" : "digital_text";
};
|
|
1294
|
+
/**
 * Count the operators on a page that equal `OPS2.paintImageXObject` or
 * `OPS2.paintXObject`.
 *
 * @param {object} page - pdf.js page proxy exposing `getOperatorList()`.
 * @returns {Promise<number>} Number of matching operators, or 0 when the
 *   operator list cannot be read (the failure is logged as a warning).
 */
var countImagePaintOperations = async (page) => {
  try {
    const { fnArray } = await page.getOperatorList();
    return fnArray.reduce((paints, op) => {
      const isImagePaint = op === OPS2.paintImageXObject || op === OPS2.paintXObject;
      return isImagePaint ? paints + 1 : paints;
    }, 0);
  } catch (error) {
    // Best effort: an unreadable operator list counts as "no images".
    logger5.warn("Error counting image paint operations", {
      error: error instanceof Error ? error.message : String(error)
    });
    return 0;
  }
};
|
|
1264
|
-
var
|
|
1265
|
-
const
|
|
1266
|
-
const
|
|
1304
|
+
/**
 * Gather lightweight signals about one page: how much selectable text it has
 * and how many image paint operations it contains.
 *
 * @param {object} pdfDocument - pdf.js document proxy exposing `getPage()`.
 * @param {number} pageNum - 1-based page number.
 * @returns {Promise<object>} Signal record with `page`, `text_chars` (sum of
 *   trimmed item lengths), `text_items` (items that are non-empty after
 *   trimming), `estimated_tokens`, `image_paint_operations`, `likely_scanned`
 *   and `low_text_density`.
 */
var inspectPageSignal = async (pdfDocument, pageNum) => {
  const page = await pdfDocument.getPage(pageNum);
  const { items } = await page.getTextContent();
  let textChars = 0;
  let textItems = 0;
  items.forEach((item) => {
    const { str } = item;
    // Entries without a string `str` are skipped.
    if (typeof str !== "string") {
      return;
    }
    const visibleLength = str.trim().length;
    textChars += visibleLength;
    if (visibleLength > 0) {
      textItems += 1;
    }
  });
  const imagePaintOperations = await countImagePaintOperations(page);
  return {
    page: pageNum,
    text_chars: textChars,
    text_items: textItems,
    estimated_tokens: Math.ceil(textChars / APPROX_CHARS_PER_TOKEN),
    image_paint_operations: imagePaintOperations,
    // Scanned heuristic: almost no selectable text, but something is painted as an image.
    likely_scanned: textChars < LOW_TEXT_CHAR_THRESHOLD && imagePaintOperations > 0,
    low_text_density: textChars < DIGITAL_TEXT_CHAR_THRESHOLD
  };
};
|
|
1272
|
-
var
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1321
|
+
/**
 * Reduce extracted document structure to presence flags.
 *
 * @param {object} structureOutput - Structure extraction result; the optional
 *   fields `outline`, `page_labels`, `permissions`, `form_fields` and
 *   `attachments` are read through `.length`, `mark_info` through its own keys.
 * @param {boolean} hasStructureTree - Whether a structure tree was found.
 * @returns {object} One boolean `has_*` flag per feature.
 */
var buildDocumentSignals = (structureOutput, hasStructureTree) => {
  const hasEntries = (list) => (list?.length ?? 0) > 0;
  const markInfoKeys = Object.keys(structureOutput.mark_info ?? {});
  return {
    has_outline: hasEntries(structureOutput.outline),
    has_page_labels: hasEntries(structureOutput.page_labels),
    has_permissions: hasEntries(structureOutput.permissions),
    has_mark_info: markInfoKeys.length > 0,
    has_form_fields: hasEntries(structureOutput.form_fields),
    has_attachments: hasEntries(structureOutput.attachments),
    has_structure_tree: hasStructureTree
  };
};
|
|
1330
|
+
/**
 * Set `target[key]` to `true` when `enabled` is truthy; otherwise leave
 * `target` untouched, so the key is not created.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name.
 * @param {unknown} enabled - Condition.
 * @returns {void}
 */
var setTrue = (target, key, enabled) => {
  if (!enabled) {
    return;
  }
  target[key] = true;
};
|
|
1334
|
+
/**
 * Turn an inspection profile into a suggested follow-up `read_pdf` call.
 *
 * The suggested arguments always request metadata, page count and page
 * geometry, plus one `include_*` flag per document feature that was detected.
 * The "mixed_text_and_scan" and "digital_text" profiles additionally enable the
 * text-oriented options (chunks, semantic hints, safety findings, markdown, tables).
 *
 * @param {object} source - Source descriptor; reduced via `publicSource`.
 * @param {string} profile - Result of `classifyPdfInspectionProfile`.
 * @param {object} documentSignals - Flags from `buildDocumentSignals`.
 * @returns {{workflow: string, needs_ocr: boolean, reason: string, read_pdf_arguments: object}}
 */
var buildInspectionRecommendation = (source, profile, documentSignals) => {
  const readPdfArguments = {
    sources: [publicSource(source)],
    include_metadata: true,
    include_page_count: true,
    include_page_geometry: true
  };
  const featureFlags = [
    ["include_outline", documentSignals.has_outline],
    ["include_page_labels", documentSignals.has_page_labels],
    ["include_permissions", documentSignals.has_permissions],
    ["include_form_fields", documentSignals.has_form_fields],
    ["include_attachments", documentSignals.has_attachments],
    ["include_structure_tree", documentSignals.has_structure_tree]
  ];
  for (const [flag, detected] of featureFlags) {
    setTrue(readPdfArguments, flag, detected);
  }
  // Options that only pay off when at least some pages carry selectable text.
  const enableTextOptions = () => {
    Object.assign(readPdfArguments, {
      include_chunks: true,
      include_semantic_hints: true,
      include_safety_findings: true,
      include_markdown: true,
      include_tables: true
    });
  };
  switch (profile) {
    case "scanned_or_image_only":
      return {
        workflow: "scanned_pdf_triage",
        needs_ocr: true,
        reason: "Sampled pages contain little selectable text and visible image paint operations; OCR or an optional advanced engine is likely required for text extraction.",
        read_pdf_arguments: readPdfArguments
      };
    case "mixed_text_and_scan":
      enableTextOptions();
      return {
        workflow: "mixed_pdf_review",
        needs_ocr: true,
        reason: "Some sampled pages look text-based while others look image-only; use read_pdf for selectable-text pages and OCR for scanned pages.",
        read_pdf_arguments: readPdfArguments
      };
    case "digital_text":
      enableTextOptions();
      return {
        workflow: "agentic_rag",
        needs_ocr: false,
        reason: "Sampled pages expose selectable text; citation chunks, semantic hints, table extraction, and safety findings are the highest-value next read_pdf options.",
        read_pdf_arguments: readPdfArguments
      };
    default:
      return {
        workflow: "metadata_review",
        needs_ocr: false,
        reason: "Sampled pages expose limited text; inspect metadata, forms, attachments, structure, and selected pages before running a heavier extraction.",
        read_pdf_arguments: readPdfArguments
      };
  }
};
|
|
1291
|
-
var
|
|
1292
|
-
const
|
|
1293
|
-
|
|
1294
|
-
try {
|
|
1295
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
1296
|
-
if (typeof page.getStructTree !== "function")
|
|
1297
|
-
continue;
|
|
1298
|
-
const rawTree = await page.getStructTree();
|
|
1299
|
-
if (!rawTree)
|
|
1300
|
-
continue;
|
|
1301
|
-
pageStructureTrees.push({
|
|
1302
|
-
page: pageNum,
|
|
1303
|
-
tree: normalizeStructureTreeNode(rawTree)
|
|
1304
|
-
});
|
|
1305
|
-
} catch (error) {
|
|
1306
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1307
|
-
logger3.warn("Error extracting structure tree", { pageNum, error: message });
|
|
1308
|
-
}
|
|
1309
|
-
}
|
|
1310
|
-
return pageStructureTrees;
|
|
1311
|
-
};
|
|
1312
|
-
var extractPageGeometry = async (pdfDocument, pagesToProcess) => {
|
|
1313
|
-
const pageGeometry = [];
|
|
1314
|
-
for (const pageNum of pagesToProcess) {
|
|
1315
|
-
try {
|
|
1316
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
1317
|
-
const viewBox = buildRectBoundingBox(page.view);
|
|
1318
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
1319
|
-
const width = finiteNumber(viewport.width) ? viewport.width : viewBox ? viewBox.right - viewBox.left : undefined;
|
|
1320
|
-
const height = finiteNumber(viewport.height) ? viewport.height : viewBox ? viewBox.top - viewBox.bottom : undefined;
|
|
1321
|
-
if (!finiteNumber(width) || !finiteNumber(height)) {
|
|
1322
|
-
logger3.warn("Skipping page geometry with invalid dimensions", { pageNum });
|
|
1323
|
-
continue;
|
|
1324
|
-
}
|
|
1325
|
-
pageGeometry.push({
|
|
1326
|
-
page: pageNum,
|
|
1327
|
-
width,
|
|
1328
|
-
height,
|
|
1329
|
-
rotation: finiteNumber(page.rotate) ? page.rotate : 0,
|
|
1330
|
-
...finiteNumber(page.userUnit) ? { user_unit: page.userUnit } : {},
|
|
1331
|
-
...viewBox ? { view_box: viewBox } : {}
|
|
1332
|
-
});
|
|
1333
|
-
} catch (error) {
|
|
1334
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1335
|
-
logger3.warn("Error extracting page geometry", { pageNum, error: message });
|
|
1336
|
-
}
|
|
1337
|
-
}
|
|
1338
|
-
return pageGeometry;
|
|
1339
|
-
};
|
|
1340
|
-
var buildWarnings = (invalidPages, totalPages) => {
|
|
1341
|
-
if (invalidPages.length === 0) {
|
|
1342
|
-
return [];
|
|
1343
|
-
}
|
|
1344
|
-
return [
|
|
1345
|
-
`Requested page numbers ${invalidPages.join(", ")} exceed total pages (${String(totalPages)}).`
|
|
1346
|
-
];
|
|
1347
|
-
};
|
|
1348
|
-
var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
|
|
1349
|
-
const contentItems = [];
|
|
1392
|
+
var inspectPdfSource = async (source, options) => {
|
|
1393
|
+
const sourceDescription = source.path ?? source.url ?? "unknown source";
|
|
1394
|
+
let pdfDocument = null;
|
|
1350
1395
|
try {
|
|
1351
|
-
const
|
|
1352
|
-
const
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
}
|
|
1397
|
-
const imageName = argsArray[0];
|
|
1398
|
-
let xPosition;
|
|
1399
|
-
let yPosition;
|
|
1400
|
-
if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
|
|
1401
|
-
const transform = argsArray[1];
|
|
1402
|
-
const xCoord = transform[4];
|
|
1403
|
-
const yCoord = transform[5];
|
|
1404
|
-
if (xCoord !== undefined) {
|
|
1405
|
-
xPosition = Math.round(xCoord);
|
|
1406
|
-
}
|
|
1407
|
-
if (yCoord !== undefined) {
|
|
1408
|
-
yPosition = Math.round(yCoord);
|
|
1409
|
-
}
|
|
1410
|
-
}
|
|
1411
|
-
const imageData = await retrieveImageData(page, imageName, pageNum);
|
|
1412
|
-
const extractedImage = processImageData(imageData, pageNum, arrayIndex);
|
|
1413
|
-
if (extractedImage) {
|
|
1414
|
-
const imageBox = buildBoundingBox2(xPosition, yPosition, extractedImage.width, extractedImage.height);
|
|
1415
|
-
extractedImage.bounding_box = imageBox;
|
|
1416
|
-
return {
|
|
1417
|
-
type: "image",
|
|
1418
|
-
yPosition: imageBox?.top ?? yPosition ?? 0,
|
|
1419
|
-
xPosition,
|
|
1420
|
-
width: extractedImage.width,
|
|
1421
|
-
height: extractedImage.height,
|
|
1422
|
-
bounding_box: imageBox,
|
|
1423
|
-
imageData: extractedImage
|
|
1424
|
-
};
|
|
1425
|
-
}
|
|
1426
|
-
return null;
|
|
1427
|
-
});
|
|
1428
|
-
const resolvedImages = await Promise.all(imagePromises);
|
|
1429
|
-
const validImages = resolvedImages.filter((item) => item !== null);
|
|
1430
|
-
contentItems.push(...validImages);
|
|
1431
|
-
}
|
|
1396
|
+
const targetPages = getTargetPages(source.pages, sourceDescription);
|
|
1397
|
+
const { pages: _pages, ...loadArgs } = source;
|
|
1398
|
+
pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
|
|
1399
|
+
const totalPages = pdfDocument.numPages;
|
|
1400
|
+
const validTargetPages = targetPages?.filter((page) => page <= totalPages);
|
|
1401
|
+
const invalidPages = targetPages?.filter((page) => page > totalPages) ?? [];
|
|
1402
|
+
const sampledPages = selectInspectionSamplePages(totalPages, validTargetPages, options.sample_pages);
|
|
1403
|
+
const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.include_metadata, true);
|
|
1404
|
+
const structureOutput = await extractDocumentStructure(pdfDocument, {
|
|
1405
|
+
includeOutline: true,
|
|
1406
|
+
includePageLabels: true,
|
|
1407
|
+
includePermissions: true,
|
|
1408
|
+
includeFormFields: true,
|
|
1409
|
+
includeAttachments: true
|
|
1410
|
+
});
|
|
1411
|
+
const structureTrees = sampledPages.length > 0 ? await extractStructureTrees(pdfDocument, sampledPages) : [];
|
|
1412
|
+
const documentSignals = buildDocumentSignals(structureOutput, structureTrees.length > 0);
|
|
1413
|
+
const pageSignals = await Promise.all(sampledPages.map((pageNum) => inspectPageSignal(pdfDocument, pageNum)));
|
|
1414
|
+
const pageGeometry = sampledPages.length > 0 ? await extractPageGeometry(pdfDocument, sampledPages) : [];
|
|
1415
|
+
const profile = classifyPdfInspectionProfile(pageSignals);
|
|
1416
|
+
const recommendation = buildInspectionRecommendation(source, profile, documentSignals);
|
|
1417
|
+
const warnings = buildWarnings(invalidPages, totalPages);
|
|
1418
|
+
if (targetPages !== undefined && sampledPages.length === 0) {
|
|
1419
|
+
warnings.push("No requested pages are inside the document page range.");
|
|
1420
|
+
}
|
|
1421
|
+
if (recommendation.needs_ocr) {
|
|
1422
|
+
warnings.push("Default PDF Reader MCP does not perform OCR; use an optional OCR-capable engine for scanned pages.");
|
|
1423
|
+
}
|
|
1424
|
+
const data = {
|
|
1425
|
+
profile,
|
|
1426
|
+
num_pages: totalPages,
|
|
1427
|
+
sampled_pages: sampledPages,
|
|
1428
|
+
page_signals: pageSignals,
|
|
1429
|
+
document_signals: documentSignals,
|
|
1430
|
+
recommendation,
|
|
1431
|
+
...metadataOutput.info ? { info: metadataOutput.info } : {},
|
|
1432
|
+
...metadataOutput.metadata ? { metadata: metadataOutput.metadata } : {},
|
|
1433
|
+
...pageGeometry.length > 0 ? { page_geometry: pageGeometry } : {},
|
|
1434
|
+
...warnings.length > 0 ? { warnings } : {}
|
|
1435
|
+
};
|
|
1436
|
+
return {
|
|
1437
|
+
source: sourceDescription,
|
|
1438
|
+
success: true,
|
|
1439
|
+
data
|
|
1440
|
+
};
|
|
1432
1441
|
} catch (error) {
|
|
1442
|
+
if (error instanceof PdfError) {
|
|
1443
|
+
return { source: sourceDescription, success: false, error: error.message };
|
|
1444
|
+
}
|
|
1433
1445
|
const message = error instanceof Error ? error.message : String(error);
|
|
1434
|
-
|
|
1435
|
-
pageNum,
|
|
1446
|
+
logger5.error("Unexpected error inspecting PDF source", {
|
|
1436
1447
|
sourceDescription,
|
|
1437
1448
|
error: message
|
|
1438
1449
|
});
|
|
1439
|
-
return
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1450
|
+
return {
|
|
1451
|
+
source: sourceDescription,
|
|
1452
|
+
success: false,
|
|
1453
|
+
error: `Failed to inspect PDF from ${sourceDescription}.`
|
|
1454
|
+
};
|
|
1455
|
+
} finally {
|
|
1456
|
+
const loadingTask = pdfDocument?.loadingTask;
|
|
1457
|
+
if (loadingTask && typeof loadingTask.destroy === "function") {
|
|
1458
|
+
try {
|
|
1459
|
+
await loadingTask.destroy();
|
|
1460
|
+
} catch (destroyError) {
|
|
1461
|
+
const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
|
|
1462
|
+
logger5.warn("Error destroying PDF document after inspection", {
|
|
1463
|
+
sourceDescription,
|
|
1464
|
+
error: message
|
|
1465
|
+
});
|
|
1444
1466
|
}
|
|
1445
|
-
|
|
1467
|
+
}
|
|
1446
1468
|
}
|
|
1447
|
-
return sortPageContentItems(contentItems);
|
|
1448
1469
|
};
|
|
1470
|
+
/**
 * Builds the default option set for a PDF inspection run.
 *
 * A new object is created on every call, so callers can spread it and
 * override individual keys without sharing state.
 *
 * @returns {{ sample_pages: number, include_metadata: boolean }} defaults
 */
var defaultInspectPdfOptions = () => {
  const defaults = {
    sample_pages: DEFAULT_SAMPLE_PAGES,
    include_metadata: true
  };
  return defaults;
};
|
|
1449
1474
|
|
|
1450
|
-
// src/
|
|
1451
|
-
import
|
|
1452
|
-
|
|
1453
|
-
|
|
1475
|
+
// src/schemas/inspectPdf.ts
|
|
1476
|
+
import {
|
|
1477
|
+
array as array2,
|
|
1478
|
+
bool as bool2,
|
|
1479
|
+
description as description2,
|
|
1480
|
+
gte as gte2,
|
|
1481
|
+
int as int2,
|
|
1482
|
+
lte,
|
|
1483
|
+
num as num2,
|
|
1484
|
+
object as object2,
|
|
1485
|
+
optional as optional2
|
|
1486
|
+
} from "@sylphx/vex";
|
|
1454
1487
|
|
|
1455
|
-
// src/
|
|
1456
|
-
import
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1488
|
+
// src/schemas/readPdf.ts
|
|
1489
|
+
import {
|
|
1490
|
+
array,
|
|
1491
|
+
bool,
|
|
1492
|
+
description,
|
|
1493
|
+
gte,
|
|
1494
|
+
int,
|
|
1495
|
+
min,
|
|
1496
|
+
num,
|
|
1497
|
+
object,
|
|
1498
|
+
optional,
|
|
1499
|
+
str,
|
|
1500
|
+
union
|
|
1501
|
+
} from "@sylphx/vex";
|
|
1502
|
+
// A page selection is either a list of integers (each >= 1) or a non-empty string.
var pageSpecifierSchema = (() => {
  const pageNumberList = array(num(int, gte(1)));
  const pageSpecText = str(min(1));
  return union(pageNumberList, pageSpecText);
})();

// One PDF source: an optional local path, an optional URL and an optional page selection.
var pdfSourceSchema = (() => {
  const pathField = optional(str(min(1), description("Path to the local PDF file (absolute or relative to cwd).")));
  const urlField = optional(str(min(1), description("URL of the PDF file.")));
  const pagesField = optional(pageSpecifierSchema);
  return object({
    path: pathField,
    url: urlField,
    pages: pagesField
  });
})();
|
|
1508
|
+
// Input schema for the read_pdf tool: a list of sources plus optional boolean
// switches, each carrying its own description.
var readPdfArgsSchema = (() => {
  // Every switch has the same shape: an optional boolean with a description.
  const flag = (summary) => optional(bool(description(summary)));
  return object({
    sources: array(pdfSourceSchema),
    include_full_text: flag("Include the full text content of each PDF (only if 'pages' is not specified for that source)."),
    include_metadata: flag("Include metadata and info objects for each PDF."),
    include_page_count: flag("Include the total number of pages for each PDF."),
    include_images: flag("Extract and include embedded images from the PDF pages as base64-encoded data."),
    include_tables: flag("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures."),
    include_elements: flag("Include agent-ready structured document elements with page numbers, stable IDs, provenance, and best-effort bounding boxes."),
    include_semantic_hints: flag("Include deterministic semantic hints on text elements, such as heading, list item, or paragraph."),
    include_markdown: flag("Include a Markdown rendering of extracted pages for RAG, summarization, and agent context."),
    include_html: flag("Include a simple HTML rendering of extracted pages for preview, export, and downstream conversion."),
    include_chunks: flag("Include page-level citation-ready chunks with text, element IDs, page ranges, and best-effort bounding boxes."),
    include_outline: flag("Include document outline/bookmark entries when the PDF exposes them."),
    include_annotations: flag("Include page annotations such as links, notes, and form-related annotations with safe summary fields."),
    include_page_labels: flag("Include PDF page labels when available, such as roman numerals or section labels."),
    include_page_geometry: flag("Include page viewport geometry such as width, height, rotation, user unit, and view box."),
    include_permissions: flag("Include PDF permission and marking signals when exposed by the parser."),
    include_form_fields: flag("Include PDF form field summaries when AcroForm fields are exposed."),
    include_attachments: flag("Include embedded attachment metadata such as filename and size. Attachment bytes are not returned."),
    include_structure_tree: flag("Include best-effort tagged PDF structure trees for selected pages when the PDF exposes them."),
    include_safety_findings: flag("Include deterministic content safety findings for prompt-injection patterns, tiny text, and off-page text.")
  });
})();
|
|
1530
|
+
|
|
1531
|
+
// src/schemas/inspectPdf.ts
|
|
1532
|
+
// Input schema for the inspect_pdf tool: the sources to profile, an optional
// sampling budget (integer between 1 and 20) and an optional metadata switch.
var inspectPdfArgsSchema = (() => {
  const sourcesField = array2(pdfSourceSchema);
  const samplePagesField = optional2(num2(int2, gte2(1), lte(20), description2("Maximum number of pages to sample per source for lightweight PDF profiling. Defaults to 5.")));
  const includeMetadataField = optional2(bool2(description2("Include PDF metadata and info objects in the inspection response.")));
  return object2({
    sources: sourcesField,
    sample_pages: samplePagesField,
    include_metadata: includeMetadataField
  });
})();
|
|
1537
|
+
|
|
1538
|
+
// src/handlers/inspectPdf.ts
|
|
1539
|
+
// Upper bound on how many sources are inspected in parallel within one batch.
var MAX_CONCURRENT_SOURCES = 3;

/**
 * inspect_pdf tool handler.
 *
 * Merges caller options over the defaults, inspects the sources in batches of
 * MAX_CONCURRENT_SOURCES (each batch runs concurrently, batches run one after
 * another) and returns the per-source results as pretty-printed JSON.
 *
 * Returns a tool error when no sources are supplied or when every source
 * failed inspection.
 */
var inspectPdf = tool().description("Inspects one or more PDFs and recommends the best read_pdf options for agentic extraction, citations, safety, and OCR triage.").input(inspectPdfArgsSchema).handler(async ({ input }) => {
  // `[].every(...)` is vacuously true, so without this guard an empty request
  // would be reported as "all sources failed" with an empty error list.
  if (input.sources.length === 0) {
    return toolError("No PDF sources were provided for inspection.");
  }
  const options = {
    ...defaultInspectPdfOptions(),
    ...input.sample_pages !== undefined ? { sample_pages: input.sample_pages } : {},
    ...input.include_metadata !== undefined ? { include_metadata: input.include_metadata } : {}
  };
  const results = [];
  for (let i = 0; i < input.sources.length; i += MAX_CONCURRENT_SOURCES) {
    const batch = input.sources.slice(i, i + MAX_CONCURRENT_SOURCES);
    const batchResults = await Promise.all(batch.map((source) => inspectPdfSource(source, options)));
    results.push(...batchResults);
  }
  if (results.every((result) => !result.success)) {
    const errorMessages = results.map((result) => result.error).join("; ");
    return toolError(`All PDF sources failed inspection: ${errorMessages}`);
  }
  return text(JSON.stringify({ results }, null, 2));
});
|
|
1558
|
+
|
|
1559
|
+
// src/handlers/readPdf.ts
|
|
1560
|
+
import { image, text as text2, tool as tool2, toolError as toolError2 } from "@sylphx/mcp-server-sdk";
|
|
1561
|
+
|
|
1562
|
+
// src/pdf/tableExtractor.ts
|
|
1563
|
+
// Logger used by the table extraction helpers below.
var logger6 = createLogger("TableExtractor");

// Default maximum |y| difference for a text item to join the current row (see clusterByY).
var Y_TOLERANCE = 5;

// Default minimum gap between consecutive sorted x positions that starts a new
// column (see detectColumnBoundaries); also the basis for column-matching tolerances.
var COLUMN_GAP_THRESHOLD = 15;

// Minimum number of rows a region needs to count as a table.
var MIN_ROWS = 2;

// Minimum number of detected column boundaries needed to count as a table.
var MIN_COLS = 2;

// Minimum number of text items a row needs to be a table-row candidate.
var MIN_ROW_ITEMS = 2;
|
|
1569
|
+
/**
 * Builds a { left, bottom, right, top } box from an origin and a size.
 *
 * @param {number} x - left edge
 * @param {number} y - bottom edge
 * @param {number} width - horizontal extent; negative values are treated as 0
 * @param {number} [height] - vertical extent; negative values are treated as 0
 * @returns {{left: number, bottom: number, right: number, top: number} | undefined}
 *   the box, or undefined when any input is missing or not a finite number
 */
var buildBoundingBox2 = (x, y, width, height) => {
  const originIsFinite = Number.isFinite(x) && Number.isFinite(y) && Number.isFinite(width);
  const heightIsFinite = height !== undefined && Number.isFinite(height);
  if (!originIsFinite || !heightIsFinite) {
    return undefined;
  }
  const safeWidth = Math.max(0, width);
  const safeHeight = Math.max(0, height);
  return {
    left: x,
    bottom: y,
    right: x + safeWidth,
    top: y + safeHeight
  };
};
|
|
1503
|
-
var
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
const envDirs = envList(env["MCP_PDF_ALLOWED_DIRS"], /[:,]/);
|
|
1507
|
-
const envHosts = envList(env["MCP_PDF_ALLOWED_HOSTS"], /,/, (h) => h.toLowerCase());
|
|
1508
|
-
const mergedDirs = [...cli.dirs, ...envDirs];
|
|
1509
|
-
const mergedHosts = [...cli.hosts, ...envHosts];
|
|
1580
|
+
/**
 * Computes the smallest box that encloses every box in `boxes`.
 *
 * Uses a single pass with pairwise Math.min/Math.max instead of spreading the
 * whole array into one call: spreading an unbounded array can exceed the
 * engine's argument limit and throw a RangeError on large inputs.
 *
 * @param {Array<{left: number, bottom: number, right: number, top: number}>} boxes
 * @returns {{left: number, bottom: number, right: number, top: number} | undefined}
 *   the enclosing box, or undefined when `boxes` is empty
 */
var mergeBoundingBoxes2 = (boxes) => {
  if (boxes.length === 0) {
    return undefined;
  }
  // Infinity / -Infinity are the identities of min / max, so the fold yields
  // the same values as Math.min(...values) / Math.max(...values).
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const box of boxes) {
    left = Math.min(left, box.left);
    bottom = Math.min(bottom, box.bottom);
    right = Math.max(right, box.right);
    top = Math.max(top, box.top);
  }
  return { left, bottom, right, top };
};
|
|
1517
|
-
var
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
|
|
1590
|
+
/**
 * Reads a page's text content and returns its non-blank items with positions.
 *
 * Items are skipped when their text is blank, when their transform is missing
 * or has fewer than six entries, or when the x/y entries are undefined.
 * Width falls back to six units per character; height falls back to
 * |transform[3]|. `height` and `bounding_box` are only attached when the
 * resolved height is positive.
 *
 * @param {{ getTextContent: () => Promise<{ items: Array<object> }> }} page
 * @returns {Promise<Array<object>>} positioned text items
 */
var extractTextItemsWithPositions = async (page) => {
  const { items: rawItems } = await page.getTextContent();
  const positioned = [];
  for (const raw of rawItems) {
    const { str, transform } = raw;
    if (str.trim() === "") {
      continue;
    }
    if (!transform || transform.length < 6) {
      continue;
    }
    const x = transform[4];
    const y = transform[5];
    if (x === undefined || y === undefined) {
      continue;
    }
    const width = raw.width ?? str.length * 6;
    const height = raw.height ?? Math.abs(transform[3] ?? 0);
    const entry = { text: str, x, y, width };
    if (height > 0) {
      entry.height = height;
      entry.bounding_box = buildBoundingBox2(x, y, width, height);
    }
    positioned.push(entry);
  }
  return positioned;
};
|
|
1524
|
-
var
|
|
1525
|
-
if (
|
|
1526
|
-
return
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1617
|
+
/**
 * Groups text items into rows by their y coordinate.
 *
 * Items are visited in descending y order. An item joins the current row when
 * its y is within `tolerance` of the y of the item that opened that row;
 * otherwise it opens a new row. Items inside each row are sorted by ascending x.
 * The input array is not mutated.
 *
 * @param {Array<{x: number, y: number}>} items
 * @param {number} [tolerance=Y_TOLERANCE] maximum |y| distance to the row anchor
 * @returns {Array<{y: number, items: Array<object>}>} rows in descending y order
 */
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
  const [head, ...rest] = [...items].sort((a, b) => b.y - a.y);
  if (!head) {
    return [];
  }
  const rows = [{ y: head.y, items: [head] }];
  for (const item of rest) {
    if (!item) {
      continue;
    }
    const openRow = rows[rows.length - 1];
    if (Math.abs(openRow.y - item.y) <= tolerance) {
      openRow.items.push(item);
    } else {
      rows.push({ y: item.y, items: [item] });
    }
  }
  rows.forEach((row) => {
    row.items.sort((a, b) => a.x - b.x);
  });
  return rows;
};
|
|
1541
|
-
var
|
|
1542
|
-
if (
|
|
1543
|
-
return
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1644
|
+
/**
 * Derives column start positions from the x coordinates of all row items.
 *
 * All x values are sorted ascending; the smallest is the first boundary, and
 * every value that lies at least `gapThreshold` after its predecessor starts
 * another column.
 *
 * @param {Array<{items: Array<{x: number}>}>} rows
 * @param {number} [gapThreshold=COLUMN_GAP_THRESHOLD] minimum gap that starts a column
 * @returns {number[]} ascending column boundaries (empty when there are no items)
 */
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
  if (rows.length === 0) {
    return [];
  }
  const xs = [];
  for (const { items } of rows) {
    for (const { x } of items) {
      xs.push(x);
    }
  }
  xs.sort((a, b) => a - b);
  const smallest = xs[0];
  if (smallest === undefined) {
    return [];
  }
  const boundaries = [smallest];
  let previous = smallest;
  for (const current of xs.slice(1)) {
    if (current !== undefined && previous !== undefined && current - previous >= gapThreshold) {
      boundaries.push(current);
    }
    previous = current;
  }
  return boundaries;
};
|
|
1556
|
-
var
|
|
1557
|
-
(
|
|
1558
|
-
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
(a) => a >= 224
|
|
1565
|
-
];
|
|
1566
|
-
var isPrivateIpv4 = (ip) => {
|
|
1567
|
-
const parts = ip.split(".").map((s) => Number.parseInt(s, 10));
|
|
1568
|
-
const a = parts[0];
|
|
1569
|
-
const b = parts[1];
|
|
1570
|
-
if (a === undefined || b === undefined)
|
|
1571
|
-
return true;
|
|
1572
|
-
return PRIVATE_IPV4_PREDICATES.some((pred) => pred(a, b));
|
|
1672
|
+
/**
 * Finds the column an item belongs to.
 *
 * Boundaries are scanned from last to first; the first boundary whose value
 * minus `tolerance` is at or left of the item's x wins. Items that match no
 * boundary are assigned to column 0.
 *
 * @param {{x: number}} item
 * @param {number[]} columnBoundaries ascending column start positions
 * @param {number} [tolerance=COLUMN_GAP_THRESHOLD / 2] slack allowed left of a boundary
 * @returns {number} zero-based column index
 */
var columnIndexForItem = (item, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
  let index = columnBoundaries.length;
  while (index-- > 0) {
    const boundary = columnBoundaries[index];
    if (boundary === undefined) {
      continue;
    }
    if (item.x >= boundary - tolerance) {
      return index;
    }
  }
  return 0;
};
|
|
1574
|
-
var
|
|
1575
|
-
const
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1681
|
+
/**
 * Distributes a row's text items over the table's columns.
 *
 * Each item goes to the column chosen by columnIndexForItem. Texts that land
 * in the same column are joined with a single space, and their bounding boxes
 * are merged. One cell is produced per column boundary, so columns without
 * items yield an empty-string cell.
 *
 * @param {{items: Array<{text: string, x: number, bounding_box?: object}>}} row
 * @param {number} rowIndex zero-based row index recorded on every cell
 * @param {number[]} columnBoundaries column start positions
 * @returns {{rowValues: string[], cells: Array<object>}}
 */
var assignToTableCells = (row, rowIndex, columnBoundaries) => {
  const buckets = [];
  for (let col = 0; col < columnBoundaries.length; col++) {
    buckets.push({ textParts: [], boundingBoxes: [] });
  }
  for (const item of row.items) {
    const bucket = buckets[columnIndexForItem(item, columnBoundaries)];
    if (!bucket) {
      continue;
    }
    bucket.textParts.push(item.text);
    if (item.bounding_box) {
      bucket.boundingBoxes.push(item.bounding_box);
    }
  }
  const cells = [];
  const rowValues = [];
  buckets.forEach((bucket, colIndex) => {
    const mergedBox = mergeBoundingBoxes2(bucket.boundingBoxes);
    const cell = {
      text: bucket.textParts.join(" "),
      rowIndex,
      colIndex
    };
    if (mergedBox) {
      cell.bounding_box = mergedBox;
    }
    cells.push(cell);
    rowValues.push(cell.text);
  });
  return { rowValues, cells };
};
|
|
1707
|
+
/**
 * Scores how table-like a set of rows is, as a value between 0 and 1.
 *
 * The score averages one check per row (the fraction of columns that hold at
 * least one item) plus, when row spacings exist, one regularity check
 * (1 - stdDev / mean of the vertical distance between consecutive rows,
 * floored at 0).
 *
 * @param {Array<{y: number, items: Array<{x: number}>}>} rows
 * @param {number[]} columnBoundaries column start positions
 * @returns {number} 0 when there are fewer than MIN_ROWS rows or MIN_COLS columns
 */
var calculateConfidence = (rows, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  if (rows.length < MIN_ROWS || columnCount < MIN_COLS) {
    return 0;
  }
  const slack = COLUMN_GAP_THRESHOLD / 2;
  let total = 0;
  let checkCount = 0;
  for (const row of rows) {
    const occupiedColumns = new Set();
    for (const item of row.items) {
      // Scan from the right-most boundary; the first match decides the column.
      let col = columnCount;
      while (col-- > 0) {
        const boundary = columnBoundaries[col];
        if (boundary !== undefined && item.x >= boundary - slack) {
          occupiedColumns.add(col);
          break;
        }
      }
    }
    total += occupiedColumns.size / columnCount;
    checkCount += 1;
  }
  const gaps = [];
  for (let i = 1; i < rows.length; i++) {
    const upper = rows[i - 1];
    const lower = rows[i];
    if (upper && lower) {
      gaps.push(Math.abs(upper.y - lower.y));
    }
  }
  if (gaps.length > 0) {
    const meanGap = gaps.reduce((a, b) => a + b, 0) / gaps.length;
    const variance = gaps.reduce((sum, s) => sum + (s - meanGap) ** 2, 0) / gaps.length;
    const deviation = Math.sqrt(variance);
    total += meanGap > 0 ? Math.max(0, 1 - deviation / meanGap) : 0;
    checkCount += 1;
  }
  if (checkCount === 0) {
    return 0;
  }
  return Math.min(1, total / checkCount);
};
|
|
1598
|
-
var
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
return;
|
|
1747
|
+
/**
 * Finds runs of consecutive rows that look like table rows.
 *
 * Only rows with at least MIN_ROW_ITEMS items are considered. Column
 * boundaries are detected once across all candidate rows. A row extends the
 * current run when at least MIN_COLS - 1 of its items lie within
 * COLUMN_GAP_THRESHOLD of some boundary; any other row ends the run. Runs
 * with at least MIN_ROWS rows become regions.
 *
 * @param {Array<{y: number, items: Array<{x: number}>}>} rows
 * @returns {Array<{rows: Array<object>, columnBoundaries: number[], startY: number, endY: number}>}
 */
var identifyTableRegions = (rows) => {
  const regions = [];
  const candidateRows = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
  if (candidateRows.length < MIN_ROWS) {
    return regions;
  }
  const columnBoundaries = detectColumnBoundaries(candidateRows);
  if (columnBoundaries.length < MIN_COLS) {
    return regions;
  }
  let run = [];
  // Records the current run as a region when it is long enough.
  const commitRun = () => {
    if (run.length < MIN_ROWS) {
      return;
    }
    const firstRow = run[0];
    const lastRow = run[run.length - 1];
    if (firstRow && lastRow) {
      regions.push({
        rows: run,
        columnBoundaries,
        startY: firstRow.y,
        endY: lastRow.y
      });
    }
  };
  const isNearBoundary = (item) => columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);
  for (const row of candidateRows) {
    const alignedCount = row.items.filter(isNearBoundary).length;
    if (alignedCount >= MIN_COLS - 1) {
      run.push(row);
      continue;
    }
    commitRun();
    run = [];
  }
  commitRun();
  return regions;
};
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
/**
 * Error type that carries a numeric error code next to the message.
 * An optional `options.cause` is forwarded to Error when it is truthy.
 */
class PdfError extends Error {
  code;
  constructor(code, message, options) {
    const cause = options?.cause;
    super(message, cause ? { cause } : undefined);
    this.name = "PdfError";
    this.code = code;
  }
}
|
|
1630
|
-
|
|
1631
|
-
// src/utils/pathUtils.ts
|
|
1632
|
-
import fs2 from "node:fs";
|
|
1633
|
-
import path2 from "node:path";
|
|
1634
|
-
var PROJECT_ROOT = process.cwd();
|
|
1635
|
-
var canonicalize = (p) => {
|
|
1794
|
+
/**
 * Detects tables on a single page.
 *
 * Text items are clustered into rows, table regions are identified, and each
 * region is turned into rows and cells. A region is only reported when its
 * confidence is at least 0.3; the reported confidence is rounded to two
 * decimals. Any error is logged as a warning, and the tables collected so far
 * are returned.
 *
 * @param {object} page page object passed on to extractTextItemsWithPositions
 * @param {number} pageNum page number recorded on every table
 * @returns {Promise<Array<object>>} detected tables (possibly empty)
 */
var extractTablesFromPage = async (page, pageNum) => {
  const tables = [];
  try {
    const textItems = await extractTextItemsWithPositions(page);
    if (textItems.length === 0) {
      return tables;
    }
    const regions = identifyTableRegions(clusterByY(textItems));
    for (const [tableIndex, region] of regions.entries()) {
      if (!region) {
        continue;
      }
      const { columnBoundaries } = region;
      const tableRows = [];
      const tableCells = [];
      for (const [rowIndex, row] of region.rows.entries()) {
        if (!row) {
          continue;
        }
        const { rowValues, cells } = assignToTableCells(row, rowIndex, columnBoundaries);
        tableRows.push(rowValues);
        tableCells.push(...cells);
      }
      const confidence = calculateConfidence(region.rows, columnBoundaries);
      const cellBoxes = tableCells.map((cell) => cell.bounding_box).filter((box) => box !== undefined);
      const tableBoundingBox = mergeBoundingBoxes2(cellBoxes);
      if (confidence >= 0.3) {
        const table = {
          page: pageNum,
          tableIndex,
          rows: tableRows,
          cells: tableCells
        };
        if (tableBoundingBox) {
          table.bounding_box = tableBoundingBox;
        }
        table.rowCount = tableRows.length;
        table.colCount = columnBoundaries.length;
        table.confidence = Math.round(confidence * 100) / 100;
        tables.push(table);
      }
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger6.warn("Error extracting tables from page", { pageNum, error: message });
  }
  return tables;
};
|
|
1838
|
+
/**
 * Extracts tables from the given pages of a document, one page after another.
 *
 * A page that cannot be loaded or processed is logged as a warning and
 * skipped; the remaining pages are still processed.
 *
 * @param {{ getPage: (pageNum: number) => Promise<object> }} pdfDocument
 * @param {Iterable<number>} pagesToProcess page numbers to scan
 * @returns {Promise<Array<object>>} tables from all pages, in page order
 */
var extractTables = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  // Loads one page and appends whatever tables it yields.
  const appendTablesOf = async (pageNum) => {
    const page = await pdfDocument.getPage(pageNum);
    const found = await extractTablesFromPage(page, pageNum);
    collected.push(...found);
  };
  for (const pageNum of pagesToProcess) {
    try {
      await appendTablesOf(pageNum);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger6.warn("Error getting page for table extraction", { pageNum, error: message });
    }
  }
  return collected;
};
|
|
1648
|
-
var
|
|
1649
|
-
if (
|
|
1650
|
-
|
|
1852
|
+
/**
 * Renders a table as a Markdown pipe table.
 *
 * The first row is used as the header. Body rows shorter than the header are
 * padded with empty cells. Cell text is trimmed, line breaks are collapsed to
 * a space, and literal `|` characters are backslash-escaped, so cell content
 * cannot break the row or column structure. Empty cells render as one space.
 *
 * @param {{ rows: string[][] }} table
 * @returns {string} Markdown text, or "" when the table has no rows
 */
var tableToMarkdown = (table) => {
  const [headerRow, ...bodyRows] = table.rows;
  if (!headerRow) {
    return "";
  }
  // A raw "|" would start a new column and a raw newline would end the row.
  const renderCell = (cell) => cell.replace(/\r\n|\r|\n/g, " ").trim().replace(/\|/g, "\\|") || " ";
  const renderRow = (cells) => `| ${cells.map(renderCell).join(" | ")} |`;
  const lines = [
    renderRow(headerRow),
    `| ${headerRow.map(() => "---").join(" | ")} |`
  ];
  for (const row of bodyRows) {
    if (!row) {
      continue;
    }
    const paddedRow = [...row];
    while (paddedRow.length < headerRow.length) {
      paddedRow.push("");
    }
    lines.push(renderRow(paddedRow));
  }
  return lines.join("\n");
};
|
|
1874
|
+
/**
 * Renders a list of tables as one Markdown document section.
 *
 * The output starts with an "## Extracted Tables" heading. Each table gets a
 * "### Page N, Table M" heading (M is one-based), a confidence line given as a
 * whole percentage, and the table rendered by tableToMarkdown.
 *
 * @param {Array<{page: number, tableIndex: number, confidence: number, rows: string[][]}>} tables
 * @returns {string} Markdown text, or "" when there are no tables
 */
var tablesToMarkdown = (tables) => {
  if (tables.length === 0) {
    return "";
  }
  const perTable = tables.flatMap((table) => [
    `### Page ${table.page}, Table ${table.tableIndex + 1}`,
    `*Confidence: ${(table.confidence * 100).toFixed(0)}%*`,
    "",
    tableToMarkdown(table),
    ""
  ]);
  return ["## Extracted Tables", "", ...perTable].join("\n");
};
|
|
1661
1888
|
|
|
1662
|
-
// src/pdf/
|
|
1663
|
-
var
|
|
1664
|
-
var
|
|
1665
|
-
var
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
var
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
// Formats a byte count as whole mebibytes with an "MB" suffix (e.g. 1048576 -> "1MB").
var formatBytes = (bytes) => {
  const megabytes = bytes / 1024 / 1024;
  return `${megabytes.toFixed(0)}MB`;
};
|
|
1674
|
-
// Caps a source description at 200 characters; longer text keeps its first 197 characters plus "...".
var sanitizeSourceDescription = (description) => {
  if (description.length > 200) {
    return `${description.slice(0, 197)}...`;
  }
  return description;
};
|
|
1675
|
-
var loadLocalFile = async (userPath) => {
|
|
1676
|
-
const safePath = resolvePath(userPath);
|
|
1677
|
-
let stats;
|
|
1678
|
-
try {
|
|
1679
|
-
stats = await fs3.stat(safePath);
|
|
1680
|
-
} catch (err) {
|
|
1681
|
-
if (typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT") {
|
|
1682
|
-
throw new PdfError(-32600 /* InvalidRequest */, `File not found at '${userPath}'.`, {
|
|
1683
|
-
cause: err instanceof Error ? err : undefined
|
|
1684
|
-
});
|
|
1685
|
-
}
|
|
1686
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Failed to access file at '${userPath}'.`, {
|
|
1687
|
-
cause: err instanceof Error ? err : undefined
|
|
1688
|
-
});
|
|
1889
|
+
// src/pdf/documentModel.ts
|
|
1890
|
+
var DEFAULT_CHUNK_MAX_CHARS = 1800;
|
|
1891
|
+
// Builds an element identifier of the form "p<page>-<type>-<index>".
var buildElementId = (page, type, index) => {
  const pagePart = String(page);
  const indexPart = String(index);
  return `p${pagePart}-${type}-${indexPart}`;
};
|
|
1892
|
+
// Returns a shallow copy of the image record without its `data` property;
// the input object is left untouched.
var imageElementMetadata = ({ data: _data, ...metadata }) => metadata;
|
|
1896
|
+
/**
 * Summarises the heights of a page's text items.
 *
 * Only items of type "text" with non-blank text and a truthy height are
 * counted.
 *
 * @param {Array<{type: string, textContent?: string, height?: number}>} items
 * @returns {{maxHeight: number, medianHeight: number, textItemCount: number}}
 *   all zeros when no item qualifies
 */
var buildPageTextStats = (items) => {
  const heights = [];
  for (const item of items) {
    if (item.type === "text" && item.textContent?.trim() && item.height) {
      heights.push(item.height);
    }
  }
  const count = heights.length;
  if (count === 0) {
    return { maxHeight: 0, medianHeight: 0, textItemCount: 0 };
  }
  heights.sort((a, b) => a - b);
  const mid = Math.floor(count / 2);
  let medianHeight;
  if (count % 2 === 0) {
    // Even count: average the two middle values.
    medianHeight = ((heights[mid - 1] ?? 0) + (heights[mid] ?? 0)) / 2;
  } else {
    medianHeight = heights[mid] ?? 0;
  }
  return {
    maxHeight: heights[count - 1] ?? 0,
    medianHeight,
    textItemCount: count
  };
};
|
|
1909
|
+
/**
 * Assigns a best-effort semantic role to a text item.
 *
 * - Text starting with "-", "*" or a number followed by "." or ")" (then
 *   whitespace) is a list item.
 * - A line of at most 120 characters that does not end in ".", "!" or "?" and
 *   is clearly larger than the page's typical text is a heading. Its level
 *   (1-3) comes from the ratio of its height to the median height.
 * - Everything else is a paragraph.
 *
 * @param {{type: string, textContent?: string, height?: number}} item
 * @param {{maxHeight: number, medianHeight: number, textItemCount: number}} stats
 * @returns {object | undefined} the hint, or undefined for non-text or blank items
 */
var buildSemanticHint = (item, stats) => {
  if (item.type !== "text") {
    return undefined;
  }
  const trimmed = item.textContent?.trim();
  if (!trimmed) {
    return undefined;
  }
  if (/^([-*]\s+|\d+[.)]\s+)/.test(trimmed)) {
    return { role: "list_item", confidence: 0.92, signals: ["list-prefix"] };
  }
  const height = item.height ?? 0;
  const looksLikeTitleLine = trimmed.length <= 120 && !/[.!?]$/.test(trimmed);
  // "Large" means at least 1.3x the median and at least 80% of the page maximum.
  const isLargeText = stats.textItemCount > 1 && height > 0 && stats.medianHeight > 0 && height >= stats.medianHeight * 1.3 && height >= stats.maxHeight * 0.8;
  if (!isLargeText || !looksLikeTitleLine) {
    return { role: "paragraph", confidence: 0.5, signals: ["default-text"] };
  }
  const ratio = height / stats.medianHeight;
  let level = 3;
  if (ratio >= 1.8) {
    level = 1;
  } else if (ratio >= 1.55) {
    level = 2;
  }
  return {
    role: "heading",
    level,
    confidence: 0.78,
    signals: ["larger-text", "short-line"]
  };
};
|
|
1699
|
-
var
|
|
1700
|
-
if (
|
|
1701
|
-
|
|
1702
|
-
|
|
1940
|
+
var contentItemToElement = (item, page, index, semanticHint) => {
|
|
1941
|
+
if (item.type === "text" && item.textContent?.trim()) {
|
|
1942
|
+
return {
|
|
1943
|
+
id: buildElementId(page, "text", index),
|
|
1944
|
+
type: "text",
|
|
1945
|
+
page,
|
|
1946
|
+
content: item.textContent,
|
|
1947
|
+
bounding_box: item.bounding_box,
|
|
1948
|
+
provenance: {
|
|
1949
|
+
engine: "pdfjs",
|
|
1950
|
+
source: "text-content"
|
|
1951
|
+
},
|
|
1952
|
+
...semanticHint ? { semantic_hint: semanticHint } : {}
|
|
1953
|
+
};
|
|
1703
1954
|
}
|
|
1704
|
-
if (
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
}
|
|
1955
|
+
if (item.type === "image" && item.imageData) {
|
|
1956
|
+
return {
|
|
1957
|
+
id: buildElementId(page, "image", index),
|
|
1958
|
+
type: "image",
|
|
1959
|
+
page,
|
|
1960
|
+
image: imageElementMetadata(item.imageData),
|
|
1961
|
+
bounding_box: item.bounding_box,
|
|
1962
|
+
provenance: {
|
|
1963
|
+
engine: "pdfjs",
|
|
1964
|
+
source: "image-xobject"
|
|
1965
|
+
}
|
|
1966
|
+
};
|
|
1717
1967
|
}
|
|
1968
|
+
return;
|
|
1718
1969
|
};
|
|
1719
|
-
var
|
|
1720
|
-
|
|
1721
|
-
const
|
|
1722
|
-
const
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
}
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Remote PDF exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)} (Content-Length: ${formatBytes(declared)}).`);
|
|
1746
|
-
}
|
|
1747
|
-
}
|
|
1748
|
-
if (!response.body) {
|
|
1749
|
-
const ab = await response.arrayBuffer();
|
|
1750
|
-
if (ab.byteLength > MAX_PDF_SIZE) {
|
|
1751
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Remote PDF exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)}.`);
|
|
1752
|
-
}
|
|
1753
|
-
return new Uint8Array(ab);
|
|
1754
|
-
}
|
|
1755
|
-
const reader = response.body.getReader();
|
|
1756
|
-
const chunks = [];
|
|
1757
|
-
let total = 0;
|
|
1758
|
-
while (true) {
|
|
1759
|
-
const { done, value } = await reader.read();
|
|
1760
|
-
if (done)
|
|
1761
|
-
break;
|
|
1762
|
-
if (value) {
|
|
1763
|
-
total += value.byteLength;
|
|
1764
|
-
if (total > MAX_PDF_SIZE) {
|
|
1765
|
-
await reader.cancel().catch(() => {});
|
|
1766
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Remote PDF exceeds maximum size of ${formatBytes(MAX_PDF_SIZE)} during streaming.`);
|
|
1767
|
-
}
|
|
1768
|
-
chunks.push(value);
|
|
1769
|
-
}
|
|
1970
|
+
var buildStructuredElements = (pageContents, tables, includeSemanticHints) => {
|
|
1971
|
+
const elements = [];
|
|
1972
|
+
const tablesByPage = new Map;
|
|
1973
|
+
for (const table of tables ?? []) {
|
|
1974
|
+
const pageTables = tablesByPage.get(table.page) ?? [];
|
|
1975
|
+
pageTables.push(table);
|
|
1976
|
+
tablesByPage.set(table.page, pageTables);
|
|
1977
|
+
}
|
|
1978
|
+
const appendTableElement = (table) => {
|
|
1979
|
+
elements.push({
|
|
1980
|
+
id: buildElementId(table.page, "table", table.tableIndex + 1),
|
|
1981
|
+
type: "table",
|
|
1982
|
+
page: table.page,
|
|
1983
|
+
table: {
|
|
1984
|
+
rows: table.rows,
|
|
1985
|
+
...table.cells ? { cells: table.cells } : {},
|
|
1986
|
+
...table.bounding_box ? { bounding_box: table.bounding_box } : {},
|
|
1987
|
+
rowCount: table.rowCount,
|
|
1988
|
+
colCount: table.colCount,
|
|
1989
|
+
confidence: table.confidence
|
|
1990
|
+
},
|
|
1991
|
+
bounding_box: table.bounding_box,
|
|
1992
|
+
confidence: table.confidence,
|
|
1993
|
+
provenance: {
|
|
1994
|
+
engine: "pdfjs",
|
|
1995
|
+
source: "table-detector"
|
|
1770
1996
|
}
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1997
|
+
});
|
|
1998
|
+
};
|
|
1999
|
+
for (const pageContent of pageContents) {
|
|
2000
|
+
const stats = includeSemanticHints ? buildPageTextStats(pageContent.items) : undefined;
|
|
2001
|
+
let elementIndex = 1;
|
|
2002
|
+
for (const item of pageContent.items) {
|
|
2003
|
+
const semanticHint = stats ? buildSemanticHint(item, stats) : undefined;
|
|
2004
|
+
const element = contentItemToElement(item, pageContent.page, elementIndex, semanticHint);
|
|
2005
|
+
if (element) {
|
|
2006
|
+
elements.push(element);
|
|
2007
|
+
elementIndex++;
|
|
1776
2008
|
}
|
|
1777
|
-
return combined;
|
|
1778
2009
|
}
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
2010
|
+
const pageTables = tablesByPage.get(pageContent.page);
|
|
2011
|
+
if (pageTables) {
|
|
2012
|
+
for (const table of pageTables.sort((a, b) => a.tableIndex - b.tableIndex)) {
|
|
2013
|
+
appendTableElement(table);
|
|
2014
|
+
}
|
|
2015
|
+
tablesByPage.delete(pageContent.page);
|
|
1785
2016
|
}
|
|
1786
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
1787
|
-
logger4.warn("URL fetch failed", { url, error: message });
|
|
1788
|
-
throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed for '${url}'.`, {
|
|
1789
|
-
cause: err instanceof Error ? err : undefined
|
|
1790
|
-
});
|
|
1791
|
-
} finally {
|
|
1792
|
-
clearTimeout(timeout);
|
|
1793
2017
|
}
|
|
2018
|
+
const remainingTables = Array.from(tablesByPage.values()).flat().sort((a, b) => a.page - b.page || a.tableIndex - b.tableIndex);
|
|
2019
|
+
for (const table of remainingTables) {
|
|
2020
|
+
appendTableElement(table);
|
|
2021
|
+
}
|
|
2022
|
+
return elements;
|
|
1794
2023
|
};
|
|
1795
|
-
var
|
|
1796
|
-
const
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
throw new PdfError(-32602 /* InvalidParams */, `Source ${safeSource} missing 'path' or 'url'.`);
|
|
1806
|
-
}
|
|
1807
|
-
} catch (err) {
|
|
1808
|
-
if (err instanceof PdfError) {
|
|
1809
|
-
throw err;
|
|
2024
|
+
var renderMarkdownFromPageContents = (pageContents, tables) => {
|
|
2025
|
+
const sections = [];
|
|
2026
|
+
for (const pageContent of pageContents) {
|
|
2027
|
+
const pageLines = [`## Page ${String(pageContent.page)}`, ""];
|
|
2028
|
+
for (const item of pageContent.items) {
|
|
2029
|
+
if (item.type === "text" && item.textContent?.trim()) {
|
|
2030
|
+
pageLines.push(item.textContent.trim(), "");
|
|
2031
|
+
} else if (item.type === "image" && item.imageData) {
|
|
2032
|
+
pageLines.push(`[Image ${String(item.imageData.index + 1)}: ${String(item.imageData.width)}x${String(item.imageData.height)} ${item.imageData.format}]`, "");
|
|
2033
|
+
}
|
|
1810
2034
|
}
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
sourceDescription: safeSource,
|
|
1814
|
-
error: message
|
|
1815
|
-
});
|
|
1816
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Failed to prepare PDF source ${safeSource}.`, {
|
|
1817
|
-
cause: err instanceof Error ? err : undefined
|
|
1818
|
-
});
|
|
2035
|
+
sections.push(pageLines.join(`
|
|
2036
|
+
`).trimEnd());
|
|
1819
2037
|
}
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
|
|
2038
|
+
if (tables && tables.length > 0) {
|
|
2039
|
+
sections.push(tablesToMarkdown(tables));
|
|
2040
|
+
}
|
|
2041
|
+
return sections.join(`
|
|
2042
|
+
|
|
2043
|
+
`).trim();
|
|
2044
|
+
};
|
|
2045
|
+
/**
 * Escapes the five HTML-significant characters (&, <, >, " and ') so that a
 * string can be interpolated into generated HTML markup as literal text.
 * The ampersand is replaced first; otherwise the "&" introduced by the later
 * replacements would itself be escaped a second time.
 * @param {string} value - Raw text to escape.
 * @returns {string} The text with each special character replaced by its entity.
 */
var escapeHtml = (value) => value.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
|
|
2046
|
+
var renderTablesToHtml = (tables) => {
|
|
2047
|
+
if (!tables || tables.length === 0)
|
|
2048
|
+
return [];
|
|
2049
|
+
return tables.map((table) => {
|
|
2050
|
+
const rows = table.rows.map((row) => {
|
|
2051
|
+
const cells = row.map((cell) => `<td>${escapeHtml(cell)}</td>`).join("");
|
|
2052
|
+
return `<tr>${cells}</tr>`;
|
|
2053
|
+
}).join(`
|
|
2054
|
+
`);
|
|
2055
|
+
return [
|
|
2056
|
+
`<table data-page="${String(table.page)}" data-table-index="${String(table.tableIndex)}">`,
|
|
2057
|
+
"<tbody>",
|
|
2058
|
+
rows,
|
|
2059
|
+
"</tbody>",
|
|
2060
|
+
"</table>"
|
|
2061
|
+
].join(`
|
|
2062
|
+
`);
|
|
2063
|
+
});
|
|
2064
|
+
};
|
|
2065
|
+
var renderHtmlFromPageContents = (pageContents, tables) => {
|
|
2066
|
+
const sections = pageContents.map((pageContent) => {
|
|
2067
|
+
const body = [
|
|
2068
|
+
`<section data-page="${String(pageContent.page)}">`,
|
|
2069
|
+
`<h2>Page ${String(pageContent.page)}</h2>`
|
|
2070
|
+
];
|
|
2071
|
+
for (const item of pageContent.items) {
|
|
2072
|
+
if (item.type === "text" && item.textContent?.trim()) {
|
|
2073
|
+
body.push(`<p>${escapeHtml(item.textContent.trim())}</p>`);
|
|
2074
|
+
} else if (item.type === "image" && item.imageData) {
|
|
2075
|
+
body.push([
|
|
2076
|
+
`<figure data-image-index="${String(item.imageData.index)}">`,
|
|
2077
|
+
`<figcaption>Image ${String(item.imageData.index + 1)}: ${String(item.imageData.width)}x${String(item.imageData.height)} ${escapeHtml(item.imageData.format)}</figcaption>`,
|
|
2078
|
+
"</figure>"
|
|
2079
|
+
].join(`
|
|
2080
|
+
`));
|
|
2081
|
+
}
|
|
2082
|
+
}
|
|
2083
|
+
body.push("</section>");
|
|
2084
|
+
return body.join(`
|
|
2085
|
+
`);
|
|
1827
2086
|
});
|
|
1828
|
-
|
|
1829
|
-
return await loadingTask.promise;
|
|
1830
|
-
} catch (err) {
|
|
1831
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
1832
|
-
logger4.error("PDF.js loading error", { sourceDescription: safeSource, error: message });
|
|
1833
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${safeSource}.`, { cause: err instanceof Error ? err : undefined });
|
|
1834
|
-
}
|
|
1835
|
-
};
|
|
2087
|
+
return [...sections, ...renderTablesToHtml(tables)].join(`
|
|
1836
2088
|
|
|
1837
|
-
|
|
1838
|
-
var logger5 = createLogger("Parser");
|
|
1839
|
-
var MAX_RANGE_SIZE = 1e4;
|
|
1840
|
-
var parseRangePart = (part, pages) => {
|
|
1841
|
-
const trimmedPart = part.trim();
|
|
1842
|
-
if (trimmedPart.includes("-")) {
|
|
1843
|
-
const splitResult = trimmedPart.split("-");
|
|
1844
|
-
const startStr = splitResult[0] || "";
|
|
1845
|
-
const endStr = splitResult[1];
|
|
1846
|
-
const start = parseInt(startStr, 10);
|
|
1847
|
-
const end = endStr === "" || endStr === undefined ? Infinity : parseInt(endStr, 10);
|
|
1848
|
-
if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
|
|
1849
|
-
throw new Error(`Invalid page range values: ${trimmedPart}`);
|
|
1850
|
-
}
|
|
1851
|
-
const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
|
|
1852
|
-
for (let i = start;i <= practicalEnd; i++) {
|
|
1853
|
-
pages.add(i);
|
|
1854
|
-
}
|
|
1855
|
-
if (end === Infinity && practicalEnd === start + MAX_RANGE_SIZE) {
|
|
1856
|
-
logger5.warn("Open-ended range truncated", { start, practicalEnd });
|
|
1857
|
-
}
|
|
1858
|
-
} else {
|
|
1859
|
-
const page = parseInt(trimmedPart, 10);
|
|
1860
|
-
if (Number.isNaN(page) || page <= 0) {
|
|
1861
|
-
throw new Error(`Invalid page number: ${trimmedPart}`);
|
|
1862
|
-
}
|
|
1863
|
-
pages.add(page);
|
|
1864
|
-
}
|
|
2089
|
+
`).trim();
|
|
1865
2090
|
};
|
|
1866
|
-
var
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
2091
|
+
var elementText = (element) => {
|
|
2092
|
+
if (element.type === "text")
|
|
2093
|
+
return element.content.trim();
|
|
2094
|
+
if (element.type === "table") {
|
|
2095
|
+
const tableText = element.table.rows.map((row) => row.join(" | ")).join(`
|
|
2096
|
+
`).trim();
|
|
2097
|
+
return tableText.length > 0 ? tableText : undefined;
|
|
1871
2098
|
}
|
|
1872
|
-
|
|
1873
|
-
|
|
2099
|
+
return;
|
|
2100
|
+
};
|
|
2101
|
+
/**
 * Returns the semantic role hinted for a text element.
 * Non-text elements, and text elements carrying no semantic hint, yield undefined.
 */
var elementRole = (element) => {
  if (element.type !== "text") {
    return undefined;
  }
  return element.semantic_hint?.role;
};
|
|
2102
|
+
/**
 * Measures the text accumulated in a chunk draft: the length of every entry
 * in draft.textParts plus one extra character per entry.
 */
var chunkTextLength = (draft) => {
  let total = 0;
  for (const part of draft.textParts) {
    total += part.length + 1;
  }
  return total;
};
|
|
2103
|
+
var createChunkDraft = (element, strategy, heading) => ({
|
|
2104
|
+
pageStart: element.page,
|
|
2105
|
+
pageEnd: element.page,
|
|
2106
|
+
textParts: [],
|
|
2107
|
+
elementIds: [],
|
|
2108
|
+
boundingBoxes: [],
|
|
2109
|
+
strategy,
|
|
2110
|
+
heading
|
|
2111
|
+
});
|
|
2112
|
+
var addElementToChunk = (draft, element, textValue) => {
|
|
2113
|
+
draft.pageEnd = Math.max(draft.pageEnd, element.page);
|
|
2114
|
+
draft.textParts.push(textValue);
|
|
2115
|
+
draft.elementIds.push(element.id);
|
|
2116
|
+
if (element.bounding_box) {
|
|
2117
|
+
draft.boundingBoxes.push(element.bounding_box);
|
|
1874
2118
|
}
|
|
1875
|
-
return Array.from(pages).sort((a, b) => a - b);
|
|
1876
2119
|
};
|
|
1877
|
-
var
|
|
1878
|
-
|
|
2120
|
+
var finalizeChunk = (draft, index) => {
|
|
2121
|
+
const textValue = draft.textParts.join(`
|
|
2122
|
+
`).trim();
|
|
2123
|
+
if (!textValue)
|
|
1879
2124
|
return;
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
2125
|
+
return {
|
|
2126
|
+
id: draft.pageStart === draft.pageEnd ? `p${String(draft.pageStart)}-chunk-${String(index)}` : `p${String(draft.pageStart)}-p${String(draft.pageEnd)}-chunk-${String(index)}`,
|
|
2127
|
+
page_start: draft.pageStart,
|
|
2128
|
+
page_end: draft.pageEnd,
|
|
2129
|
+
text: textValue,
|
|
2130
|
+
element_ids: draft.elementIds,
|
|
2131
|
+
strategy: draft.strategy,
|
|
2132
|
+
...draft.heading ? { heading: draft.heading } : {},
|
|
2133
|
+
...draft.boundingBoxes.length > 0 ? { bounding_boxes: draft.boundingBoxes } : {}
|
|
2134
|
+
};
|
|
2135
|
+
};
|
|
2136
|
+
var buildCitationChunks = (elements, options) => {
|
|
2137
|
+
const maxChars = options.maxChars ?? DEFAULT_CHUNK_MAX_CHARS;
|
|
2138
|
+
const chunks = [];
|
|
2139
|
+
let current;
|
|
2140
|
+
const pushCurrent = () => {
|
|
2141
|
+
if (!current)
|
|
2142
|
+
return;
|
|
2143
|
+
const chunk = finalizeChunk(current, chunks.length + 1);
|
|
2144
|
+
if (chunk)
|
|
2145
|
+
chunks.push(chunk);
|
|
2146
|
+
current = undefined;
|
|
2147
|
+
};
|
|
2148
|
+
for (const element of elements) {
|
|
2149
|
+
const textValue = elementText(element);
|
|
2150
|
+
if (!textValue)
|
|
2151
|
+
continue;
|
|
2152
|
+
const role = elementRole(element);
|
|
2153
|
+
const shouldStartSemanticChunk = options.useSemanticBoundaries && role === "heading";
|
|
2154
|
+
const shouldStartTableChunk = element.type === "table";
|
|
2155
|
+
const exceedsSize = current !== undefined && current.elementIds.length > 0 && chunkTextLength(current) + textValue.length > maxChars;
|
|
2156
|
+
const crossesPage = current !== undefined && current.pageEnd !== element.page;
|
|
2157
|
+
if (shouldStartSemanticChunk || shouldStartTableChunk || exceedsSize || crossesPage) {
|
|
2158
|
+
pushCurrent();
|
|
1884
2159
|
}
|
|
1885
|
-
if (
|
|
1886
|
-
|
|
2160
|
+
if (!current) {
|
|
2161
|
+
const strategy = shouldStartSemanticChunk ? "semantic" : exceedsSize ? "size" : "page";
|
|
2162
|
+
const heading = shouldStartSemanticChunk && element.type === "text" ? element.content.trim() : undefined;
|
|
2163
|
+
current = createChunkDraft(element, strategy, heading);
|
|
1887
2164
|
}
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
2165
|
+
if (element.type === "table" && current.elementIds.length === 0) {
|
|
2166
|
+
current.strategy = "table";
|
|
2167
|
+
}
|
|
2168
|
+
addElementToChunk(current, element, textValue);
|
|
2169
|
+
if (element.type === "table") {
|
|
2170
|
+
pushCurrent();
|
|
1891
2171
|
}
|
|
1892
|
-
return uniquePages;
|
|
1893
|
-
} catch (error) {
|
|
1894
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1895
|
-
throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${message}`);
|
|
1896
2172
|
}
|
|
2173
|
+
pushCurrent();
|
|
2174
|
+
return chunks;
|
|
1897
2175
|
};
|
|
1898
|
-
var
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
2176
|
+
var PROMPT_INJECTION_PATTERNS = [
|
|
2177
|
+
/\bignore (all )?(previous|prior|above) instructions\b/i,
|
|
2178
|
+
/\bdisregard (previous|prior|above) instructions\b/i,
|
|
2179
|
+
/\bsystem prompt\b/i,
|
|
2180
|
+
/\bdeveloper (message|instruction)s?\b/i,
|
|
2181
|
+
/\bdo not (follow|obey) .*instructions\b/i
|
|
2182
|
+
];
|
|
2183
|
+
var snippetFromText = (value) => {
|
|
2184
|
+
const normalized = value.replace(/\s+/g, " ").trim();
|
|
2185
|
+
return normalized.length > 160 ? `${normalized.slice(0, 157)}...` : normalized;
|
|
2186
|
+
};
|
|
2187
|
+
var isOutsideViewBox = (box, viewBox) => {
|
|
2188
|
+
if (!box || !viewBox)
|
|
2189
|
+
return false;
|
|
2190
|
+
const tolerance = 1;
|
|
2191
|
+
return box.right < viewBox.left - tolerance || box.left > viewBox.right + tolerance || box.top < viewBox.bottom - tolerance || box.bottom > viewBox.top + tolerance;
|
|
2192
|
+
};
|
|
2193
|
+
var buildSafetyFindings = (pageContents, pageGeometry) => {
|
|
2194
|
+
const findings = [];
|
|
2195
|
+
const geometryByPage = new Map(pageGeometry?.map((geometry) => [geometry.page, geometry]));
|
|
2196
|
+
for (const pageContent of pageContents) {
|
|
2197
|
+
let elementIndex = 1;
|
|
2198
|
+
const geometry = geometryByPage.get(pageContent.page);
|
|
2199
|
+
for (const item of pageContent.items) {
|
|
2200
|
+
const element = contentItemToElement(item, pageContent.page, elementIndex);
|
|
2201
|
+
if (!element) {
|
|
2202
|
+
continue;
|
|
2203
|
+
}
|
|
2204
|
+
if (element.type === "text") {
|
|
2205
|
+
const textContent = element.content.trim();
|
|
2206
|
+
const snippet = snippetFromText(textContent);
|
|
2207
|
+
if (PROMPT_INJECTION_PATTERNS.some((pattern) => pattern.test(textContent))) {
|
|
2208
|
+
findings.push({
|
|
2209
|
+
type: "prompt_injection_pattern",
|
|
2210
|
+
severity: "high",
|
|
2211
|
+
page: pageContent.page,
|
|
2212
|
+
element_id: element.id,
|
|
2213
|
+
message: "Text matches a common prompt-injection instruction pattern.",
|
|
2214
|
+
snippet,
|
|
2215
|
+
...element.bounding_box ? { bounding_box: element.bounding_box } : {}
|
|
2216
|
+
});
|
|
2217
|
+
}
|
|
2218
|
+
if (item.height !== undefined && item.height > 0 && item.height < 2) {
|
|
2219
|
+
findings.push({
|
|
2220
|
+
type: "tiny_text",
|
|
2221
|
+
severity: "medium",
|
|
2222
|
+
page: pageContent.page,
|
|
2223
|
+
element_id: element.id,
|
|
2224
|
+
message: "Text is unusually small and may be hidden, decorative, or extraction noise.",
|
|
2225
|
+
snippet,
|
|
2226
|
+
...element.bounding_box ? { bounding_box: element.bounding_box } : {}
|
|
2227
|
+
});
|
|
2228
|
+
}
|
|
2229
|
+
if (isOutsideViewBox(element.bounding_box, geometry?.view_box)) {
|
|
2230
|
+
findings.push({
|
|
2231
|
+
type: "off_page_text",
|
|
2232
|
+
severity: "medium",
|
|
2233
|
+
page: pageContent.page,
|
|
2234
|
+
element_id: element.id,
|
|
2235
|
+
message: "Text bounding box falls outside the PDF page view box.",
|
|
2236
|
+
snippet,
|
|
2237
|
+
...element.bounding_box ? { bounding_box: element.bounding_box } : {}
|
|
2238
|
+
});
|
|
2239
|
+
}
|
|
2240
|
+
}
|
|
2241
|
+
elementIndex++;
|
|
2242
|
+
}
|
|
1907
2243
|
}
|
|
1908
|
-
return
|
|
2244
|
+
return findings;
|
|
1909
2245
|
};
|
|
1910
2246
|
|
|
1911
|
-
// src/schemas/readPdf.ts
|
|
1912
|
-
import {
|
|
1913
|
-
array,
|
|
1914
|
-
bool,
|
|
1915
|
-
description,
|
|
1916
|
-
gte,
|
|
1917
|
-
int,
|
|
1918
|
-
min,
|
|
1919
|
-
num,
|
|
1920
|
-
object,
|
|
1921
|
-
optional,
|
|
1922
|
-
str,
|
|
1923
|
-
union
|
|
1924
|
-
} from "@sylphx/vex";
|
|
1925
|
-
var pageSpecifierSchema = union(array(num(int, gte(1))), str(min(1)));
|
|
1926
|
-
var pdfSourceSchema = object({
|
|
1927
|
-
path: optional(str(min(1), description("Path to the local PDF file (absolute or relative to cwd)."))),
|
|
1928
|
-
url: optional(str(min(1), description("URL of the PDF file."))),
|
|
1929
|
-
pages: optional(pageSpecifierSchema)
|
|
1930
|
-
});
|
|
1931
|
-
var readPdfArgsSchema = object({
|
|
1932
|
-
sources: array(pdfSourceSchema),
|
|
1933
|
-
include_full_text: optional(bool(description("Include the full text content of each PDF (only if 'pages' is not specified for that source)."))),
|
|
1934
|
-
include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
|
|
1935
|
-
include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
|
|
1936
|
-
include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
|
|
1937
|
-
include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures."))),
|
|
1938
|
-
include_elements: optional(bool(description("Include agent-ready structured document elements with page numbers, stable IDs, provenance, and best-effort bounding boxes."))),
|
|
1939
|
-
include_semantic_hints: optional(bool(description("Include deterministic semantic hints on text elements, such as heading, list item, or paragraph."))),
|
|
1940
|
-
include_markdown: optional(bool(description("Include a Markdown rendering of extracted pages for RAG, summarization, and agent context."))),
|
|
1941
|
-
include_html: optional(bool(description("Include a simple HTML rendering of extracted pages for preview, export, and downstream conversion."))),
|
|
1942
|
-
include_chunks: optional(bool(description("Include page-level citation-ready chunks with text, element IDs, page ranges, and best-effort bounding boxes."))),
|
|
1943
|
-
include_outline: optional(bool(description("Include document outline/bookmark entries when the PDF exposes them."))),
|
|
1944
|
-
include_annotations: optional(bool(description("Include page annotations such as links, notes, and form-related annotations with safe summary fields."))),
|
|
1945
|
-
include_page_labels: optional(bool(description("Include PDF page labels when available, such as roman numerals or section labels."))),
|
|
1946
|
-
include_page_geometry: optional(bool(description("Include page viewport geometry such as width, height, rotation, user unit, and view box."))),
|
|
1947
|
-
include_permissions: optional(bool(description("Include PDF permission and marking signals when exposed by the parser."))),
|
|
1948
|
-
include_form_fields: optional(bool(description("Include PDF form field summaries when AcroForm fields are exposed."))),
|
|
1949
|
-
include_attachments: optional(bool(description("Include embedded attachment metadata such as filename and size. Attachment bytes are not returned."))),
|
|
1950
|
-
include_structure_tree: optional(bool(description("Include best-effort tagged PDF structure trees for selected pages when the PDF exposes them."))),
|
|
1951
|
-
include_safety_findings: optional(bool(description("Include deterministic content safety findings for prompt-injection patterns, tiny text, and off-page text.")))
|
|
1952
|
-
});
|
|
1953
|
-
|
|
1954
2247
|
// src/handlers/readPdf.ts
|
|
1955
|
-
var
|
|
2248
|
+
var logger7 = createLogger("ReadPdf");
|
|
1956
2249
|
var processSingleSource = async (source, options) => {
|
|
1957
2250
|
const sourceDescription = source.path ?? source.url ?? "unknown source";
|
|
1958
2251
|
let individualResult = { source: sourceDescription, success: false };
|
|
@@ -2071,7 +2364,7 @@ var processSingleSource = async (source, options) => {
|
|
|
2071
2364
|
errorMessage = error.message;
|
|
2072
2365
|
} else {
|
|
2073
2366
|
const detail = error instanceof Error ? error.message : String(error);
|
|
2074
|
-
|
|
2367
|
+
logger7.error("Unexpected error processing PDF source", {
|
|
2075
2368
|
sourceDescription,
|
|
2076
2369
|
error: detail
|
|
2077
2370
|
});
|
|
@@ -2087,13 +2380,13 @@ var processSingleSource = async (source, options) => {
|
|
|
2087
2380
|
await loadingTask.destroy();
|
|
2088
2381
|
} catch (destroyError) {
|
|
2089
2382
|
const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
|
|
2090
|
-
|
|
2383
|
+
logger7.warn("Error destroying PDF document", { sourceDescription, error: message });
|
|
2091
2384
|
}
|
|
2092
2385
|
}
|
|
2093
2386
|
}
|
|
2094
2387
|
return individualResult;
|
|
2095
2388
|
};
|
|
2096
|
-
var readPdf =
|
|
2389
|
+
var readPdf = tool2().description("Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.").input(readPdfArgsSchema).handler(async ({ input }) => {
|
|
2097
2390
|
const {
|
|
2098
2391
|
sources,
|
|
2099
2392
|
include_full_text,
|
|
@@ -2116,7 +2409,7 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
2116
2409
|
include_structure_tree,
|
|
2117
2410
|
include_safety_findings
|
|
2118
2411
|
} = input;
|
|
2119
|
-
const
|
|
2412
|
+
const MAX_CONCURRENT_SOURCES2 = 3;
|
|
2120
2413
|
const results = [];
|
|
2121
2414
|
const options = {
|
|
2122
2415
|
includeFullText: include_full_text ?? false,
|
|
@@ -2139,15 +2432,15 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
2139
2432
|
includeStructureTree: include_structure_tree ?? false,
|
|
2140
2433
|
includeSafetyFindings: include_safety_findings ?? false
|
|
2141
2434
|
};
|
|
2142
|
-
for (let i = 0;i < sources.length; i +=
|
|
2143
|
-
const batch = sources.slice(i, i +
|
|
2435
|
+
for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES2) {
|
|
2436
|
+
const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES2);
|
|
2144
2437
|
const batchResults = await Promise.all(batch.map((source) => processSingleSource(source, options)));
|
|
2145
2438
|
results.push(...batchResults);
|
|
2146
2439
|
}
|
|
2147
2440
|
const allFailed = results.every((r) => !r.success);
|
|
2148
2441
|
if (allFailed) {
|
|
2149
2442
|
const errorMessages = results.map((r) => r.error).join("; ");
|
|
2150
|
-
return
|
|
2443
|
+
return toolError2(`All PDF sources failed to process: ${errorMessages}`);
|
|
2151
2444
|
}
|
|
2152
2445
|
const content = [];
|
|
2153
2446
|
const resultsForJson = results.map((result) => {
|
|
@@ -2178,7 +2471,7 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
2178
2471
|
}
|
|
2179
2472
|
return result;
|
|
2180
2473
|
});
|
|
2181
|
-
content.push(
|
|
2474
|
+
content.push(text2(JSON.stringify({ results: resultsForJson }, null, 2)));
|
|
2182
2475
|
for (const result of results) {
|
|
2183
2476
|
if (!result.success || !result.data?.page_contents)
|
|
2184
2477
|
continue;
|
|
@@ -2193,7 +2486,7 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
2193
2486
|
}
|
|
2194
2487
|
}
|
|
2195
2488
|
if (pageTextParts.length > 0) {
|
|
2196
|
-
content.push(
|
|
2489
|
+
content.push(text2(`[Page ${pageContent.page}]
|
|
2197
2490
|
${pageTextParts.join(`
|
|
2198
2491
|
`)}`));
|
|
2199
2492
|
}
|
|
@@ -2211,13 +2504,15 @@ ${pageTextParts.join(`
|
|
|
2211
2504
|
}
|
|
2212
2505
|
if (allTables.length > 0) {
|
|
2213
2506
|
const markdownTables = tablesToMarkdown(allTables);
|
|
2214
|
-
content.push(
|
|
2507
|
+
content.push(text2(markdownTables));
|
|
2215
2508
|
}
|
|
2216
2509
|
}
|
|
2217
2510
|
return content;
|
|
2218
2511
|
});
|
|
2219
2512
|
|
|
2220
2513
|
// src/index.ts
|
|
2514
|
+
var require3 = createRequire2(import.meta.url);
|
|
2515
|
+
var packageJson = require3("../package.json");
|
|
2221
2516
|
var transportType = process.env["MCP_TRANSPORT"] ?? "stdio";
|
|
2222
2517
|
var httpPort = Number.parseInt(process.env["MCP_HTTP_PORT"] ?? "8080", 10);
|
|
2223
2518
|
var httpHost = process.env["MCP_HTTP_HOST"] ?? "0.0.0.0";
|
|
@@ -2235,9 +2530,9 @@ function createTransport() {
|
|
|
2235
2530
|
}
|
|
2236
2531
|
var server = createServer({
|
|
2237
2532
|
name: "pdf-reader-mcp",
|
|
2238
|
-
version:
|
|
2239
|
-
instructions: "MCP Server for
|
|
2240
|
-
tools: { read_pdf: readPdf },
|
|
2533
|
+
version: packageJson.version,
|
|
2534
|
+
instructions: "MCP Server for inspecting PDF files and extracting text, metadata, images, citations, safety signals, and agent-ready document structure.",
|
|
2535
|
+
tools: { inspect_pdf: inspectPdf, read_pdf: readPdf },
|
|
2241
2536
|
transport: createTransport()
|
|
2242
2537
|
});
|
|
2243
2538
|
async function main() {
|