@sylphx/pdf-reader-mcp 2.4.3 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -32
- package/dist/index.js +1514 -555
- package/package.json +5 -4
package/dist/index.js
CHANGED
|
@@ -6,10 +6,6 @@ import { createServer, http, stdio } from "@sylphx/mcp-server-sdk";
|
|
|
6
6
|
// src/handlers/readPdf.ts
|
|
7
7
|
import { image, text, tool, toolError } from "@sylphx/mcp-server-sdk";
|
|
8
8
|
|
|
9
|
-
// src/pdf/extractor.ts
|
|
10
|
-
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
11
|
-
import { PNG } from "pngjs";
|
|
12
|
-
|
|
13
9
|
// src/utils/logger.ts
|
|
14
10
|
class Logger {
|
|
15
11
|
prefix;
|
|
@@ -87,182 +83,1308 @@ var createLogger = (component, minLevel) => {
|
|
|
87
83
|
};
|
|
88
84
|
var logger = new Logger("", 2 /* WARN */);
|
|
89
85
|
|
|
90
|
-
// src/pdf/
|
|
91
|
-
var logger2 = createLogger("
|
|
92
|
-
var
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
png.data[dstIdx] = pixelData[srcIdx] ?? 0;
|
|
101
|
-
png.data[dstIdx + 1] = pixelData[srcIdx + 1] ?? 0;
|
|
102
|
-
png.data[dstIdx + 2] = pixelData[srcIdx + 2] ?? 0;
|
|
103
|
-
png.data[dstIdx + 3] = 255;
|
|
104
|
-
}
|
|
105
|
-
} else if (channels === 1) {
|
|
106
|
-
for (let i = 0;i < width * height; i++) {
|
|
107
|
-
const gray = pixelData[i] ?? 0;
|
|
108
|
-
const dstIdx = i * 4;
|
|
109
|
-
png.data[dstIdx] = gray;
|
|
110
|
-
png.data[dstIdx + 1] = gray;
|
|
111
|
-
png.data[dstIdx + 2] = gray;
|
|
112
|
-
png.data[dstIdx + 3] = 255;
|
|
113
|
-
}
|
|
86
|
+
// src/pdf/tableExtractor.ts
|
|
87
|
+
var logger2 = createLogger("TableExtractor");
|
|
88
|
+
var Y_TOLERANCE = 5;
|
|
89
|
+
var COLUMN_GAP_THRESHOLD = 15;
|
|
90
|
+
var MIN_ROWS = 2;
|
|
91
|
+
var MIN_COLS = 2;
|
|
92
|
+
var MIN_ROW_ITEMS = 2;
|
|
93
|
+
var buildBoundingBox = (x, y, width, height) => {
|
|
94
|
+
if (![x, y, width].every(Number.isFinite) || height === undefined || !Number.isFinite(height)) {
|
|
95
|
+
return;
|
|
114
96
|
}
|
|
115
|
-
|
|
116
|
-
|
|
97
|
+
return {
|
|
98
|
+
left: x,
|
|
99
|
+
bottom: y,
|
|
100
|
+
right: x + Math.max(0, width),
|
|
101
|
+
top: y + Math.max(0, height)
|
|
102
|
+
};
|
|
117
103
|
};
|
|
118
|
-
var
|
|
119
|
-
if (
|
|
120
|
-
return
|
|
121
|
-
}
|
|
122
|
-
const img = imageData;
|
|
123
|
-
if (!img.data || !img.width || !img.height) {
|
|
124
|
-
return null;
|
|
125
|
-
}
|
|
126
|
-
const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
|
|
127
|
-
const format = img.kind === 1 ? "grayscale" : img.kind === 3 ? "rgba" : "rgb";
|
|
128
|
-
const pngBase64 = encodePixelsToPNG(img.data, img.width, img.height, channels);
|
|
104
|
+
var mergeBoundingBoxes = (boxes) => {
|
|
105
|
+
if (boxes.length === 0)
|
|
106
|
+
return;
|
|
129
107
|
return {
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
format,
|
|
135
|
-
data: pngBase64
|
|
108
|
+
left: Math.min(...boxes.map((box) => box.left)),
|
|
109
|
+
bottom: Math.min(...boxes.map((box) => box.bottom)),
|
|
110
|
+
right: Math.max(...boxes.map((box) => box.right)),
|
|
111
|
+
top: Math.max(...boxes.map((box) => box.top))
|
|
136
112
|
};
|
|
137
113
|
};
|
|
138
|
-
var
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
114
|
+
var extractTextItemsWithPositions = async (page) => {
|
|
115
|
+
const textContent = await page.getTextContent();
|
|
116
|
+
const items = [];
|
|
117
|
+
for (const item of textContent.items) {
|
|
118
|
+
const textItem = item;
|
|
119
|
+
if (!textItem.str.trim())
|
|
120
|
+
continue;
|
|
121
|
+
if (!textItem.transform || textItem.transform.length < 6)
|
|
122
|
+
continue;
|
|
123
|
+
const x = textItem.transform[4];
|
|
124
|
+
const y = textItem.transform[5];
|
|
125
|
+
if (x === undefined || y === undefined)
|
|
126
|
+
continue;
|
|
127
|
+
const height = textItem.height ?? Math.abs(textItem.transform[3] ?? 0);
|
|
128
|
+
items.push({
|
|
129
|
+
text: textItem.str,
|
|
130
|
+
x,
|
|
131
|
+
y,
|
|
132
|
+
width: textItem.width ?? textItem.str.length * 6,
|
|
133
|
+
...height > 0 ? { height } : {},
|
|
134
|
+
...height > 0 ? {
|
|
135
|
+
bounding_box: buildBoundingBox(x, y, textItem.width ?? textItem.str.length * 6, height)
|
|
136
|
+
} : {}
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
return items;
|
|
140
|
+
};
|
|
141
|
+
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
|
|
142
|
+
if (items.length === 0)
|
|
143
|
+
return [];
|
|
144
|
+
const sorted = [...items].sort((a, b) => b.y - a.y);
|
|
145
|
+
const firstItem = sorted[0];
|
|
146
|
+
if (!firstItem)
|
|
147
|
+
return [];
|
|
148
|
+
const rows = [];
|
|
149
|
+
let currentRow = { y: firstItem.y, items: [firstItem] };
|
|
150
|
+
for (let i = 1;i < sorted.length; i++) {
|
|
151
|
+
const item = sorted[i];
|
|
152
|
+
if (!item)
|
|
153
|
+
continue;
|
|
154
|
+
const yDiff = Math.abs(currentRow.y - item.y);
|
|
155
|
+
if (yDiff <= tolerance) {
|
|
156
|
+
currentRow.items.push(item);
|
|
157
|
+
} else {
|
|
158
|
+
rows.push(currentRow);
|
|
159
|
+
currentRow = { y: item.y, items: [item] };
|
|
148
160
|
}
|
|
149
161
|
}
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
162
|
+
rows.push(currentRow);
|
|
163
|
+
for (const row of rows) {
|
|
164
|
+
row.items.sort((a, b) => a.x - b.x);
|
|
165
|
+
}
|
|
166
|
+
return rows;
|
|
167
|
+
};
|
|
168
|
+
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
|
|
169
|
+
if (rows.length === 0)
|
|
170
|
+
return [];
|
|
171
|
+
const allXPositions = [];
|
|
172
|
+
for (const row of rows) {
|
|
173
|
+
for (const item of row.items) {
|
|
174
|
+
allXPositions.push(item.x);
|
|
154
175
|
}
|
|
155
|
-
} catch (error) {
|
|
156
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
157
|
-
logger2.warn("Sync image get failed, trying async", { imageName, error: message });
|
|
158
176
|
}
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
}
|
|
175
|
-
}, 1e4);
|
|
176
|
-
try {
|
|
177
|
-
page.objs.get(imageName, (imageData) => {
|
|
178
|
-
if (!resolved) {
|
|
179
|
-
resolved = true;
|
|
180
|
-
cleanup();
|
|
181
|
-
resolve(imageData);
|
|
182
|
-
}
|
|
183
|
-
});
|
|
184
|
-
} catch (error) {
|
|
185
|
-
if (!resolved) {
|
|
186
|
-
resolved = true;
|
|
187
|
-
cleanup();
|
|
188
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
189
|
-
logger2.warn("Error in async image get", { imageName, error: message });
|
|
190
|
-
resolve(null);
|
|
191
|
-
}
|
|
177
|
+
if (allXPositions.length === 0)
|
|
178
|
+
return [];
|
|
179
|
+
allXPositions.sort((a, b) => a - b);
|
|
180
|
+
const firstX = allXPositions[0];
|
|
181
|
+
if (firstX === undefined)
|
|
182
|
+
return [];
|
|
183
|
+
const boundaries = [firstX];
|
|
184
|
+
for (let i = 1;i < allXPositions.length; i++) {
|
|
185
|
+
const current = allXPositions[i];
|
|
186
|
+
const previous = allXPositions[i - 1];
|
|
187
|
+
if (current === undefined || previous === undefined)
|
|
188
|
+
continue;
|
|
189
|
+
const gap = current - previous;
|
|
190
|
+
if (gap >= gapThreshold) {
|
|
191
|
+
boundaries.push(current);
|
|
192
192
|
}
|
|
193
|
-
});
|
|
194
|
-
};
|
|
195
|
-
var extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
|
|
196
|
-
const output = {};
|
|
197
|
-
if (includePageCount) {
|
|
198
|
-
output.num_pages = pdfDocument.numPages;
|
|
199
193
|
}
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
const metadataObj = pdfMetadata.metadata;
|
|
208
|
-
if (typeof metadataObj.getAll === "function") {
|
|
209
|
-
output.metadata = metadataObj.getAll();
|
|
210
|
-
} else {
|
|
211
|
-
const metadataRecord = {};
|
|
212
|
-
for (const key in metadataObj) {
|
|
213
|
-
if (Object.hasOwn(metadataObj, key)) {
|
|
214
|
-
metadataRecord[key] = metadataObj[key];
|
|
215
|
-
}
|
|
216
|
-
}
|
|
217
|
-
output.metadata = metadataRecord;
|
|
218
|
-
}
|
|
219
|
-
} catch (metaError) {
|
|
220
|
-
const message = metaError instanceof Error ? metaError.message : String(metaError);
|
|
221
|
-
logger2.warn("Error extracting metadata", { error: message });
|
|
194
|
+
return boundaries;
|
|
195
|
+
};
|
|
196
|
+
var columnIndexForItem = (item, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
|
|
197
|
+
for (let i = columnBoundaries.length - 1;i >= 0; i--) {
|
|
198
|
+
const boundary = columnBoundaries[i];
|
|
199
|
+
if (boundary !== undefined && item.x >= boundary - tolerance) {
|
|
200
|
+
return i;
|
|
222
201
|
}
|
|
223
202
|
}
|
|
224
|
-
return
|
|
203
|
+
return 0;
|
|
225
204
|
};
|
|
226
|
-
var
|
|
227
|
-
|
|
228
|
-
|
|
205
|
+
var assignToTableCells = (row, rowIndex, columnBoundaries) => {
|
|
206
|
+
const accumulators = Array.from({ length: columnBoundaries.length }, () => ({ textParts: [], boundingBoxes: [] }));
|
|
207
|
+
for (const item of row.items) {
|
|
208
|
+
const colIndex = columnIndexForItem(item, columnBoundaries);
|
|
209
|
+
const accumulator = accumulators[colIndex];
|
|
210
|
+
if (!accumulator)
|
|
211
|
+
continue;
|
|
212
|
+
accumulator.textParts.push(item.text);
|
|
213
|
+
if (item.bounding_box) {
|
|
214
|
+
accumulator.boundingBoxes.push(item.bounding_box);
|
|
215
|
+
}
|
|
229
216
|
}
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
217
|
+
const cells = accumulators.map((accumulator, colIndex) => {
|
|
218
|
+
const boundingBox = mergeBoundingBoxes(accumulator.boundingBoxes);
|
|
219
|
+
return {
|
|
220
|
+
text: accumulator.textParts.join(" "),
|
|
221
|
+
rowIndex,
|
|
222
|
+
colIndex,
|
|
223
|
+
...boundingBox ? { bounding_box: boundingBox } : {}
|
|
224
|
+
};
|
|
225
|
+
});
|
|
226
|
+
return {
|
|
227
|
+
rowValues: cells.map((cell) => cell.text),
|
|
228
|
+
cells
|
|
229
|
+
};
|
|
233
230
|
};
|
|
234
|
-
var
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
231
|
+
var calculateConfidence = (rows, columnBoundaries) => {
|
|
232
|
+
if (rows.length < MIN_ROWS || columnBoundaries.length < MIN_COLS) {
|
|
233
|
+
return 0;
|
|
234
|
+
}
|
|
235
|
+
let score = 0;
|
|
236
|
+
let checks = 0;
|
|
237
|
+
for (const row of rows) {
|
|
238
|
+
const itemsPerColumn = new Set;
|
|
239
|
+
for (const item of row.items) {
|
|
240
|
+
for (let i = columnBoundaries.length - 1;i >= 0; i--) {
|
|
241
|
+
const boundary = columnBoundaries[i];
|
|
242
|
+
if (boundary !== undefined && item.x >= boundary - COLUMN_GAP_THRESHOLD / 2) {
|
|
243
|
+
itemsPerColumn.add(i);
|
|
244
|
+
break;
|
|
245
|
+
}
|
|
248
246
|
}
|
|
249
|
-
textByY.get(y)?.push(textItem.str);
|
|
250
247
|
}
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
248
|
+
score += itemsPerColumn.size / columnBoundaries.length;
|
|
249
|
+
checks++;
|
|
250
|
+
}
|
|
251
|
+
if (rows.length >= 2) {
|
|
252
|
+
const spacings = [];
|
|
253
|
+
for (let i = 1;i < rows.length; i++) {
|
|
254
|
+
const prevRow = rows[i - 1];
|
|
255
|
+
const currRow = rows[i];
|
|
256
|
+
if (prevRow && currRow) {
|
|
257
|
+
spacings.push(Math.abs(prevRow.y - currRow.y));
|
|
259
258
|
}
|
|
260
259
|
}
|
|
261
|
-
if (
|
|
262
|
-
const
|
|
263
|
-
const
|
|
264
|
-
|
|
265
|
-
|
|
260
|
+
if (spacings.length > 0) {
|
|
261
|
+
const avgSpacing = spacings.reduce((a, b) => a + b, 0) / spacings.length;
|
|
262
|
+
const variance = spacings.reduce((sum, s) => sum + (s - avgSpacing) ** 2, 0) / spacings.length;
|
|
263
|
+
const stdDev = Math.sqrt(variance);
|
|
264
|
+
const regularityScore = avgSpacing > 0 ? Math.max(0, 1 - stdDev / avgSpacing) : 0;
|
|
265
|
+
score += regularityScore;
|
|
266
|
+
checks++;
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
return checks > 0 ? Math.min(1, score / checks) : 0;
|
|
270
|
+
};
|
|
271
|
+
var identifyTableRegions = (rows) => {
|
|
272
|
+
const regions = [];
|
|
273
|
+
const candidateRows = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
|
|
274
|
+
if (candidateRows.length < MIN_ROWS) {
|
|
275
|
+
return regions;
|
|
276
|
+
}
|
|
277
|
+
const columnBoundaries = detectColumnBoundaries(candidateRows);
|
|
278
|
+
if (columnBoundaries.length < MIN_COLS) {
|
|
279
|
+
return regions;
|
|
280
|
+
}
|
|
281
|
+
let currentRegion = [];
|
|
282
|
+
for (const row of candidateRows) {
|
|
283
|
+
const alignedItems = row.items.filter((item) => {
|
|
284
|
+
return columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);
|
|
285
|
+
});
|
|
286
|
+
if (alignedItems.length >= MIN_COLS - 1) {
|
|
287
|
+
currentRegion.push(row);
|
|
288
|
+
} else if (currentRegion.length >= MIN_ROWS) {
|
|
289
|
+
const firstRow = currentRegion[0];
|
|
290
|
+
const lastRow = currentRegion[currentRegion.length - 1];
|
|
291
|
+
if (firstRow && lastRow) {
|
|
292
|
+
regions.push({
|
|
293
|
+
rows: currentRegion,
|
|
294
|
+
columnBoundaries,
|
|
295
|
+
startY: firstRow.y,
|
|
296
|
+
endY: lastRow.y
|
|
297
|
+
});
|
|
298
|
+
}
|
|
299
|
+
currentRegion = [];
|
|
300
|
+
} else {
|
|
301
|
+
currentRegion = [];
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
305
|
+
const firstRow = currentRegion[0];
|
|
306
|
+
const lastRow = currentRegion[currentRegion.length - 1];
|
|
307
|
+
if (firstRow && lastRow) {
|
|
308
|
+
regions.push({
|
|
309
|
+
rows: currentRegion,
|
|
310
|
+
columnBoundaries,
|
|
311
|
+
startY: firstRow.y,
|
|
312
|
+
endY: lastRow.y
|
|
313
|
+
});
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
return regions;
|
|
317
|
+
};
|
|
318
|
+
var extractTablesFromPage = async (page, pageNum) => {
|
|
319
|
+
const tables = [];
|
|
320
|
+
try {
|
|
321
|
+
const textItems = await extractTextItemsWithPositions(page);
|
|
322
|
+
if (textItems.length === 0) {
|
|
323
|
+
return tables;
|
|
324
|
+
}
|
|
325
|
+
const rows = clusterByY(textItems);
|
|
326
|
+
const tableRegions = identifyTableRegions(rows);
|
|
327
|
+
for (let tableIndex = 0;tableIndex < tableRegions.length; tableIndex++) {
|
|
328
|
+
const region = tableRegions[tableIndex];
|
|
329
|
+
if (!region)
|
|
330
|
+
continue;
|
|
331
|
+
const tableRows = [];
|
|
332
|
+
const tableCells = [];
|
|
333
|
+
for (let rowIndex = 0;rowIndex < region.rows.length; rowIndex++) {
|
|
334
|
+
const row = region.rows[rowIndex];
|
|
335
|
+
if (!row)
|
|
336
|
+
continue;
|
|
337
|
+
const assigned = assignToTableCells(row, rowIndex, region.columnBoundaries);
|
|
338
|
+
tableRows.push(assigned.rowValues);
|
|
339
|
+
tableCells.push(...assigned.cells);
|
|
340
|
+
}
|
|
341
|
+
const confidence = calculateConfidence(region.rows, region.columnBoundaries);
|
|
342
|
+
const tableBoundingBox = mergeBoundingBoxes(tableCells.map((cell) => cell.bounding_box).filter((box) => box !== undefined));
|
|
343
|
+
if (confidence >= 0.3) {
|
|
344
|
+
tables.push({
|
|
345
|
+
page: pageNum,
|
|
346
|
+
tableIndex,
|
|
347
|
+
rows: tableRows,
|
|
348
|
+
cells: tableCells,
|
|
349
|
+
...tableBoundingBox ? { bounding_box: tableBoundingBox } : {},
|
|
350
|
+
rowCount: tableRows.length,
|
|
351
|
+
colCount: region.columnBoundaries.length,
|
|
352
|
+
confidence: Math.round(confidence * 100) / 100
|
|
353
|
+
});
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
} catch (error) {
|
|
357
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
358
|
+
logger2.warn("Error extracting tables from page", { pageNum, error: message });
|
|
359
|
+
}
|
|
360
|
+
return tables;
|
|
361
|
+
};
|
|
362
|
+
var extractTables = async (pdfDocument, pagesToProcess) => {
|
|
363
|
+
const allTables = [];
|
|
364
|
+
for (const pageNum of pagesToProcess) {
|
|
365
|
+
try {
|
|
366
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
367
|
+
const pageTables = await extractTablesFromPage(page, pageNum);
|
|
368
|
+
allTables.push(...pageTables);
|
|
369
|
+
} catch (error) {
|
|
370
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
371
|
+
logger2.warn("Error getting page for table extraction", { pageNum, error: message });
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
return allTables;
|
|
375
|
+
};
|
|
376
|
+
var tableToMarkdown = (table) => {
|
|
377
|
+
if (table.rows.length === 0)
|
|
378
|
+
return "";
|
|
379
|
+
const lines = [];
|
|
380
|
+
const headerRow = table.rows[0];
|
|
381
|
+
if (!headerRow)
|
|
382
|
+
return "";
|
|
383
|
+
lines.push(`| ${headerRow.map((cell) => cell.trim() || " ").join(" | ")} |`);
|
|
384
|
+
lines.push(`| ${headerRow.map(() => "---").join(" | ")} |`);
|
|
385
|
+
for (let i = 1;i < table.rows.length; i++) {
|
|
386
|
+
const row = table.rows[i];
|
|
387
|
+
if (!row)
|
|
388
|
+
continue;
|
|
389
|
+
const paddedRow = [...row];
|
|
390
|
+
while (paddedRow.length < headerRow.length) {
|
|
391
|
+
paddedRow.push("");
|
|
392
|
+
}
|
|
393
|
+
lines.push(`| ${paddedRow.map((cell) => cell.trim() || " ").join(" | ")} |`);
|
|
394
|
+
}
|
|
395
|
+
return lines.join(`
|
|
396
|
+
`);
|
|
397
|
+
};
|
|
398
|
+
var tablesToMarkdown = (tables) => {
|
|
399
|
+
if (tables.length === 0)
|
|
400
|
+
return "";
|
|
401
|
+
const sections = ["## Extracted Tables", ""];
|
|
402
|
+
for (const table of tables) {
|
|
403
|
+
sections.push(`### Page ${table.page}, Table ${table.tableIndex + 1}`);
|
|
404
|
+
sections.push(`*Confidence: ${(table.confidence * 100).toFixed(0)}%*`);
|
|
405
|
+
sections.push("");
|
|
406
|
+
sections.push(tableToMarkdown(table));
|
|
407
|
+
sections.push("");
|
|
408
|
+
}
|
|
409
|
+
return sections.join(`
|
|
410
|
+
`);
|
|
411
|
+
};
|
|
412
|
+
|
|
413
|
+
// src/pdf/documentModel.ts
|
|
414
|
+
var DEFAULT_CHUNK_MAX_CHARS = 1800;
|
|
415
|
+
/**
 * Builds a page-scoped element identifier of the form `p<page>-<type>-<index>`.
 * @param page Page number the element belongs to.
 * @param type Element kind used as the middle segment of the id.
 * @param index Position of the element, used as the trailing segment.
 * @returns The composed identifier string.
 */
var buildElementId = (page, type, index) => "p".concat(String(page), "-", type, "-", String(index));
|
|
416
|
+
var imageElementMetadata = (imageData) => {
|
|
417
|
+
const { data: _data, ...metadata } = imageData;
|
|
418
|
+
return metadata;
|
|
419
|
+
};
|
|
420
|
+
var buildPageTextStats = (items) => {
|
|
421
|
+
const heights = items.filter((item) => item.type === "text" && item.textContent?.trim() && item.height).map((item) => item.height).sort((a, b) => a - b);
|
|
422
|
+
if (heights.length === 0) {
|
|
423
|
+
return { maxHeight: 0, medianHeight: 0, textItemCount: 0 };
|
|
424
|
+
}
|
|
425
|
+
const midpoint = Math.floor(heights.length / 2);
|
|
426
|
+
const medianHeight = heights.length % 2 === 0 ? ((heights[midpoint - 1] ?? 0) + (heights[midpoint] ?? 0)) / 2 : heights[midpoint] ?? 0;
|
|
427
|
+
return {
|
|
428
|
+
maxHeight: heights.at(-1) ?? 0,
|
|
429
|
+
medianHeight,
|
|
430
|
+
textItemCount: heights.length
|
|
431
|
+
};
|
|
432
|
+
};
|
|
433
|
+
var buildSemanticHint = (item, stats) => {
|
|
434
|
+
if (item.type !== "text" || !item.textContent?.trim())
|
|
435
|
+
return;
|
|
436
|
+
const textContent = item.textContent.trim();
|
|
437
|
+
if (/^([-*]\s+|\d+[.)]\s+)/.test(textContent)) {
|
|
438
|
+
return {
|
|
439
|
+
role: "list_item",
|
|
440
|
+
confidence: 0.92,
|
|
441
|
+
signals: ["list-prefix"]
|
|
442
|
+
};
|
|
443
|
+
}
|
|
444
|
+
const height = item.height ?? 0;
|
|
445
|
+
const isShortLine = textContent.length <= 120;
|
|
446
|
+
const endsLikeSentence = /[.!?]$/.test(textContent);
|
|
447
|
+
const isLargeText = stats.textItemCount > 1 && height > 0 && stats.medianHeight > 0 && height >= stats.medianHeight * 1.3 && height >= stats.maxHeight * 0.8;
|
|
448
|
+
if (isLargeText && isShortLine && !endsLikeSentence) {
|
|
449
|
+
const ratio = height / stats.medianHeight;
|
|
450
|
+
const level = ratio >= 1.8 ? 1 : ratio >= 1.55 ? 2 : 3;
|
|
451
|
+
return {
|
|
452
|
+
role: "heading",
|
|
453
|
+
level,
|
|
454
|
+
confidence: 0.78,
|
|
455
|
+
signals: ["larger-text", "short-line"]
|
|
456
|
+
};
|
|
457
|
+
}
|
|
458
|
+
return {
|
|
459
|
+
role: "paragraph",
|
|
460
|
+
confidence: 0.5,
|
|
461
|
+
signals: ["default-text"]
|
|
462
|
+
};
|
|
463
|
+
};
|
|
464
|
+
var contentItemToElement = (item, page, index, semanticHint) => {
|
|
465
|
+
if (item.type === "text" && item.textContent?.trim()) {
|
|
466
|
+
return {
|
|
467
|
+
id: buildElementId(page, "text", index),
|
|
468
|
+
type: "text",
|
|
469
|
+
page,
|
|
470
|
+
content: item.textContent,
|
|
471
|
+
bounding_box: item.bounding_box,
|
|
472
|
+
provenance: {
|
|
473
|
+
engine: "pdfjs",
|
|
474
|
+
source: "text-content"
|
|
475
|
+
},
|
|
476
|
+
...semanticHint ? { semantic_hint: semanticHint } : {}
|
|
477
|
+
};
|
|
478
|
+
}
|
|
479
|
+
if (item.type === "image" && item.imageData) {
|
|
480
|
+
return {
|
|
481
|
+
id: buildElementId(page, "image", index),
|
|
482
|
+
type: "image",
|
|
483
|
+
page,
|
|
484
|
+
image: imageElementMetadata(item.imageData),
|
|
485
|
+
bounding_box: item.bounding_box,
|
|
486
|
+
provenance: {
|
|
487
|
+
engine: "pdfjs",
|
|
488
|
+
source: "image-xobject"
|
|
489
|
+
}
|
|
490
|
+
};
|
|
491
|
+
}
|
|
492
|
+
return;
|
|
493
|
+
};
|
|
494
|
+
var buildStructuredElements = (pageContents, tables, includeSemanticHints) => {
|
|
495
|
+
const elements = [];
|
|
496
|
+
const tablesByPage = new Map;
|
|
497
|
+
for (const table of tables ?? []) {
|
|
498
|
+
const pageTables = tablesByPage.get(table.page) ?? [];
|
|
499
|
+
pageTables.push(table);
|
|
500
|
+
tablesByPage.set(table.page, pageTables);
|
|
501
|
+
}
|
|
502
|
+
const appendTableElement = (table) => {
|
|
503
|
+
elements.push({
|
|
504
|
+
id: buildElementId(table.page, "table", table.tableIndex + 1),
|
|
505
|
+
type: "table",
|
|
506
|
+
page: table.page,
|
|
507
|
+
table: {
|
|
508
|
+
rows: table.rows,
|
|
509
|
+
...table.cells ? { cells: table.cells } : {},
|
|
510
|
+
...table.bounding_box ? { bounding_box: table.bounding_box } : {},
|
|
511
|
+
rowCount: table.rowCount,
|
|
512
|
+
colCount: table.colCount,
|
|
513
|
+
confidence: table.confidence
|
|
514
|
+
},
|
|
515
|
+
bounding_box: table.bounding_box,
|
|
516
|
+
confidence: table.confidence,
|
|
517
|
+
provenance: {
|
|
518
|
+
engine: "pdfjs",
|
|
519
|
+
source: "table-detector"
|
|
520
|
+
}
|
|
521
|
+
});
|
|
522
|
+
};
|
|
523
|
+
for (const pageContent of pageContents) {
|
|
524
|
+
const stats = includeSemanticHints ? buildPageTextStats(pageContent.items) : undefined;
|
|
525
|
+
let elementIndex = 1;
|
|
526
|
+
for (const item of pageContent.items) {
|
|
527
|
+
const semanticHint = stats ? buildSemanticHint(item, stats) : undefined;
|
|
528
|
+
const element = contentItemToElement(item, pageContent.page, elementIndex, semanticHint);
|
|
529
|
+
if (element) {
|
|
530
|
+
elements.push(element);
|
|
531
|
+
elementIndex++;
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
const pageTables = tablesByPage.get(pageContent.page);
|
|
535
|
+
if (pageTables) {
|
|
536
|
+
for (const table of pageTables.sort((a, b) => a.tableIndex - b.tableIndex)) {
|
|
537
|
+
appendTableElement(table);
|
|
538
|
+
}
|
|
539
|
+
tablesByPage.delete(pageContent.page);
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
const remainingTables = Array.from(tablesByPage.values()).flat().sort((a, b) => a.page - b.page || a.tableIndex - b.tableIndex);
|
|
543
|
+
for (const table of remainingTables) {
|
|
544
|
+
appendTableElement(table);
|
|
545
|
+
}
|
|
546
|
+
return elements;
|
|
547
|
+
};
|
|
548
|
+
var renderMarkdownFromPageContents = (pageContents, tables) => {
|
|
549
|
+
const sections = [];
|
|
550
|
+
for (const pageContent of pageContents) {
|
|
551
|
+
const pageLines = [`## Page ${String(pageContent.page)}`, ""];
|
|
552
|
+
for (const item of pageContent.items) {
|
|
553
|
+
if (item.type === "text" && item.textContent?.trim()) {
|
|
554
|
+
pageLines.push(item.textContent.trim(), "");
|
|
555
|
+
} else if (item.type === "image" && item.imageData) {
|
|
556
|
+
pageLines.push(`[Image ${String(item.imageData.index + 1)}: ${String(item.imageData.width)}x${String(item.imageData.height)} ${item.imageData.format}]`, "");
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
sections.push(pageLines.join(`
|
|
560
|
+
`).trimEnd());
|
|
561
|
+
}
|
|
562
|
+
if (tables && tables.length > 0) {
|
|
563
|
+
sections.push(tablesToMarkdown(tables));
|
|
564
|
+
}
|
|
565
|
+
return sections.join(`
|
|
566
|
+
|
|
567
|
+
`).trim();
|
|
568
|
+
};
|
|
569
|
+
var escapeHtml = (value) => value.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
|
|
570
|
+
var renderTablesToHtml = (tables) => {
|
|
571
|
+
if (!tables || tables.length === 0)
|
|
572
|
+
return [];
|
|
573
|
+
return tables.map((table) => {
|
|
574
|
+
const rows = table.rows.map((row) => {
|
|
575
|
+
const cells = row.map((cell) => `<td>${escapeHtml(cell)}</td>`).join("");
|
|
576
|
+
return `<tr>${cells}</tr>`;
|
|
577
|
+
}).join(`
|
|
578
|
+
`);
|
|
579
|
+
return [
|
|
580
|
+
`<table data-page="${String(table.page)}" data-table-index="${String(table.tableIndex)}">`,
|
|
581
|
+
"<tbody>",
|
|
582
|
+
rows,
|
|
583
|
+
"</tbody>",
|
|
584
|
+
"</table>"
|
|
585
|
+
].join(`
|
|
586
|
+
`);
|
|
587
|
+
});
|
|
588
|
+
};
|
|
589
|
+
var renderHtmlFromPageContents = (pageContents, tables) => {
|
|
590
|
+
const sections = pageContents.map((pageContent) => {
|
|
591
|
+
const body = [
|
|
592
|
+
`<section data-page="${String(pageContent.page)}">`,
|
|
593
|
+
`<h2>Page ${String(pageContent.page)}</h2>`
|
|
594
|
+
];
|
|
595
|
+
for (const item of pageContent.items) {
|
|
596
|
+
if (item.type === "text" && item.textContent?.trim()) {
|
|
597
|
+
body.push(`<p>${escapeHtml(item.textContent.trim())}</p>`);
|
|
598
|
+
} else if (item.type === "image" && item.imageData) {
|
|
599
|
+
body.push([
|
|
600
|
+
`<figure data-image-index="${String(item.imageData.index)}">`,
|
|
601
|
+
`<figcaption>Image ${String(item.imageData.index + 1)}: ${String(item.imageData.width)}x${String(item.imageData.height)} ${escapeHtml(item.imageData.format)}</figcaption>`,
|
|
602
|
+
"</figure>"
|
|
603
|
+
].join(`
|
|
604
|
+
`));
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
body.push("</section>");
|
|
608
|
+
return body.join(`
|
|
609
|
+
`);
|
|
610
|
+
});
|
|
611
|
+
return [...sections, ...renderTablesToHtml(tables)].join(`
|
|
612
|
+
|
|
613
|
+
`).trim();
|
|
614
|
+
};
|
|
615
|
+
var elementText = (element) => {
|
|
616
|
+
if (element.type === "text")
|
|
617
|
+
return element.content.trim();
|
|
618
|
+
if (element.type === "table") {
|
|
619
|
+
const tableText = element.table.rows.map((row) => row.join(" | ")).join(`
|
|
620
|
+
`).trim();
|
|
621
|
+
return tableText.length > 0 ? tableText : undefined;
|
|
622
|
+
}
|
|
623
|
+
return;
|
|
624
|
+
};
|
|
625
|
+
/**
 * Reads the semantic role attached to a text element.
 * @param element Structured element to inspect.
 * @returns `semantic_hint.role` for elements whose type is "text"; `undefined`
 *          for any other element type or when no hint is present.
 */
var elementRole = (element) => {
  if (element.type !== "text") {
    return undefined;
  }
  return element.semantic_hint?.role;
};
|
|
626
|
+
/**
 * Measures the text accumulated in a chunk draft.
 * Each part contributes its length plus one extra character (presumably the
 * separator inserted when the parts are later joined — confirm against the
 * chunk finalizer).
 * @param draft Chunk draft exposing a `textParts` array of strings.
 * @returns Total of `part.length + 1` over all parts; 0 for an empty draft.
 */
var chunkTextLength = (draft) => {
  let total = 0;
  draft.textParts.forEach((part) => {
    total = total + part.length + 1;
  });
  return total;
};
|
|
627
|
+
var createChunkDraft = (element, strategy, heading) => ({
|
|
628
|
+
pageStart: element.page,
|
|
629
|
+
pageEnd: element.page,
|
|
630
|
+
textParts: [],
|
|
631
|
+
elementIds: [],
|
|
632
|
+
boundingBoxes: [],
|
|
633
|
+
strategy,
|
|
634
|
+
heading
|
|
635
|
+
});
|
|
636
|
+
var addElementToChunk = (draft, element, textValue) => {
|
|
637
|
+
draft.pageEnd = Math.max(draft.pageEnd, element.page);
|
|
638
|
+
draft.textParts.push(textValue);
|
|
639
|
+
draft.elementIds.push(element.id);
|
|
640
|
+
if (element.bounding_box) {
|
|
641
|
+
draft.boundingBoxes.push(element.bounding_box);
|
|
642
|
+
}
|
|
643
|
+
};
|
|
644
|
+
var finalizeChunk = (draft, index) => {
|
|
645
|
+
const textValue = draft.textParts.join(`
|
|
646
|
+
`).trim();
|
|
647
|
+
if (!textValue)
|
|
648
|
+
return;
|
|
649
|
+
return {
|
|
650
|
+
id: draft.pageStart === draft.pageEnd ? `p${String(draft.pageStart)}-chunk-${String(index)}` : `p${String(draft.pageStart)}-p${String(draft.pageEnd)}-chunk-${String(index)}`,
|
|
651
|
+
page_start: draft.pageStart,
|
|
652
|
+
page_end: draft.pageEnd,
|
|
653
|
+
text: textValue,
|
|
654
|
+
element_ids: draft.elementIds,
|
|
655
|
+
strategy: draft.strategy,
|
|
656
|
+
...draft.heading ? { heading: draft.heading } : {},
|
|
657
|
+
...draft.boundingBoxes.length > 0 ? { bounding_boxes: draft.boundingBoxes } : {}
|
|
658
|
+
};
|
|
659
|
+
};
|
|
660
|
+
var buildCitationChunks = (elements, options) => {
|
|
661
|
+
const maxChars = options.maxChars ?? DEFAULT_CHUNK_MAX_CHARS;
|
|
662
|
+
const chunks = [];
|
|
663
|
+
let current;
|
|
664
|
+
const pushCurrent = () => {
|
|
665
|
+
if (!current)
|
|
666
|
+
return;
|
|
667
|
+
const chunk = finalizeChunk(current, chunks.length + 1);
|
|
668
|
+
if (chunk)
|
|
669
|
+
chunks.push(chunk);
|
|
670
|
+
current = undefined;
|
|
671
|
+
};
|
|
672
|
+
for (const element of elements) {
|
|
673
|
+
const textValue = elementText(element);
|
|
674
|
+
if (!textValue)
|
|
675
|
+
continue;
|
|
676
|
+
const role = elementRole(element);
|
|
677
|
+
const shouldStartSemanticChunk = options.useSemanticBoundaries && role === "heading";
|
|
678
|
+
const shouldStartTableChunk = element.type === "table";
|
|
679
|
+
const exceedsSize = current !== undefined && current.elementIds.length > 0 && chunkTextLength(current) + textValue.length > maxChars;
|
|
680
|
+
const crossesPage = current !== undefined && current.pageEnd !== element.page;
|
|
681
|
+
if (shouldStartSemanticChunk || shouldStartTableChunk || exceedsSize || crossesPage) {
|
|
682
|
+
pushCurrent();
|
|
683
|
+
}
|
|
684
|
+
if (!current) {
|
|
685
|
+
const strategy = shouldStartSemanticChunk ? "semantic" : exceedsSize ? "size" : "page";
|
|
686
|
+
const heading = shouldStartSemanticChunk && element.type === "text" ? element.content.trim() : undefined;
|
|
687
|
+
current = createChunkDraft(element, strategy, heading);
|
|
688
|
+
}
|
|
689
|
+
if (element.type === "table" && current.elementIds.length === 0) {
|
|
690
|
+
current.strategy = "table";
|
|
691
|
+
}
|
|
692
|
+
addElementToChunk(current, element, textValue);
|
|
693
|
+
if (element.type === "table") {
|
|
694
|
+
pushCurrent();
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
pushCurrent();
|
|
698
|
+
return chunks;
|
|
699
|
+
};
|
|
700
|
+
var PROMPT_INJECTION_PATTERNS = [
|
|
701
|
+
/\bignore (all )?(previous|prior|above) instructions\b/i,
|
|
702
|
+
/\bdisregard (previous|prior|above) instructions\b/i,
|
|
703
|
+
/\bsystem prompt\b/i,
|
|
704
|
+
/\bdeveloper (message|instruction)s?\b/i,
|
|
705
|
+
/\bdo not (follow|obey) .*instructions\b/i
|
|
706
|
+
];
|
|
707
|
+
var snippetFromText = (value) => {
|
|
708
|
+
const normalized = value.replace(/\s+/g, " ").trim();
|
|
709
|
+
return normalized.length > 160 ? `${normalized.slice(0, 157)}...` : normalized;
|
|
710
|
+
};
|
|
711
|
+
var isOutsideViewBox = (box, viewBox) => {
|
|
712
|
+
if (!box || !viewBox)
|
|
713
|
+
return false;
|
|
714
|
+
const tolerance = 1;
|
|
715
|
+
return box.right < viewBox.left - tolerance || box.left > viewBox.right + tolerance || box.top < viewBox.bottom - tolerance || box.bottom > viewBox.top + tolerance;
|
|
716
|
+
};
|
|
717
|
+
/**
 * Scans extracted page content and collects safety findings for text elements.
 *
 * Every content item is converted with `contentItemToElement`; items that do
 * not convert are skipped and do not consume an element index. For each text
 * element up to three findings may be recorded, in this order:
 *   - `prompt_injection_pattern` (high): trimmed text matches one of
 *     `PROMPT_INJECTION_PATTERNS`;
 *   - `tiny_text` (medium): the source item's height is above 0 and below 2;
 *   - `off_page_text` (medium): the element's bounding box is outside the
 *     page's view box according to `isOutsideViewBox`.
 *
 * @param {Array<{page:number, items:Array<object>}>} pageContents
 * @param {Array<{page:number, view_box?:object}>|undefined} pageGeometry
 *   Optional per-page geometry, looked up by page number.
 * @returns {Array<object>} Findings in document order.
 */
var buildSafetyFindings = (pageContents, pageGeometry) => {
  const findings = [];
  const geometryByPage = new Map(pageGeometry?.map((geometry) => [geometry.page, geometry]));
  for (const pageContent of pageContents) {
    const viewBox = geometryByPage.get(pageContent.page)?.view_box;
    // Element indices are 1-based and advance only for items that convert.
    let nextElementIndex = 1;
    for (const item of pageContent.items) {
      const element = contentItemToElement(item, pageContent.page, nextElementIndex);
      if (!element) {
        continue;
      }
      nextElementIndex += 1;
      if (element.type !== "text") {
        continue;
      }
      const textContent = element.content.trim();
      const snippet = snippetFromText(textContent);
      // All findings for one element share everything except type/severity/message.
      const report = (type, severity, message) => {
        findings.push({
          type,
          severity,
          page: pageContent.page,
          element_id: element.id,
          message,
          snippet,
          ...(element.bounding_box ? { bounding_box: element.bounding_box } : {})
        });
      };
      if (PROMPT_INJECTION_PATTERNS.some((pattern) => pattern.test(textContent))) {
        report("prompt_injection_pattern", "high", "Text matches a common prompt-injection instruction pattern.");
      }
      const isTiny = item.height !== undefined && item.height > 0 && item.height < 2;
      if (isTiny) {
        report("tiny_text", "medium", "Text is unusually small and may be hidden, decorative, or extraction noise.");
      }
      if (isOutsideViewBox(element.bounding_box, viewBox)) {
        report("off_page_text", "medium", "Text bounding box falls outside the PDF page view box.");
      }
    }
  }
  return findings;
};
|
|
770
|
+
|
|
771
|
+
// src/pdf/extractor.ts
|
|
772
|
+
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
773
|
+
import { PNG } from "pngjs";
|
|
774
|
+
// Logger used by the extractor helpers below.
var logger3 = createLogger("Extractor");

// Horizontal gap between neighbouring text parts on one line above which
// splitTextPartsIntoSegments starts a new segment.
var TEXT_SEGMENT_GAP_THRESHOLD = 48;

// findVerticalColumnCut only accepts a gap that is at least this wide in
// absolute terms ...
var COLUMN_CUT_MIN_GAP = 48;

// ... and at least this fraction of the measured content width.
var COLUMN_CUT_MIN_WIDTH_RATIO = 0.12;

// Items whose width reaches this fraction of the measured content width are
// left out when findVerticalColumnCut searches for a column gap.
var SPANNING_WIDTH_RATIO = 0.72;
|
|
779
|
+
/**
 * Computes the smallest box that encloses every defined box in `boxes`.
 *
 * The union is accumulated in a single pass. Spreading mapped arrays into
 * `Math.min`/`Math.max` would walk the input several times and throw a
 * `RangeError` once the input exceeds the engine's argument-count limit.
 *
 * @param {Array<{left:number,bottom:number,right:number,top:number}|undefined>} boxes
 *   Boxes to merge; `undefined` entries are ignored.
 * @returns {{left:number,bottom:number,right:number,top:number}|undefined}
 *   The enclosing box, or `undefined` when no box is defined.
 */
var mergeBoundingBoxes2 = (boxes) => {
  let merged;
  for (const box of boxes) {
    if (box === undefined) {
      continue;
    }
    if (merged === undefined) {
      // Copy so the caller's first box is never mutated.
      merged = { left: box.left, bottom: box.bottom, right: box.right, top: box.top };
      continue;
    }
    merged.left = Math.min(merged.left, box.left);
    merged.bottom = Math.min(merged.bottom, box.bottom);
    merged.right = Math.max(merged.right, box.right);
    merged.top = Math.max(merged.top, box.top);
  }
  return merged;
};
|
|
790
|
+
/**
 * Builds a bounding box from an origin point and a size.
 *
 * Negative widths and heights are treated as zero, so the resulting box never
 * has `right < left` or `top < bottom`.
 *
 * @param {number|undefined} x - Left edge.
 * @param {number|undefined} y - Bottom edge.
 * @param {number|undefined} width
 * @param {number|undefined} height
 * @returns {{left:number,bottom:number,right:number,top:number}|undefined}
 *   The box, or `undefined` when any input is missing or not a finite number.
 */
var buildBoundingBox2 = (x, y, width, height) => {
  // Number.isFinite is false for undefined, NaN, ±Infinity and non-numbers.
  const hasInvalidInput = [x, y, width, height].some((value) => !Number.isFinite(value));
  if (hasInvalidInput) {
    return undefined;
  }
  const clampedWidth = Math.max(0, width);
  const clampedHeight = Math.max(0, height);
  return {
    left: x,
    bottom: y,
    right: x + clampedWidth,
    top: y + clampedHeight
  };
};
|
|
804
|
+
/**
 * Converts a `[x1, y1, x2, y2]` rectangle into a normalised bounding box.
 *
 * The two corners may be given in any order; the smaller coordinate on each
 * axis becomes `left`/`bottom` and the larger becomes `right`/`top`.
 *
 * @param {ArrayLike<number>|undefined|null} rect - At least four numbers.
 * @returns {{left:number,bottom:number,right:number,top:number}|undefined}
 *   The box, or `undefined` when `rect` is missing, too short, or contains a
 *   value that is not a finite number.
 */
var buildRectBoundingBox = (rect) => {
  if (!rect || rect.length < 4) {
    return undefined;
  }
  const [x1, y1, x2, y2] = rect;
  // Number.isFinite also rejects undefined entries.
  const allFinite = [x1, y1, x2, y2].every((coordinate) => Number.isFinite(coordinate));
  if (!allFinite) {
    return undefined;
  }
  const [left, right] = x1 <= x2 ? [x1, x2] : [x2, x1];
  const [bottom, top] = y1 <= y2 ? [y1, y2] : [y2, y1];
  return { left, bottom, right, top };
};
|
|
818
|
+
/**
 * Tells whether `value` is a number that is neither NaN nor ±Infinity.
 * `Number.isFinite` does not coerce, so non-number inputs yield `false`.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
var finiteNumber = (value) => {
  return Number.isFinite(value);
};
|
|
819
|
+
/**
 * Picks the text of an annotation field that may be supplied either directly
 * or wrapped in an object with a `str` property.
 *
 * `direct` wins whenever it is not `null`/`undefined`; otherwise
 * `objectValue.str` is used. The chosen text is returned untrimmed, but only
 * if it contains at least one non-whitespace character.
 *
 * @param {string|null|undefined} direct
 * @param {{str?: string}|null|undefined} objectValue
 * @returns {string|undefined}
 */
var textFromAnnotationField = (direct, objectValue) => {
  const candidate = direct ?? objectValue?.str;
  if (!candidate) {
    return undefined;
  }
  return candidate.trim().length > 0 ? candidate : undefined;
};
|
|
823
|
+
/**
 * Reduces a document outline tree to a plain, serialisable shape.
 *
 * Entries whose title is missing or blank are dropped together with their
 * descendants. For the remaining entries the title is trimmed and only these
 * properties are kept: `bold`/`italic` (when defined), `color` (copied into a
 * plain array), `url` (when truthy), `dest` (when defined, including `null`)
 * and `items` (only if at least one child survives).
 *
 * @param {Array<object>} items - Outline entries, possibly nested via `items`.
 * @returns {Array<object>} Sanitised entries in their original order.
 */
var sanitizeOutlineItems = (items) => items.flatMap((item) => {
  const title = item.title?.trim();
  if (!title) {
    return [];
  }
  const children = item.items ? sanitizeOutlineItems(item.items) : undefined;
  const entry = { title };
  if (item.bold !== undefined) {
    entry.bold = item.bold;
  }
  if (item.italic !== undefined) {
    entry.italic = item.italic;
  }
  if (item.color) {
    entry.color = Array.from(item.color);
  }
  if (item.url) {
    entry.url = item.url;
  }
  if (item.dest !== undefined) {
    entry.dest = item.dest;
  }
  if (children && children.length > 0) {
    entry.items = children;
  }
  return [entry];
});
|
|
838
|
+
// Maps numeric PDF permission flag values to the label reported to callers.
var PDF_PERMISSION_LABELS = new Map([
  [0x004, "print"],
  [0x008, "modify"],
  [0x010, "copy"],
  [0x020, "annotate"],
  [0x100, "fill_forms"],
  [0x200, "copy_for_accessibility"],
  [0x400, "assemble"],
  [0x800, "print_high_quality"]
]);

/**
 * Translates permission flag values into labels.
 *
 * @param {Array<number>} permissions - Flag values to translate.
 * @returns {Array<string>} One label per input value, in the same order;
 *   values without an entry in `PDF_PERMISSION_LABELS` become
 *   `unknown:<value>`.
 */
var permissionLabels = (permissions) => {
  const labelFor = (permission) => {
    const known = PDF_PERMISSION_LABELS.get(permission);
    return known ?? `unknown:${String(permission)}`;
  };
  return permissions.map((permission) => labelFor(permission));
};
|
|
849
|
+
/**
 * Determines the size of an attachment payload.
 *
 * A numeric `byteLength` is preferred; a numeric `length` is the fallback.
 *
 * @param {object|undefined|null} content - Attachment payload (for example a
 *   typed array or an array).
 * @returns {number|undefined} The size, or `undefined` when `content` is
 *   missing or exposes neither property as a number.
 */
var attachmentSize = (content) => {
  if (!content) {
    return undefined;
  }
  for (const property of ["byteLength", "length"]) {
    if (property in content && typeof content[property] === "number") {
      return content[property];
    }
  }
  return undefined;
};
|
|
860
|
+
/**
 * Turns one horizontal run of text parts into a text content item.
 *
 * The parts' texts are concatenated without a separator. When at least one
 * part has a bounding box, position and size come from the merged box;
 * otherwise the first part's `x`, the sum of the part widths and the tallest
 * part height (never below 0) are used.
 *
 * @param {number} y - Line position stored as `yPosition`.
 * @param {Array<{text:string,x:number,width:number,height:number,bounding_box?:object}>} segment
 * @returns {object|null} The content item, or `null` when the joined text is
 *   empty or whitespace only.
 */
var textSegmentToContentItem = (y, segment) => {
  const textContent = segment.map((part) => part.text).join("");
  if (!textContent.trim()) {
    return null;
  }
  const boundingBox = mergeBoundingBoxes2(segment.map((part) => part.bounding_box));
  let width;
  let height;
  if (boundingBox !== undefined) {
    width = boundingBox.right - boundingBox.left;
    height = boundingBox.top - boundingBox.bottom;
  } else {
    width = segment.reduce((sum, part) => sum + part.width, 0);
    height = Math.max(...segment.map((part) => part.height), 0);
  }
  return {
    type: "text",
    yPosition: y,
    xPosition: boundingBox?.left ?? segment[0]?.x,
    width,
    height,
    bounding_box: boundingBox,
    textContent
  };
};
|
|
878
|
+
/**
 * Groups the text parts of one line into horizontally separated segments.
 *
 * Parts are ordered by `x` (the input array is not modified). A new segment
 * begins whenever a part starts more than `TEXT_SEGMENT_GAP_THRESHOLD` to the
 * right of the furthest right edge seen so far.
 *
 * @param {Array<{x:number,width:number}>} parts
 * @returns {Array<Array<object>>} Segments from left to right; empty when
 *   `parts` is empty.
 */
var splitTextPartsIntoSegments = (parts) => {
  const ordered = [...parts].sort((a, b) => a.x - b.x);
  const segments = [];
  let openSegment;
  let furthestRight;
  for (const part of ordered) {
    const gapTooWide = furthestRight !== undefined && part.x - furthestRight > TEXT_SEGMENT_GAP_THRESHOLD;
    if (openSegment === undefined || gapTooWide) {
      openSegment = [];
      segments.push(openSegment);
    }
    openSegment.push(part);
    // The running right edge is never reset, only extended.
    furthestRight = Math.max(furthestRight ?? part.x, part.x + part.width);
  }
  return segments;
};
|
|
898
|
+
/**
 * Returns a sorted copy of `items`: larger `yPosition` first, and for equal
 * `yPosition` smaller `xPosition` first (a missing `xPosition` counts as 0).
 *
 * @param {Array<{yPosition:number,xPosition?:number}>} items
 * @returns {Array<object>} A new array; `items` itself is left untouched.
 */
var sortByYThenX = (items) => {
  const byYDescendingThenXAscending = (a, b) => {
    const yDifference = b.yPosition - a.yPosition;
    if (yDifference) {
      return yDifference;
    }
    return (a.xPosition ?? 0) - (b.xPosition ?? 0);
  };
  return [...items].sort(byYDescendingThenXAscending);
};
|
|
899
|
+
/**
 * Looks for the x coordinate of a vertical gutter that splits the page
 * content into two columns.
 *
 * Steps:
 *  1. Consider only items with a bounding box; at least four are required.
 *  2. Measure the content width from the left-most to the right-most edge.
 *  3. Ignore items at least `SPANNING_WIDTH_RATIO` of that width; at least
 *     four narrower items must remain.
 *  4. Sweep the narrow items from left to right and remember the widest
 *     uncovered horizontal gap; its midpoint is the candidate cut.
 *  5. Accept the candidate only if the gap is at least
 *     `max(COLUMN_CUT_MIN_GAP, width * COLUMN_CUT_MIN_WIDTH_RATIO)` and at
 *     least two narrow items are centred on each side of the cut.
 *
 * @param {Array<{bounding_box?:{left:number,right:number}}>} items
 * @returns {number|undefined} The cut position, or `undefined` if no
 *   convincing two-column layout is found.
 */
var findVerticalColumnCut = (items) => {
  const boxes = items.map((item) => item.bounding_box).filter((box) => box !== undefined);
  if (boxes.length < 4) {
    return undefined;
  }
  const leftEdge = Math.min(...boxes.map((box) => box?.left ?? 0));
  const rightEdge = Math.max(...boxes.map((box) => box?.right ?? 0));
  const pageWidth = rightEdge - leftEdge;
  if (pageWidth <= 0) {
    return undefined;
  }
  const spanningWidth = pageWidth * SPANNING_WIDTH_RATIO;
  const narrowBoxes = boxes.filter((box) => Boolean(box) && box.right - box.left < spanningWidth);
  if (narrowBoxes.length < 4) {
    return undefined;
  }
  const leftToRight = [...narrowBoxes].sort((a, b) => (a.left ?? 0) - (b.left ?? 0));
  let coveredRight = leftToRight[0].right;
  if (coveredRight === undefined) {
    return undefined;
  }
  let widestGap = 0;
  let cutPosition;
  for (const box of leftToRight.slice(1)) {
    const gap = box.left - coveredRight;
    if (box.left > coveredRight && gap > widestGap) {
      widestGap = gap;
      cutPosition = (box.left + coveredRight) / 2;
    }
    coveredRight = Math.max(coveredRight, box.right);
  }
  if (cutPosition === undefined) {
    return undefined;
  }
  const requiredGap = Math.max(COLUMN_CUT_MIN_GAP, pageWidth * COLUMN_CUT_MIN_WIDTH_RATIO);
  if (widestGap < requiredGap) {
    return undefined;
  }
  const leftCount = narrowBoxes.filter((box) => (box.left + box.right) / 2 < cutPosition).length;
  const rightCount = narrowBoxes.length - leftCount;
  if (leftCount >= 2 && rightCount >= 2) {
    return cutPosition;
  }
  return undefined;
};
|
|
949
|
+
/**
 * Orders page content items for reading.
 *
 * If `findVerticalColumnCut` finds no column gutter, items are simply sorted
 * with `sortByYThenX`. Otherwise items are split into:
 *   - spanning items: no bounding box, or a box that straddles the cut;
 *   - left / right column items, by the box's horizontal centre.
 * The result is: spanning items at or above the highest column item, then the
 * left column, then the right column, then the spanning items below that
 * height. Each group is sorted with `sortByYThenX`.
 *
 * @param {Array<{yPosition:number,xPosition?:number,bounding_box?:object}>} items
 * @returns {Array<object>} A new, ordered array.
 */
var sortPageContentItems = (items) => {
  const cutPosition = findVerticalColumnCut(items);
  if (cutPosition === undefined) {
    return sortByYThenX(items);
  }
  const topOf = (item) => item.bounding_box?.top ?? item.yPosition;
  const columns = { left: [], right: [] };
  const spanning = [];
  for (const item of items) {
    const box = item.bounding_box;
    const straddlesCut = Boolean(box) && box.left < cutPosition && box.right > cutPosition;
    if (!box || straddlesCut) {
      spanning.push(item);
      continue;
    }
    const side = (box.left + box.right) / 2 < cutPosition ? "left" : "right";
    columns[side].push(item);
  }
  const columnTops = [...columns.left, ...columns.right].map(topOf);
  // With no column items nothing can be "above the columns".
  const highestColumnTop = columnTops.length > 0 ? Math.max(...columnTops) : Number.POSITIVE_INFINITY;
  const aboveColumns = spanning.filter((item) => topOf(item) >= highestColumnTop);
  const belowColumnTop = spanning.filter((item) => topOf(item) < highestColumnTop);
  return [
    ...sortByYThenX(aboveColumns),
    ...sortByYThenX(columns.left),
    ...sortByYThenX(columns.right),
    ...sortByYThenX(belowColumnTop)
  ];
};
|
|
984
|
+
/**
 * Encodes raw pixel data as a base64 PNG string.
 *
 * - 4 channels: the data is copied into the PNG buffer as-is.
 * - 3 channels: each pixel's three bytes are copied and alpha is set to 255.
 * - 1 channel: each byte is replicated to R, G and B and alpha is set to 255.
 * Source bytes missing from `pixelData` are written as 0. For any other
 * channel count the PNG buffer is left as the `PNG` constructor created it.
 *
 * @param {ArrayLike<number>} pixelData - Interleaved pixel bytes.
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {number} channels - Bytes per pixel in `pixelData`.
 * @returns {string} Base64 encoding of the PNG file.
 */
var encodePixelsToPNG = (pixelData, width, height, channels) => {
  const png = new PNG({ width, height });
  if (channels === 4) {
    png.data = Buffer.from(pixelData);
  } else if (channels === 3 || channels === 1) {
    const pixelCount = width * height;
    for (let pixel = 0; pixel < pixelCount; pixel++) {
      const source = pixel * channels;
      const target = pixel * 4;
      const first = pixelData[source] ?? 0;
      png.data[target] = first;
      png.data[target + 1] = channels === 1 ? first : pixelData[source + 1] ?? 0;
      png.data[target + 2] = channels === 1 ? first : pixelData[source + 2] ?? 0;
      png.data[target + 3] = 255;
    }
  }
  return PNG.sync.write(png).toString("base64");
};
|
|
1010
|
+
/**
 * Converts a decoded pdf.js image object into an extracted-image record with
 * base64 PNG data.
 *
 * The pixel layout is chosen from `kind`:
 *   - 1 -> "grayscale", 1 channel
 *   - 3 -> "rgba", 4 channels
 *   - anything else -> "rgb", 3 channels
 *
 * pdf.js kind 1 is `GRAYSCALE_1BPP`: one *bit* per pixel, most significant
 * bit first, each row padded to a whole byte, set bit = white. Such data is
 * expanded to one byte per pixel (0 or 255) before encoding; passing it on
 * unchanged would make the encoder read packed bits as gray levels and run
 * off the end of the buffer. Kind-1 data whose length does not match the
 * packed layout is passed through as 8-bit gray, as before. For width 1 the
 * two layouts have the same length and the packed interpretation is used.
 *
 * @param {unknown} imageData - Image object exposing `data`, `width`,
 *   `height` and `kind`.
 * @param {number} pageNum - Page the image belongs to.
 * @param {number} arrayIndex - Index stored as `index` in the result.
 * @returns {{page:number,index:number,width:number,height:number,format:string,data:string}|null}
 *   The record, or `null` when `imageData` is not an object or lacks data,
 *   width or height.
 */
var processImageData = (imageData, pageNum, arrayIndex) => {
  if (!imageData || typeof imageData !== "object") {
    return null;
  }
  const img = imageData;
  if (!img.data || !img.width || !img.height) {
    return null;
  }
  // Expands bit-packed rows into one byte (0 or 255) per pixel.
  const unpackOneBitRows = (packed, width, height, rowBytes) => {
    const gray = new Uint8Array(width * height);
    for (let row = 0; row < height; row++) {
      const rowStart = row * rowBytes;
      for (let column = 0; column < width; column++) {
        const byte = packed[rowStart + (column >> 3)] ?? 0;
        const mask = 0x80 >> (column & 7);
        gray[row * width + column] = byte & mask ? 255 : 0;
      }
    }
    return gray;
  };
  const isGrayscale = img.kind === 1;
  const isRgba = img.kind === 3;
  const channels = isGrayscale ? 1 : isRgba ? 4 : 3;
  const format = isGrayscale ? "grayscale" : isRgba ? "rgba" : "rgb";
  let pixels = img.data;
  if (isGrayscale) {
    const rowBytes = (img.width + 7) >> 3;
    if (img.data.length === rowBytes * img.height) {
      pixels = unpackOneBitRows(img.data, img.width, img.height, rowBytes);
    }
  }
  const pngBase64 = encodePixelsToPNG(pixels, img.width, img.height, channels);
  return {
    page: pageNum,
    index: arrayIndex,
    width: img.width,
    height: img.height,
    format,
    data: pngBase64
  };
};
|
|
1030
|
+
/**
 * Fetches the decoded image object for `imageName` from a page.
 *
 * Lookup order:
 *  1. Names starting with "g_" are first looked up in `page.commonObjs`; a
 *     truthy result is returned.
 *  2. A synchronous `page.objs.get(imageName)`; any result other than
 *     `undefined` is returned.
 *  3. The callback form `page.objs.get(imageName, callback)`, limited to
 *     10 seconds.
 * Failures in steps 1 and 2 are logged and the next step is tried. Step 3
 * resolves with `null` on timeout or when the call throws; the promise never
 * rejects.
 *
 * @param {object} page - Page exposing `objs.get` (and `commonObjs.get` for
 *   "g_" names).
 * @param {string} imageName - Object name of the image.
 * @param {number} pageNum - Only used in the timeout log entry.
 * @returns {Promise<unknown>} The image object, or `null`.
 */
var retrieveImageData = async (page, imageName, pageNum) => {
  const describe = (error) => (error instanceof Error ? error.message : String(error));
  if (imageName.startsWith("g_")) {
    try {
      const shared = page.commonObjs.get(imageName);
      if (shared) {
        return shared;
      }
    } catch (error) {
      logger3.warn("Error getting image from commonObjs", { imageName, error: describe(error) });
    }
  }
  try {
    const ready = page.objs.get(imageName);
    if (ready !== undefined) {
      return ready;
    }
  } catch (error) {
    logger3.warn("Sync image get failed, trying async", { imageName, error: describe(error) });
  }
  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    // Marks the promise as decided and stops the timer; only the first
    // caller gets `true` and may resolve.
    const claim = () => {
      if (settled) {
        return false;
      }
      settled = true;
      if (timer !== null) {
        clearTimeout(timer);
        timer = null;
      }
      return true;
    };
    timer = setTimeout(() => {
      if (!claim()) {
        return;
      }
      logger3.warn("Image extraction timeout", { imageName, pageNum });
      resolve(null);
    }, 1e4);
    try {
      page.objs.get(imageName, (imageData) => {
        if (claim()) {
          resolve(imageData);
        }
      });
    } catch (error) {
      if (!claim()) {
        return;
      }
      logger3.warn("Error in async image get", { imageName, error: describe(error) });
      resolve(null);
    }
  });
};
|
|
1087
|
+
/**
 * Collects the page count and/or document metadata of a PDF.
 *
 * Metadata comes from `pdfDocument.getMetadata()`: `info` is copied when
 * defined; `metadata` is taken from its `getAll()` method when it has one,
 * otherwise its own enumerable properties are copied into a plain object.
 * A failure while reading metadata is logged and whatever was collected so
 * far is still returned.
 *
 * @param {object} pdfDocument - Document exposing `numPages` and `getMetadata()`.
 * @param {boolean} includeMetadata - Whether to add `info` / `metadata`.
 * @param {boolean} includePageCount - Whether to add `num_pages`.
 * @returns {Promise<{num_pages?:number, info?:object, metadata?:object}>}
 */
var extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
  const output = {};
  if (includePageCount) {
    output.num_pages = pdfDocument.numPages;
  }
  if (!includeMetadata) {
    return output;
  }
  try {
    const pdfMetadata = await pdfDocument.getMetadata();
    const info = pdfMetadata.info;
    if (info !== undefined) {
      output.info = info;
    }
    const rawMetadata = pdfMetadata.metadata;
    if (rawMetadata && typeof rawMetadata.getAll === "function") {
      output.metadata = rawMetadata.getAll();
    } else if (rawMetadata && typeof rawMetadata === "object") {
      const plain = {};
      for (const key of Object.keys(rawMetadata)) {
        plain[key] = rawMetadata[key];
      }
      output.metadata = plain;
    }
  } catch (metaError) {
    const message = metaError instanceof Error ? metaError.message : String(metaError);
    logger3.warn("Error extracting metadata", { error: message });
  }
  return output;
};
|
|
1118
|
+
/**
 * Collects optional document-level structure from a PDF document.
 *
 * Each section is read only when its option is set and the document exposes
 * the corresponding method; sections run one after another in this order:
 *   - `includeOutline`    -> `getOutline()`     -> `outline`
 *   - `includePageLabels` -> `getPageLabels()`  -> `page_labels`
 *   - `includePermissions`-> `getPermissions()` -> `permissions`
 *   - `includePermissions`-> `getMarkInfo()`    -> `mark_info`
 *   - `includeFormFields` -> `getFieldObjects()`-> `form_fields`
 *   - `includeAttachments`-> `getAttachments()` -> `attachments`
 * Empty results are omitted. An error in one section is logged and does not
 * affect the others.
 *
 * @param {object} pdfDocument - Loaded PDF document.
 * @param {object} options - The `include*` switches listed above.
 * @returns {Promise<object>} Object holding only the sections that yielded data.
 */
var extractDocumentStructure = async (pdfDocument, options) => {
  const documentWithStructure = pdfDocument;
  const output = {};
  const supports = (method) => typeof documentWithStructure[method] === "function";
  // Runs one section; a failure is logged under `failureMessage` and swallowed.
  const guarded = async (failureMessage, section) => {
    try {
      await section();
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn(failureMessage, { error: message });
    }
  };
  if (options.includeOutline && supports("getOutline")) {
    await guarded("Error extracting outline", async () => {
      const outline = await documentWithStructure.getOutline();
      if (outline && outline.length > 0) {
        output.outline = sanitizeOutlineItems(outline);
      }
    });
  }
  if (options.includePageLabels && supports("getPageLabels")) {
    await guarded("Error extracting page labels", async () => {
      const pageLabels = await documentWithStructure.getPageLabels();
      if (pageLabels && pageLabels.length > 0) {
        output.page_labels = pageLabels;
      }
    });
  }
  if (options.includePermissions && supports("getPermissions")) {
    await guarded("Error extracting permissions", async () => {
      const permissions = await documentWithStructure.getPermissions();
      if (permissions && permissions.length > 0) {
        output.permissions = permissionLabels(permissions);
      }
    });
  }
  // Mark info is switched by the same option as permissions.
  if (options.includePermissions && supports("getMarkInfo")) {
    await guarded("Error extracting mark info", async () => {
      const markInfo = await documentWithStructure.getMarkInfo();
      if (markInfo && Object.keys(markInfo).length > 0) {
        output.mark_info = markInfo;
      }
    });
  }
  if (options.includeFormFields && supports("getFieldObjects")) {
    await guarded("Error extracting form fields", async () => {
      const fieldObjects = await documentWithStructure.getFieldObjects();
      if (!fieldObjects) {
        return;
      }
      const fields = [];
      for (const [name, fieldOrFields] of Object.entries(fieldObjects)) {
        // One name may map to a single field or to a list of fields.
        const fieldList = Array.isArray(fieldOrFields) ? fieldOrFields : [fieldOrFields];
        fieldList.forEach((field) => {
          const normalized = normalizeFormField(name, field);
          if (normalized !== undefined) {
            fields.push(normalized);
          }
        });
      }
      if (fields.length > 0) {
        output.form_fields = fields;
      }
    });
  }
  if (options.includeAttachments && supports("getAttachments")) {
    await guarded("Error extracting attachments", async () => {
      const attachments = await documentWithStructure.getAttachments();
      if (!attachments) {
        return;
      }
      const attachmentSummaries = Object.entries(attachments).map(([name, attachment]) => {
        const size = attachmentSize(attachment.content);
        const summary = { name };
        if (attachment.filename) {
          summary.filename = attachment.filename;
        }
        if (attachment.description) {
          summary.description = attachment.description;
        }
        if (size !== undefined) {
          summary.size_bytes = size;
        }
        return summary;
      });
      if (attachmentSummaries.length > 0) {
        output.attachments = attachmentSummaries;
      }
    });
  }
  return output;
};
|
|
1206
|
+
/**
 * Normalises one raw form-field object.
 *
 * The name is taken from `field.name`, then `field.fieldName`, then
 * `fallbackName`, and trimmed. The page is `field.page` when defined,
 * otherwise `field.pageIndex + 1`. Optional properties are emitted only when
 * present, in the order: `type`, `value`, `default_value`, `page`, `id`,
 * `editable`, `required`, `bounding_box`.
 *
 * @param {string} fallbackName - Name used when the field carries none.
 * @param {object} field - Raw field object.
 * @returns {object|undefined} The normalised field, or `undefined` when the
 *   resulting name is empty.
 */
var normalizeFormField = (fallbackName, field) => {
  const name = (field.name ?? field.fieldName ?? fallbackName).trim();
  if (!name) {
    return undefined;
  }
  let page;
  if (field.page !== undefined) {
    page = field.page;
  } else if (field.pageIndex !== undefined) {
    // pageIndex counts from 0; reported page numbers count from 1.
    page = field.pageIndex + 1;
  }
  const fieldType = field.type ?? field.fieldType;
  const boundingBox = buildRectBoundingBox(field.rect);
  const normalized = { name };
  if (fieldType) {
    normalized.type = fieldType;
  }
  if (field.value !== undefined) {
    normalized.value = field.value;
  }
  if (field.defaultValue !== undefined) {
    normalized.default_value = field.defaultValue;
  }
  if (page !== undefined) {
    normalized.page = page;
  }
  if (field.id) {
    normalized.id = field.id;
  }
  if (field.editable !== undefined) {
    normalized.editable = field.editable;
  }
  if (field.required !== undefined) {
    normalized.required = field.required;
  }
  if (boundingBox) {
    normalized.bounding_box = boundingBox;
  }
  return normalized;
};
|
|
1225
|
+
/**
 * Normalises one raw page annotation.
 *
 * `contents` and `title` are read through `textFromAnnotationField` (direct
 * value first, `*Obj` wrapper second); `url` falls back to `unsafeUrl`; the
 * subtype is trimmed. An annotation with no id, subtype, contents, title, url
 * and no `dest` is discarded even if it has a rectangle.
 *
 * @param {object} annotation - Raw annotation object.
 * @param {number} pageNum - Page number stored as `page`.
 * @returns {object|undefined} The normalised annotation, or `undefined` when
 *   it carries no identifying information.
 */
var normalizeAnnotation = (annotation, pageNum) => {
  const contents = textFromAnnotationField(annotation.contents, annotation.contentsObj);
  const title = textFromAnnotationField(annotation.title, annotation.titleObj);
  const boundingBox = buildRectBoundingBox(annotation.rect);
  const subtype = annotation.subtype?.trim();
  const url = annotation.url ?? annotation.unsafeUrl;
  const hasDescriptor = Boolean(annotation.id || subtype || contents || title || url);
  if (!hasDescriptor && annotation.dest === undefined) {
    return undefined;
  }
  const normalized = { page: pageNum };
  if (annotation.id) {
    normalized.id = annotation.id;
  }
  if (subtype) {
    normalized.subtype = subtype;
  }
  if (contents) {
    normalized.contents = contents;
  }
  if (title) {
    normalized.title = title;
  }
  if (url) {
    normalized.url = url;
  }
  if (annotation.dest !== undefined) {
    normalized.dest = annotation.dest;
  }
  if (boundingBox) {
    normalized.bounding_box = boundingBox;
  }
  return normalized;
};
|
|
1245
|
+
/**
 * Tells whether `value` is a non-null object (arrays included).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
var isRecord = (value) => {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
};
|
|
1246
|
+
/**
 * Normalises a structure-tree leaf (a content reference).
 *
 * Only string `type` and `id` values are used, both trimmed. A missing type
 * defaults to "content".
 *
 * @param {object} rawContent
 * @returns {{type:string,id?:string}|undefined} `undefined` when neither a
 *   type nor an id is present.
 */
var normalizeStructureTreeContent = (rawContent) => {
  const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
  const type = trimmedString(rawContent.type);
  const id = trimmedString(rawContent.id);
  if (!type && !id) {
    return undefined;
  }
  const content = { type: type || "content" };
  if (id) {
    content.id = id;
  }
  return content;
};

/**
 * Normalises one child of a structure-tree node.
 *
 * Non-object children are dropped. A child with a `role` or `children`
 * property is treated as a nested node; anything else as a content leaf.
 *
 * @param {unknown} rawChild
 * @returns {object|undefined}
 */
var normalizeStructureTreeChild = (rawChild) => {
  if (!isRecord(rawChild)) {
    return undefined;
  }
  const looksLikeNode = "role" in rawChild || "children" in rawChild;
  return looksLikeNode ? normalizeStructureTreeNode(rawChild) : normalizeStructureTreeContent(rawChild);
};

/**
 * Normalises a structure-tree node and, recursively, its children.
 *
 * The role is the trimmed `role` string, or "Unknown" when it is missing,
 * blank or not a string. `children` is included only when at least one child
 * survives normalisation.
 *
 * @param {object} rawNode
 * @returns {{role:string,children?:Array<object>}}
 */
var normalizeStructureTreeNode = (rawNode) => {
  let role = "Unknown";
  if (typeof rawNode.role === "string" && rawNode.role.trim()) {
    role = rawNode.role.trim();
  }
  const children = [];
  if (Array.isArray(rawNode.children)) {
    rawNode.children.forEach((child) => {
      const normalized = normalizeStructureTreeChild(child);
      if (normalized !== undefined) {
        children.push(normalized);
      }
    });
  }
  return children.length > 0 ? { role, children } : { role };
};
|
|
1272
|
+
/**
 * Reads and normalises the annotations of the given pages.
 *
 * Pages are processed one after another. A page is left out of the result
 * when it has no `getAnnotations` method, when none of its annotations
 * survive `normalizeAnnotation`, or when reading it fails (the failure is
 * logged).
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {Iterable<number>} pagesToProcess - Page numbers to read.
 * @returns {Promise<Array<{page:number, annotations:Array<object>}>>}
 */
var extractAnnotations = async (pdfDocument, pagesToProcess) => {
  const pageAnnotations = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      if (typeof page.getAnnotations === "function") {
        const rawAnnotations = await page.getAnnotations({ intent: "display" });
        const annotations = [];
        rawAnnotations.forEach((annotation) => {
          const normalized = normalizeAnnotation(annotation, pageNum);
          if (normalized !== undefined) {
            annotations.push(normalized);
          }
        });
        if (annotations.length > 0) {
          pageAnnotations.push({ page: pageNum, annotations });
        }
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn("Error extracting annotations from page", { pageNum, error: message });
    }
  }
  return pageAnnotations;
};
|
|
1291
|
+
/**
 * Reads and normalises the structure tree of each given page.
 *
 * Pages are processed one after another. A page is left out of the result
 * when it has no `getStructTree` method, when that method yields a falsy
 * value, or when reading it fails (the failure is logged).
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {Iterable<number>} pagesToProcess - Page numbers to read.
 * @returns {Promise<Array<{page:number, tree:object}>>}
 */
var extractStructureTrees = async (pdfDocument, pagesToProcess) => {
  const pageStructureTrees = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const rawTree = typeof page.getStructTree === "function" ? await page.getStructTree() : undefined;
      if (rawTree) {
        const tree = normalizeStructureTreeNode(rawTree);
        pageStructureTrees.push({ page: pageNum, tree });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn("Error extracting structure tree", { pageNum, error: message });
    }
  }
  return pageStructureTrees;
};
|
|
1312
|
+
/**
 * Collects size, rotation and view box for each given page.
 *
 * Width and height come from the scale-1 viewport when those values are
 * finite, otherwise from the extent of the page's view box. Pages without
 * usable dimensions are skipped with a warning, as are pages whose reading
 * fails. `rotation` defaults to 0; `user_unit` and `view_box` are included
 * only when available.
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {Iterable<number>} pagesToProcess - Page numbers to read.
 * @returns {Promise<Array<{page:number,width:number,height:number,rotation:number,user_unit?:number,view_box?:object}>>}
 */
var extractPageGeometry = async (pdfDocument, pagesToProcess) => {
  const pageGeometry = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const viewBox = buildRectBoundingBox(page.view);
      const viewport = page.getViewport({ scale: 1 });
      let width = viewport.width;
      if (!finiteNumber(width)) {
        width = viewBox ? viewBox.right - viewBox.left : undefined;
      }
      let height = viewport.height;
      if (!finiteNumber(height)) {
        height = viewBox ? viewBox.top - viewBox.bottom : undefined;
      }
      if (!(finiteNumber(width) && finiteNumber(height))) {
        logger3.warn("Skipping page geometry with invalid dimensions", { pageNum });
        continue;
      }
      const geometry = {
        page: pageNum,
        width,
        height,
        rotation: finiteNumber(page.rotate) ? page.rotate : 0
      };
      if (finiteNumber(page.userUnit)) {
        geometry.user_unit = page.userUnit;
      }
      if (viewBox) {
        geometry.view_box = viewBox;
      }
      pageGeometry.push(geometry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn("Error extracting page geometry", { pageNum, error: message });
    }
  }
  return pageGeometry;
};
|
|
1340
|
+
/**
 * Builds the warning list for page numbers that exceed the document length.
 *
 * @param {Array<number>} invalidPages - Requested page numbers that do not exist.
 * @param {number} totalPages - Number of pages in the document.
 * @returns {Array<string>} A single-message array, or an empty array when
 *   there are no invalid pages.
 */
var buildWarnings = (invalidPages, totalPages) => {
  if (invalidPages.length > 0) {
    const listed = invalidPages.join(", ");
    return [`Requested page numbers ${listed} exceed total pages (${String(totalPages)}).`];
  }
  return [];
};
|
|
1348
|
+
var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
|
|
1349
|
+
const contentItems = [];
|
|
1350
|
+
try {
|
|
1351
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
1352
|
+
const textContent = await page.getTextContent();
|
|
1353
|
+
const textByY = new Map;
|
|
1354
|
+
for (const item of textContent.items) {
|
|
1355
|
+
const textItem = item;
|
|
1356
|
+
const xCoord = textItem.transform?.[4];
|
|
1357
|
+
const yCoord = textItem.transform?.[5];
|
|
1358
|
+
if (yCoord === undefined)
|
|
1359
|
+
continue;
|
|
1360
|
+
const y = Math.round(yCoord);
|
|
1361
|
+
const width = textItem.width ?? textItem.str.length * 6;
|
|
1362
|
+
const height = textItem.height ?? Math.abs(textItem.transform?.[3] ?? 0);
|
|
1363
|
+
const boundingBox = buildBoundingBox2(xCoord, yCoord, width, height);
|
|
1364
|
+
if (!textByY.has(y)) {
|
|
1365
|
+
textByY.set(y, []);
|
|
1366
|
+
}
|
|
1367
|
+
textByY.get(y)?.push({
|
|
1368
|
+
text: textItem.str,
|
|
1369
|
+
x: xCoord ?? 0,
|
|
1370
|
+
width,
|
|
1371
|
+
height,
|
|
1372
|
+
bounding_box: boundingBox
|
|
1373
|
+
});
|
|
1374
|
+
}
|
|
1375
|
+
for (const [y, textParts] of textByY.entries()) {
|
|
1376
|
+
for (const segment of splitTextPartsIntoSegments(textParts)) {
|
|
1377
|
+
const contentItem = textSegmentToContentItem(y, segment);
|
|
1378
|
+
if (contentItem) {
|
|
1379
|
+
contentItems.push(contentItem);
|
|
1380
|
+
}
|
|
1381
|
+
}
|
|
1382
|
+
}
|
|
1383
|
+
if (includeImages) {
|
|
1384
|
+
const operatorList = await page.getOperatorList();
|
|
1385
|
+
const imageIndices = [];
|
|
1386
|
+
for (let i = 0;i < operatorList.fnArray.length; i++) {
|
|
1387
|
+
const op = operatorList.fnArray[i];
|
|
266
1388
|
if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
|
|
267
1389
|
imageIndices.push(i);
|
|
268
1390
|
}
|
|
@@ -273,10 +1395,15 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
|
|
|
273
1395
|
return null;
|
|
274
1396
|
}
|
|
275
1397
|
const imageName = argsArray[0];
|
|
276
|
-
let
|
|
1398
|
+
let xPosition;
|
|
1399
|
+
let yPosition;
|
|
277
1400
|
if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
|
|
278
1401
|
const transform = argsArray[1];
|
|
1402
|
+
const xCoord = transform[4];
|
|
279
1403
|
const yCoord = transform[5];
|
|
1404
|
+
if (xCoord !== undefined) {
|
|
1405
|
+
xPosition = Math.round(xCoord);
|
|
1406
|
+
}
|
|
280
1407
|
if (yCoord !== undefined) {
|
|
281
1408
|
yPosition = Math.round(yCoord);
|
|
282
1409
|
}
|
|
@@ -284,9 +1411,15 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
|
|
|
284
1411
|
const imageData = await retrieveImageData(page, imageName, pageNum);
|
|
285
1412
|
const extractedImage = processImageData(imageData, pageNum, arrayIndex);
|
|
286
1413
|
if (extractedImage) {
|
|
1414
|
+
const imageBox = buildBoundingBox2(xPosition, yPosition, extractedImage.width, extractedImage.height);
|
|
1415
|
+
extractedImage.bounding_box = imageBox;
|
|
287
1416
|
return {
|
|
288
1417
|
type: "image",
|
|
289
|
-
yPosition,
|
|
1418
|
+
yPosition: imageBox?.top ?? yPosition ?? 0,
|
|
1419
|
+
xPosition,
|
|
1420
|
+
width: extractedImage.width,
|
|
1421
|
+
height: extractedImage.height,
|
|
1422
|
+
bounding_box: imageBox,
|
|
290
1423
|
imageData: extractedImage
|
|
291
1424
|
};
|
|
292
1425
|
}
|
|
@@ -298,7 +1431,7 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
|
|
|
298
1431
|
}
|
|
299
1432
|
} catch (error) {
|
|
300
1433
|
const message = error instanceof Error ? error.message : String(error);
|
|
301
|
-
|
|
1434
|
+
logger3.warn("Error extracting page content", {
|
|
302
1435
|
pageNum,
|
|
303
1436
|
sourceDescription,
|
|
304
1437
|
error: message
|
|
@@ -311,7 +1444,7 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
|
|
|
311
1444
|
}
|
|
312
1445
|
];
|
|
313
1446
|
}
|
|
314
|
-
return contentItems
|
|
1447
|
+
return sortPageContentItems(contentItems);
|
|
315
1448
|
};
|
|
316
1449
|
|
|
317
1450
|
// src/pdf/loader.ts
|
|
@@ -527,7 +1660,7 @@ var resolvePath = (userPath) => {
|
|
|
527
1660
|
};
|
|
528
1661
|
|
|
529
1662
|
// src/pdf/loader.ts
|
|
530
|
-
var
|
|
1663
|
+
var logger4 = createLogger("Loader");
|
|
531
1664
|
var require2 = createRequire(import.meta.url);
|
|
532
1665
|
var PDFJS_ROOT = require2.resolve("pdfjs-dist/package.json").replace("package.json", "");
|
|
533
1666
|
var CMAP_URL = `${PDFJS_ROOT}cmaps/`;
|
|
@@ -651,7 +1784,7 @@ var fetchUrlBody = async (url, config) => {
|
|
|
651
1784
|
throw new PdfError(-32600 /* InvalidRequest */, `URL fetch timed out after ${String(URL_FETCH_TIMEOUT_MS / 1000)}s.`, { cause: err });
|
|
652
1785
|
}
|
|
653
1786
|
const message = err instanceof Error ? err.message : String(err);
|
|
654
|
-
|
|
1787
|
+
logger4.warn("URL fetch failed", { url, error: message });
|
|
655
1788
|
throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed for '${url}'.`, {
|
|
656
1789
|
cause: err instanceof Error ? err : undefined
|
|
657
1790
|
});
|
|
@@ -666,387 +1799,113 @@ var loadPdfDocument = async (source, sourceDescription) => {
|
|
|
666
1799
|
if (source.path) {
|
|
667
1800
|
pdfData = await loadLocalFile(source.path);
|
|
668
1801
|
} else if (source.url) {
|
|
669
|
-
const config = getSecurityConfig();
|
|
670
|
-
pdfData = await fetchUrlBody(source.url, config);
|
|
671
|
-
} else {
|
|
672
|
-
throw new PdfError(-32602 /* InvalidParams */, `Source ${safeSource} missing 'path' or 'url'.`);
|
|
673
|
-
}
|
|
674
|
-
} catch (err) {
|
|
675
|
-
if (err instanceof PdfError) {
|
|
676
|
-
throw err;
|
|
677
|
-
}
|
|
678
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
679
|
-
logger3.error("Unexpected error preparing PDF source", {
|
|
680
|
-
sourceDescription: safeSource,
|
|
681
|
-
error: message
|
|
682
|
-
});
|
|
683
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Failed to prepare PDF source ${safeSource}.`, {
|
|
684
|
-
cause: err instanceof Error ? err : undefined
|
|
685
|
-
});
|
|
686
|
-
}
|
|
687
|
-
const loadingTask = getDocument({
|
|
688
|
-
data: pdfData,
|
|
689
|
-
cMapUrl: CMAP_URL,
|
|
690
|
-
cMapPacked: true,
|
|
691
|
-
standardFontDataUrl: STANDARD_FONT_DATA_URL,
|
|
692
|
-
wasmUrl: WASM_URL,
|
|
693
|
-
iccUrl: ICC_URL
|
|
694
|
-
});
|
|
695
|
-
try {
|
|
696
|
-
return await loadingTask.promise;
|
|
697
|
-
} catch (err) {
|
|
698
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
699
|
-
logger3.error("PDF.js loading error", { sourceDescription: safeSource, error: message });
|
|
700
|
-
throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${safeSource}.`, { cause: err instanceof Error ? err : undefined });
|
|
701
|
-
}
|
|
702
|
-
};
|
|
703
|
-
|
|
704
|
-
// src/pdf/parser.ts
|
|
705
|
-
var logger4 = createLogger("Parser");
|
|
706
|
-
var MAX_RANGE_SIZE = 1e4;
|
|
707
|
-
var parseRangePart = (part, pages) => {
|
|
708
|
-
const trimmedPart = part.trim();
|
|
709
|
-
if (trimmedPart.includes("-")) {
|
|
710
|
-
const splitResult = trimmedPart.split("-");
|
|
711
|
-
const startStr = splitResult[0] || "";
|
|
712
|
-
const endStr = splitResult[1];
|
|
713
|
-
const start = parseInt(startStr, 10);
|
|
714
|
-
const end = endStr === "" || endStr === undefined ? Infinity : parseInt(endStr, 10);
|
|
715
|
-
if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
|
|
716
|
-
throw new Error(`Invalid page range values: ${trimmedPart}`);
|
|
717
|
-
}
|
|
718
|
-
const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
|
|
719
|
-
for (let i = start;i <= practicalEnd; i++) {
|
|
720
|
-
pages.add(i);
|
|
721
|
-
}
|
|
722
|
-
if (end === Infinity && practicalEnd === start + MAX_RANGE_SIZE) {
|
|
723
|
-
logger4.warn("Open-ended range truncated", { start, practicalEnd });
|
|
724
|
-
}
|
|
725
|
-
} else {
|
|
726
|
-
const page = parseInt(trimmedPart, 10);
|
|
727
|
-
if (Number.isNaN(page) || page <= 0) {
|
|
728
|
-
throw new Error(`Invalid page number: ${trimmedPart}`);
|
|
729
|
-
}
|
|
730
|
-
pages.add(page);
|
|
731
|
-
}
|
|
732
|
-
};
|
|
733
|
-
var parsePageRanges = (ranges) => {
|
|
734
|
-
const pages = new Set;
|
|
735
|
-
const parts = ranges.split(",");
|
|
736
|
-
for (const part of parts) {
|
|
737
|
-
parseRangePart(part, pages);
|
|
738
|
-
}
|
|
739
|
-
if (pages.size === 0) {
|
|
740
|
-
throw new Error("Page range string resulted in zero valid pages.");
|
|
741
|
-
}
|
|
742
|
-
return Array.from(pages).sort((a, b) => a - b);
|
|
743
|
-
};
|
|
744
|
-
var getTargetPages = (sourcePages, sourceDescription) => {
|
|
745
|
-
if (!sourcePages) {
|
|
746
|
-
return;
|
|
747
|
-
}
|
|
748
|
-
try {
|
|
749
|
-
if (typeof sourcePages === "string") {
|
|
750
|
-
return parsePageRanges(sourcePages);
|
|
751
|
-
}
|
|
752
|
-
if (sourcePages.some((p) => !Number.isInteger(p) || p <= 0)) {
|
|
753
|
-
throw new Error("Page numbers in array must be positive integers.");
|
|
754
|
-
}
|
|
755
|
-
const uniquePages = [...new Set(sourcePages)].sort((a, b) => a - b);
|
|
756
|
-
if (uniquePages.length === 0) {
|
|
757
|
-
throw new Error("Page specification resulted in an empty set of pages.");
|
|
758
|
-
}
|
|
759
|
-
return uniquePages;
|
|
760
|
-
} catch (error) {
|
|
761
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
762
|
-
throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${message}`);
|
|
763
|
-
}
|
|
764
|
-
};
|
|
765
|
-
var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
|
|
766
|
-
if (targetPages) {
|
|
767
|
-
const pagesToProcess = targetPages.filter((p) => p <= totalPages);
|
|
768
|
-
const invalidPages = targetPages.filter((p) => p > totalPages);
|
|
769
|
-
return { pagesToProcess, invalidPages };
|
|
770
|
-
}
|
|
771
|
-
if (includeFullText) {
|
|
772
|
-
const pagesToProcess = Array.from({ length: totalPages }, (_, i) => i + 1);
|
|
773
|
-
return { pagesToProcess, invalidPages: [] };
|
|
774
|
-
}
|
|
775
|
-
return { pagesToProcess: [], invalidPages: [] };
|
|
776
|
-
};
|
|
777
|
-
|
|
778
|
-
// src/pdf/tableExtractor.ts
|
|
779
|
-
var logger5 = createLogger("TableExtractor");
|
|
780
|
-
var Y_TOLERANCE = 5;
|
|
781
|
-
var COLUMN_GAP_THRESHOLD = 15;
|
|
782
|
-
var MIN_ROWS = 2;
|
|
783
|
-
var MIN_COLS = 2;
|
|
784
|
-
var MIN_ROW_ITEMS = 2;
|
|
785
|
-
var extractTextItemsWithPositions = async (page) => {
|
|
786
|
-
const textContent = await page.getTextContent();
|
|
787
|
-
const items = [];
|
|
788
|
-
for (const item of textContent.items) {
|
|
789
|
-
const textItem = item;
|
|
790
|
-
if (!textItem.str.trim())
|
|
791
|
-
continue;
|
|
792
|
-
if (!textItem.transform || textItem.transform.length < 6)
|
|
793
|
-
continue;
|
|
794
|
-
const x = textItem.transform[4];
|
|
795
|
-
const y = textItem.transform[5];
|
|
796
|
-
if (x === undefined || y === undefined)
|
|
797
|
-
continue;
|
|
798
|
-
items.push({
|
|
799
|
-
text: textItem.str,
|
|
800
|
-
x,
|
|
801
|
-
y,
|
|
802
|
-
width: textItem.width ?? textItem.str.length * 6
|
|
803
|
-
});
|
|
804
|
-
}
|
|
805
|
-
return items;
|
|
806
|
-
};
|
|
807
|
-
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
|
|
808
|
-
if (items.length === 0)
|
|
809
|
-
return [];
|
|
810
|
-
const sorted = [...items].sort((a, b) => b.y - a.y);
|
|
811
|
-
const firstItem = sorted[0];
|
|
812
|
-
if (!firstItem)
|
|
813
|
-
return [];
|
|
814
|
-
const rows = [];
|
|
815
|
-
let currentRow = { y: firstItem.y, items: [firstItem] };
|
|
816
|
-
for (let i = 1;i < sorted.length; i++) {
|
|
817
|
-
const item = sorted[i];
|
|
818
|
-
if (!item)
|
|
819
|
-
continue;
|
|
820
|
-
const yDiff = Math.abs(currentRow.y - item.y);
|
|
821
|
-
if (yDiff <= tolerance) {
|
|
822
|
-
currentRow.items.push(item);
|
|
823
|
-
} else {
|
|
824
|
-
rows.push(currentRow);
|
|
825
|
-
currentRow = { y: item.y, items: [item] };
|
|
826
|
-
}
|
|
827
|
-
}
|
|
828
|
-
rows.push(currentRow);
|
|
829
|
-
for (const row of rows) {
|
|
830
|
-
row.items.sort((a, b) => a.x - b.x);
|
|
831
|
-
}
|
|
832
|
-
return rows;
|
|
833
|
-
};
|
|
834
|
-
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
|
|
835
|
-
if (rows.length === 0)
|
|
836
|
-
return [];
|
|
837
|
-
const allXPositions = [];
|
|
838
|
-
for (const row of rows) {
|
|
839
|
-
for (const item of row.items) {
|
|
840
|
-
allXPositions.push(item.x);
|
|
841
|
-
}
|
|
842
|
-
}
|
|
843
|
-
if (allXPositions.length === 0)
|
|
844
|
-
return [];
|
|
845
|
-
allXPositions.sort((a, b) => a - b);
|
|
846
|
-
const firstX = allXPositions[0];
|
|
847
|
-
if (firstX === undefined)
|
|
848
|
-
return [];
|
|
849
|
-
const boundaries = [firstX];
|
|
850
|
-
for (let i = 1;i < allXPositions.length; i++) {
|
|
851
|
-
const current = allXPositions[i];
|
|
852
|
-
const previous = allXPositions[i - 1];
|
|
853
|
-
if (current === undefined || previous === undefined)
|
|
854
|
-
continue;
|
|
855
|
-
const gap = current - previous;
|
|
856
|
-
if (gap >= gapThreshold) {
|
|
857
|
-
boundaries.push(current);
|
|
1802
|
+
const config = getSecurityConfig();
|
|
1803
|
+
pdfData = await fetchUrlBody(source.url, config);
|
|
1804
|
+
} else {
|
|
1805
|
+
throw new PdfError(-32602 /* InvalidParams */, `Source ${safeSource} missing 'path' or 'url'.`);
|
|
858
1806
|
}
|
|
859
|
-
}
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
var assignToColumns = (row, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
|
|
863
|
-
const cells = new Array(columnBoundaries.length).fill("");
|
|
864
|
-
for (const item of row.items) {
|
|
865
|
-
let colIndex = 0;
|
|
866
|
-
for (let i = columnBoundaries.length - 1;i >= 0; i--) {
|
|
867
|
-
const boundary = columnBoundaries[i];
|
|
868
|
-
if (boundary !== undefined && item.x >= boundary - tolerance) {
|
|
869
|
-
colIndex = i;
|
|
870
|
-
break;
|
|
871
|
-
}
|
|
1807
|
+
} catch (err) {
|
|
1808
|
+
if (err instanceof PdfError) {
|
|
1809
|
+
throw err;
|
|
872
1810
|
}
|
|
873
|
-
const
|
|
874
|
-
|
|
1811
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1812
|
+
logger4.error("Unexpected error preparing PDF source", {
|
|
1813
|
+
sourceDescription: safeSource,
|
|
1814
|
+
error: message
|
|
1815
|
+
});
|
|
1816
|
+
throw new PdfError(-32600 /* InvalidRequest */, `Failed to prepare PDF source ${safeSource}.`, {
|
|
1817
|
+
cause: err instanceof Error ? err : undefined
|
|
1818
|
+
});
|
|
875
1819
|
}
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
1820
|
+
const loadingTask = getDocument({
|
|
1821
|
+
data: pdfData,
|
|
1822
|
+
cMapUrl: CMAP_URL,
|
|
1823
|
+
cMapPacked: true,
|
|
1824
|
+
standardFontDataUrl: STANDARD_FONT_DATA_URL,
|
|
1825
|
+
wasmUrl: WASM_URL,
|
|
1826
|
+
iccUrl: ICC_URL
|
|
1827
|
+
});
|
|
1828
|
+
try {
|
|
1829
|
+
return await loadingTask.promise;
|
|
1830
|
+
} catch (err) {
|
|
1831
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1832
|
+
logger4.error("PDF.js loading error", { sourceDescription: safeSource, error: message });
|
|
1833
|
+
throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${safeSource}.`, { cause: err instanceof Error ? err : undefined });
|
|
881
1834
|
}
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
1835
|
+
};
|
|
1836
|
+
|
|
1837
|
+
// src/pdf/parser.ts
|
|
1838
|
+
var logger5 = createLogger("Parser");
|
|
1839
|
+
var MAX_RANGE_SIZE = 1e4;
|
|
1840
|
+
var parseRangePart = (part, pages) => {
|
|
1841
|
+
const trimmedPart = part.trim();
|
|
1842
|
+
if (trimmedPart.includes("-")) {
|
|
1843
|
+
const splitResult = trimmedPart.split("-");
|
|
1844
|
+
const startStr = splitResult[0] || "";
|
|
1845
|
+
const endStr = splitResult[1];
|
|
1846
|
+
const start = parseInt(startStr, 10);
|
|
1847
|
+
const end = endStr === "" || endStr === undefined ? Infinity : parseInt(endStr, 10);
|
|
1848
|
+
if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
|
|
1849
|
+
throw new Error(`Invalid page range values: ${trimmedPart}`);
|
|
894
1850
|
}
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
if (rows.length >= 2) {
|
|
899
|
-
const spacings = [];
|
|
900
|
-
for (let i = 1;i < rows.length; i++) {
|
|
901
|
-
const prevRow = rows[i - 1];
|
|
902
|
-
const currRow = rows[i];
|
|
903
|
-
if (prevRow && currRow) {
|
|
904
|
-
spacings.push(Math.abs(prevRow.y - currRow.y));
|
|
905
|
-
}
|
|
1851
|
+
const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
|
|
1852
|
+
for (let i = start;i <= practicalEnd; i++) {
|
|
1853
|
+
pages.add(i);
|
|
906
1854
|
}
|
|
907
|
-
if (
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
1855
|
+
if (end === Infinity && practicalEnd === start + MAX_RANGE_SIZE) {
|
|
1856
|
+
logger5.warn("Open-ended range truncated", { start, practicalEnd });
|
|
1857
|
+
}
|
|
1858
|
+
} else {
|
|
1859
|
+
const page = parseInt(trimmedPart, 10);
|
|
1860
|
+
if (Number.isNaN(page) || page <= 0) {
|
|
1861
|
+
throw new Error(`Invalid page number: ${trimmedPart}`);
|
|
914
1862
|
}
|
|
1863
|
+
pages.add(page);
|
|
915
1864
|
}
|
|
916
|
-
return checks > 0 ? Math.min(1, score / checks) : 0;
|
|
917
1865
|
};
|
|
918
|
-
var
|
|
919
|
-
const
|
|
920
|
-
const
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
}
|
|
924
|
-
const columnBoundaries = detectColumnBoundaries(candidateRows);
|
|
925
|
-
if (columnBoundaries.length < MIN_COLS) {
|
|
926
|
-
return regions;
|
|
927
|
-
}
|
|
928
|
-
let currentRegion = [];
|
|
929
|
-
for (const row of candidateRows) {
|
|
930
|
-
const alignedItems = row.items.filter((item) => {
|
|
931
|
-
return columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);
|
|
932
|
-
});
|
|
933
|
-
if (alignedItems.length >= MIN_COLS - 1) {
|
|
934
|
-
currentRegion.push(row);
|
|
935
|
-
} else if (currentRegion.length >= MIN_ROWS) {
|
|
936
|
-
const firstRow = currentRegion[0];
|
|
937
|
-
const lastRow = currentRegion[currentRegion.length - 1];
|
|
938
|
-
if (firstRow && lastRow) {
|
|
939
|
-
regions.push({
|
|
940
|
-
rows: currentRegion,
|
|
941
|
-
columnBoundaries,
|
|
942
|
-
startY: firstRow.y,
|
|
943
|
-
endY: lastRow.y
|
|
944
|
-
});
|
|
945
|
-
}
|
|
946
|
-
currentRegion = [];
|
|
947
|
-
} else {
|
|
948
|
-
currentRegion = [];
|
|
949
|
-
}
|
|
1866
|
+
var parsePageRanges = (ranges) => {
|
|
1867
|
+
const pages = new Set;
|
|
1868
|
+
const parts = ranges.split(",");
|
|
1869
|
+
for (const part of parts) {
|
|
1870
|
+
parseRangePart(part, pages);
|
|
950
1871
|
}
|
|
951
|
-
if (
|
|
952
|
-
|
|
953
|
-
const lastRow = currentRegion[currentRegion.length - 1];
|
|
954
|
-
if (firstRow && lastRow) {
|
|
955
|
-
regions.push({
|
|
956
|
-
rows: currentRegion,
|
|
957
|
-
columnBoundaries,
|
|
958
|
-
startY: firstRow.y,
|
|
959
|
-
endY: lastRow.y
|
|
960
|
-
});
|
|
961
|
-
}
|
|
1872
|
+
if (pages.size === 0) {
|
|
1873
|
+
throw new Error("Page range string resulted in zero valid pages.");
|
|
962
1874
|
}
|
|
963
|
-
return
|
|
1875
|
+
return Array.from(pages).sort((a, b) => a - b);
|
|
964
1876
|
};
|
|
965
|
-
var
|
|
966
|
-
|
|
1877
|
+
var getTargetPages = (sourcePages, sourceDescription) => {
|
|
1878
|
+
if (!sourcePages) {
|
|
1879
|
+
return;
|
|
1880
|
+
}
|
|
967
1881
|
try {
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
return tables;
|
|
1882
|
+
if (typeof sourcePages === "string") {
|
|
1883
|
+
return parsePageRanges(sourcePages);
|
|
971
1884
|
}
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
const tableRows = [];
|
|
979
|
-
for (const row of region.rows) {
|
|
980
|
-
const cells = assignToColumns(row, region.columnBoundaries);
|
|
981
|
-
tableRows.push(cells);
|
|
982
|
-
}
|
|
983
|
-
const confidence = calculateConfidence(region.rows, region.columnBoundaries);
|
|
984
|
-
if (confidence >= 0.3) {
|
|
985
|
-
tables.push({
|
|
986
|
-
page: pageNum,
|
|
987
|
-
tableIndex,
|
|
988
|
-
rows: tableRows,
|
|
989
|
-
rowCount: tableRows.length,
|
|
990
|
-
colCount: region.columnBoundaries.length,
|
|
991
|
-
confidence: Math.round(confidence * 100) / 100
|
|
992
|
-
});
|
|
993
|
-
}
|
|
1885
|
+
if (sourcePages.some((p) => !Number.isInteger(p) || p <= 0)) {
|
|
1886
|
+
throw new Error("Page numbers in array must be positive integers.");
|
|
1887
|
+
}
|
|
1888
|
+
const uniquePages = [...new Set(sourcePages)].sort((a, b) => a - b);
|
|
1889
|
+
if (uniquePages.length === 0) {
|
|
1890
|
+
throw new Error("Page specification resulted in an empty set of pages.");
|
|
994
1891
|
}
|
|
1892
|
+
return uniquePages;
|
|
995
1893
|
} catch (error) {
|
|
996
1894
|
const message = error instanceof Error ? error.message : String(error);
|
|
997
|
-
|
|
998
|
-
}
|
|
999
|
-
return tables;
|
|
1000
|
-
};
|
|
1001
|
-
var extractTables = async (pdfDocument, pagesToProcess) => {
|
|
1002
|
-
const allTables = [];
|
|
1003
|
-
for (const pageNum of pagesToProcess) {
|
|
1004
|
-
try {
|
|
1005
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
1006
|
-
const pageTables = await extractTablesFromPage(page, pageNum);
|
|
1007
|
-
allTables.push(...pageTables);
|
|
1008
|
-
} catch (error) {
|
|
1009
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1010
|
-
logger5.warn("Error getting page for table extraction", { pageNum, error: message });
|
|
1011
|
-
}
|
|
1895
|
+
throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${message}`);
|
|
1012
1896
|
}
|
|
1013
|
-
return allTables;
|
|
1014
1897
|
};
|
|
1015
|
-
var
|
|
1016
|
-
if (
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
if (!headerRow)
|
|
1021
|
-
return "";
|
|
1022
|
-
lines.push(`| ${headerRow.map((cell) => cell.trim() || " ").join(" | ")} |`);
|
|
1023
|
-
lines.push(`| ${headerRow.map(() => "---").join(" | ")} |`);
|
|
1024
|
-
for (let i = 1;i < table.rows.length; i++) {
|
|
1025
|
-
const row = table.rows[i];
|
|
1026
|
-
if (!row)
|
|
1027
|
-
continue;
|
|
1028
|
-
const paddedRow = [...row];
|
|
1029
|
-
while (paddedRow.length < headerRow.length) {
|
|
1030
|
-
paddedRow.push("");
|
|
1031
|
-
}
|
|
1032
|
-
lines.push(`| ${paddedRow.map((cell) => cell.trim() || " ").join(" | ")} |`);
|
|
1898
|
+
var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
|
|
1899
|
+
if (targetPages) {
|
|
1900
|
+
const pagesToProcess = targetPages.filter((p) => p <= totalPages);
|
|
1901
|
+
const invalidPages = targetPages.filter((p) => p > totalPages);
|
|
1902
|
+
return { pagesToProcess, invalidPages };
|
|
1033
1903
|
}
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
};
|
|
1037
|
-
var tablesToMarkdown = (tables) => {
|
|
1038
|
-
if (tables.length === 0)
|
|
1039
|
-
return "";
|
|
1040
|
-
const sections = ["## Extracted Tables", ""];
|
|
1041
|
-
for (const table of tables) {
|
|
1042
|
-
sections.push(`### Page ${table.page}, Table ${table.tableIndex + 1}`);
|
|
1043
|
-
sections.push(`*Confidence: ${(table.confidence * 100).toFixed(0)}%*`);
|
|
1044
|
-
sections.push("");
|
|
1045
|
-
sections.push(tableToMarkdown(table));
|
|
1046
|
-
sections.push("");
|
|
1904
|
+
if (includeFullText) {
|
|
1905
|
+
const pagesToProcess = Array.from({ length: totalPages }, (_, i) => i + 1);
|
|
1906
|
+
return { pagesToProcess, invalidPages: [] };
|
|
1047
1907
|
}
|
|
1048
|
-
return
|
|
1049
|
-
`);
|
|
1908
|
+
return { pagesToProcess: [], invalidPages: [] };
|
|
1050
1909
|
};
|
|
1051
1910
|
|
|
1052
1911
|
// src/schemas/readPdf.ts
|
|
@@ -1075,7 +1934,21 @@ var readPdfArgsSchema = object({
|
|
|
1075
1934
|
include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
|
|
1076
1935
|
include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
|
|
1077
1936
|
include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
|
|
1078
|
-
include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures.")))
|
|
1937
|
+
include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures."))),
|
|
1938
|
+
include_elements: optional(bool(description("Include agent-ready structured document elements with page numbers, stable IDs, provenance, and best-effort bounding boxes."))),
|
|
1939
|
+
include_semantic_hints: optional(bool(description("Include deterministic semantic hints on text elements, such as heading, list item, or paragraph."))),
|
|
1940
|
+
include_markdown: optional(bool(description("Include a Markdown rendering of extracted pages for RAG, summarization, and agent context."))),
|
|
1941
|
+
include_html: optional(bool(description("Include a simple HTML rendering of extracted pages for preview, export, and downstream conversion."))),
|
|
1942
|
+
include_chunks: optional(bool(description("Include page-level citation-ready chunks with text, element IDs, page ranges, and best-effort bounding boxes."))),
|
|
1943
|
+
include_outline: optional(bool(description("Include document outline/bookmark entries when the PDF exposes them."))),
|
|
1944
|
+
include_annotations: optional(bool(description("Include page annotations such as links, notes, and form-related annotations with safe summary fields."))),
|
|
1945
|
+
include_page_labels: optional(bool(description("Include PDF page labels when available, such as roman numerals or section labels."))),
|
|
1946
|
+
include_page_geometry: optional(bool(description("Include page viewport geometry such as width, height, rotation, user unit, and view box."))),
|
|
1947
|
+
include_permissions: optional(bool(description("Include PDF permission and marking signals when exposed by the parser."))),
|
|
1948
|
+
include_form_fields: optional(bool(description("Include PDF form field summaries when AcroForm fields are exposed."))),
|
|
1949
|
+
include_attachments: optional(bool(description("Include embedded attachment metadata such as filename and size. Attachment bytes are not returned."))),
|
|
1950
|
+
include_structure_tree: optional(bool(description("Include best-effort tagged PDF structure trees for selected pages when the PDF exposes them."))),
|
|
1951
|
+
include_safety_findings: optional(bool(description("Include deterministic content safety findings for prompt-injection patterns, tiny text, and off-page text.")))
|
|
1079
1952
|
});
|
|
1080
1953
|
|
|
1081
1954
|
// src/handlers/readPdf.ts
|
|
@@ -1091,41 +1964,63 @@ var processSingleSource = async (source, options) => {
|
|
|
1091
1964
|
const totalPages = pdfDocument.numPages;
|
|
1092
1965
|
const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.includeMetadata, options.includePageCount);
|
|
1093
1966
|
const output = { ...metadataOutput };
|
|
1094
|
-
const
|
|
1967
|
+
const structureOutput = await extractDocumentStructure(pdfDocument, {
|
|
1968
|
+
includeOutline: options.includeOutline,
|
|
1969
|
+
includePageLabels: options.includePageLabels,
|
|
1970
|
+
includePermissions: options.includePermissions,
|
|
1971
|
+
includeFormFields: options.includeFormFields,
|
|
1972
|
+
includeAttachments: options.includeAttachments
|
|
1973
|
+
});
|
|
1974
|
+
Object.assign(output, structureOutput);
|
|
1975
|
+
const explicitPageContent = options.includeFullText || options.includeElements || options.includeSemanticHints || options.includeMarkdown || options.includeHtml || options.includeChunks || options.includeImages || options.includeSafetyFindings;
|
|
1976
|
+
const pageScopedMetadata = options.includeTables || options.includeAnnotations || options.includePageGeometry || options.includeStructureTree;
|
|
1977
|
+
const includeSelectedPageText = targetPages !== undefined && !explicitPageContent && !pageScopedMetadata;
|
|
1978
|
+
const shouldSelectPages = explicitPageContent || includeSelectedPageText || pageScopedMetadata;
|
|
1979
|
+
const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, shouldSelectPages);
|
|
1095
1980
|
const warnings = buildWarnings(invalidPages, totalPages);
|
|
1096
1981
|
if (warnings.length > 0) {
|
|
1097
1982
|
output.warnings = warnings;
|
|
1098
1983
|
}
|
|
1099
1984
|
if (pagesToProcess.length > 0) {
|
|
1100
|
-
const
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
if (i + MAX_CONCURRENT_PAGES < pagesToProcess.length) {
|
|
1107
|
-
await new Promise((resolve) => setImmediate(resolve));
|
|
1985
|
+
const needsPageContent = explicitPageContent || includeSelectedPageText;
|
|
1986
|
+
let pageGeometry;
|
|
1987
|
+
if (options.includePageGeometry || options.includeSafetyFindings) {
|
|
1988
|
+
pageGeometry = await extractPageGeometry(pdfDocument, pagesToProcess);
|
|
1989
|
+
if (pageGeometry.length > 0 && options.includePageGeometry) {
|
|
1990
|
+
output.page_geometry = pageGeometry;
|
|
1108
1991
|
}
|
|
1109
1992
|
}
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
output.
|
|
1993
|
+
if (needsPageContent) {
|
|
1994
|
+
const MAX_CONCURRENT_PAGES = 5;
|
|
1995
|
+
const pageContents = [];
|
|
1996
|
+
for (let i = 0;i < pagesToProcess.length; i += MAX_CONCURRENT_PAGES) {
|
|
1997
|
+
const batch = pagesToProcess.slice(i, i + MAX_CONCURRENT_PAGES);
|
|
1998
|
+
const batchResults = await Promise.all(batch.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
|
|
1999
|
+
pageContents.push(...batchResults);
|
|
2000
|
+
if (i + MAX_CONCURRENT_PAGES < pagesToProcess.length) {
|
|
2001
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
2002
|
+
}
|
|
2003
|
+
}
|
|
2004
|
+
output.page_contents = pageContents.map((items, idx) => ({
|
|
2005
|
+
page: pagesToProcess[idx],
|
|
2006
|
+
items
|
|
2007
|
+
}));
|
|
2008
|
+
const extractedPageTexts = pageContents.map((items, idx) => ({
|
|
2009
|
+
page: pagesToProcess[idx],
|
|
2010
|
+
text: items.filter((item) => item.type === "text").map((item) => item.textContent).join("")
|
|
2011
|
+
}));
|
|
2012
|
+
if (targetPages) {
|
|
2013
|
+
output.page_texts = extractedPageTexts;
|
|
2014
|
+
} else if (options.includeFullText) {
|
|
2015
|
+
output.full_text = extractedPageTexts.map((p) => p.text).join(`
|
|
1122
2016
|
|
|
1123
2017
|
`);
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
2018
|
+
}
|
|
2019
|
+
if (options.includeImages) {
|
|
2020
|
+
const extractedImages = pageContents.flatMap((items) => items.filter((item) => item.type === "image" && item.imageData)).map((item) => item.imageData).filter((img) => img !== undefined);
|
|
2021
|
+
if (extractedImages.length > 0) {
|
|
2022
|
+
output.images = extractedImages;
|
|
2023
|
+
}
|
|
1129
2024
|
}
|
|
1130
2025
|
}
|
|
1131
2026
|
if (options.includeTables) {
|
|
@@ -1134,6 +2029,40 @@ var processSingleSource = async (source, options) => {
|
|
|
1134
2029
|
output.tables = extractedTables;
|
|
1135
2030
|
}
|
|
1136
2031
|
}
|
|
2032
|
+
const buildElementsForOutput = () => buildStructuredElements(output.page_contents ?? [], output.tables, options.includeSemanticHints);
|
|
2033
|
+
if ((options.includeElements || options.includeSemanticHints) && output.page_contents) {
|
|
2034
|
+
output.elements = buildElementsForOutput();
|
|
2035
|
+
}
|
|
2036
|
+
if (options.includeMarkdown && output.page_contents) {
|
|
2037
|
+
output.markdown = renderMarkdownFromPageContents(output.page_contents, output.tables);
|
|
2038
|
+
}
|
|
2039
|
+
if (options.includeHtml && output.page_contents) {
|
|
2040
|
+
output.html = renderHtmlFromPageContents(output.page_contents, output.tables);
|
|
2041
|
+
}
|
|
2042
|
+
if (options.includeChunks && output.page_contents) {
|
|
2043
|
+
const chunkElements = output.elements ?? buildElementsForOutput();
|
|
2044
|
+
output.chunks = buildCitationChunks(chunkElements, {
|
|
2045
|
+
useSemanticBoundaries: options.includeSemanticHints
|
|
2046
|
+
});
|
|
2047
|
+
}
|
|
2048
|
+
if (options.includeSafetyFindings && output.page_contents) {
|
|
2049
|
+
const safetyFindings = buildSafetyFindings(output.page_contents, pageGeometry);
|
|
2050
|
+
if (safetyFindings.length > 0) {
|
|
2051
|
+
output.safety_findings = safetyFindings;
|
|
2052
|
+
}
|
|
2053
|
+
}
|
|
2054
|
+
if (options.includeAnnotations) {
|
|
2055
|
+
const annotations = await extractAnnotations(pdfDocument, pagesToProcess);
|
|
2056
|
+
if (annotations.length > 0) {
|
|
2057
|
+
output.annotations = annotations;
|
|
2058
|
+
}
|
|
2059
|
+
}
|
|
2060
|
+
if (options.includeStructureTree) {
|
|
2061
|
+
const structureTrees = await extractStructureTrees(pdfDocument, pagesToProcess);
|
|
2062
|
+
if (structureTrees.length > 0) {
|
|
2063
|
+
output.structure_trees = structureTrees;
|
|
2064
|
+
}
|
|
2065
|
+
}
|
|
1137
2066
|
}
|
|
1138
2067
|
individualResult = { ...individualResult, data: output, success: true };
|
|
1139
2068
|
} catch (error) {
|
|
@@ -1171,7 +2100,21 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
1171
2100
|
include_metadata,
|
|
1172
2101
|
include_page_count,
|
|
1173
2102
|
include_images,
|
|
1174
|
-
include_tables
|
|
2103
|
+
include_tables,
|
|
2104
|
+
include_elements,
|
|
2105
|
+
include_semantic_hints,
|
|
2106
|
+
include_markdown,
|
|
2107
|
+
include_html,
|
|
2108
|
+
include_chunks,
|
|
2109
|
+
include_outline,
|
|
2110
|
+
include_annotations,
|
|
2111
|
+
include_page_labels,
|
|
2112
|
+
include_page_geometry,
|
|
2113
|
+
include_permissions,
|
|
2114
|
+
include_form_fields,
|
|
2115
|
+
include_attachments,
|
|
2116
|
+
include_structure_tree,
|
|
2117
|
+
include_safety_findings
|
|
1175
2118
|
} = input;
|
|
1176
2119
|
const MAX_CONCURRENT_SOURCES = 3;
|
|
1177
2120
|
const results = [];
|
|
@@ -1180,7 +2123,21 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
1180
2123
|
includeMetadata: include_metadata ?? true,
|
|
1181
2124
|
includePageCount: include_page_count ?? true,
|
|
1182
2125
|
includeImages: include_images ?? false,
|
|
1183
|
-
includeTables: include_tables ?? false
|
|
2126
|
+
includeTables: include_tables ?? false,
|
|
2127
|
+
includeElements: include_elements ?? false,
|
|
2128
|
+
includeSemanticHints: include_semantic_hints ?? false,
|
|
2129
|
+
includeMarkdown: include_markdown ?? false,
|
|
2130
|
+
includeHtml: include_html ?? false,
|
|
2131
|
+
includeChunks: include_chunks ?? false,
|
|
2132
|
+
includeOutline: include_outline ?? false,
|
|
2133
|
+
includeAnnotations: include_annotations ?? false,
|
|
2134
|
+
includePageLabels: include_page_labels ?? false,
|
|
2135
|
+
includePageGeometry: include_page_geometry ?? false,
|
|
2136
|
+
includePermissions: include_permissions ?? false,
|
|
2137
|
+
includeFormFields: include_form_fields ?? false,
|
|
2138
|
+
includeAttachments: include_attachments ?? false,
|
|
2139
|
+
includeStructureTree: include_structure_tree ?? false,
|
|
2140
|
+
includeSafetyFindings: include_safety_findings ?? false
|
|
1184
2141
|
};
|
|
1185
2142
|
for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES) {
|
|
1186
2143
|
const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
|
|
@@ -1212,6 +2169,8 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
1212
2169
|
tableIndex: tbl.tableIndex,
|
|
1213
2170
|
rowCount: tbl.rowCount,
|
|
1214
2171
|
colCount: tbl.colCount,
|
|
2172
|
+
cellCount: tbl.cells?.length ?? tbl.rowCount * tbl.colCount,
|
|
2173
|
+
bounding_box: tbl.bounding_box,
|
|
1215
2174
|
confidence: tbl.confidence
|
|
1216
2175
|
}));
|
|
1217
2176
|
}
|