@sylphx/pdf-reader-mcp 2.4.2 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +204 -43
  2. package/dist/index.js +1517 -557
  3. package/package.json +10 -11
package/dist/index.js CHANGED
@@ -6,10 +6,6 @@ import { createServer, http, stdio } from "@sylphx/mcp-server-sdk";
6
6
  // src/handlers/readPdf.ts
7
7
  import { image, text, tool, toolError } from "@sylphx/mcp-server-sdk";
8
8
 
9
- // src/pdf/extractor.ts
10
- import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
11
- import { PNG } from "pngjs";
12
-
13
9
  // src/utils/logger.ts
14
10
  class Logger {
15
11
  prefix;
@@ -87,182 +83,1308 @@ var createLogger = (component, minLevel) => {
87
83
  };
88
84
  var logger = new Logger("", 2 /* WARN */);
89
85
 
90
- // src/pdf/extractor.ts
91
- var logger2 = createLogger("Extractor");
92
- var encodePixelsToPNG = (pixelData, width, height, channels) => {
93
- const png = new PNG({ width, height });
94
- if (channels === 4) {
95
- png.data = Buffer.from(pixelData);
96
- } else if (channels === 3) {
97
- for (let i = 0;i < width * height; i++) {
98
- const srcIdx = i * 3;
99
- const dstIdx = i * 4;
100
- png.data[dstIdx] = pixelData[srcIdx] ?? 0;
101
- png.data[dstIdx + 1] = pixelData[srcIdx + 1] ?? 0;
102
- png.data[dstIdx + 2] = pixelData[srcIdx + 2] ?? 0;
103
- png.data[dstIdx + 3] = 255;
104
- }
105
- } else if (channels === 1) {
106
- for (let i = 0;i < width * height; i++) {
107
- const gray = pixelData[i] ?? 0;
108
- const dstIdx = i * 4;
109
- png.data[dstIdx] = gray;
110
- png.data[dstIdx + 1] = gray;
111
- png.data[dstIdx + 2] = gray;
112
- png.data[dstIdx + 3] = 255;
113
- }
86
+ // src/pdf/tableExtractor.ts
87
// Component logger for the table extractor; used below to emit warnings when
// table extraction fails for a page.
+ var logger2 = createLogger("TableExtractor");
88
// Default tolerance for clusterByY: an item joins the current row when its y
// differs from the row's first item by at most this much.
// NOTE(review): unit is whatever textItem.transform[5] uses — confirm.
+ var Y_TOLERANCE = 5;
89
// Default minimum gap between consecutive sorted x positions that starts a new
// column in detectColumnBoundaries. Half of it is the default tolerance in
// columnIndexForItem, and the full value is the alignment distance used by
// identifyTableRegions.
+ var COLUMN_GAP_THRESHOLD = 15;
90
// Minimum number of rows for a region to count as a table.
+ var MIN_ROWS = 2;
91
// Minimum number of detected column boundaries for a table.
+ var MIN_COLS = 2;
92
// Rows with fewer text items than this are not table-row candidates.
+ var MIN_ROW_ITEMS = 2;
93
/**
 * Builds a `{ left, bottom, right, top }` box from an origin and a size.
 *
 * Returns `undefined` when any of the four inputs is missing or not a finite
 * number. Negative widths/heights are clamped to zero, so `right >= left`
 * and `top >= bottom` always hold for a returned box.
 */
var buildBoundingBox = (x, y, width, height) => {
  // Number.isFinite is false for undefined, NaN and +/-Infinity alike.
  for (const value of [x, y, width, height]) {
    if (!Number.isFinite(value)) {
      return undefined;
    }
  }
  const clampedWidth = Math.max(0, width);
  const clampedHeight = Math.max(0, height);
  return {
    left: x,
    bottom: y,
    right: x + clampedWidth,
    top: y + clampedHeight
  };
};
118
- var processImageData = (imageData, pageNum, arrayIndex) => {
119
- if (!imageData || typeof imageData !== "object") {
120
- return null;
121
- }
122
- const img = imageData;
123
- if (!img.data || !img.width || !img.height) {
124
- return null;
125
- }
126
- const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
127
- const format = img.kind === 1 ? "grayscale" : img.kind === 3 ? "rgba" : "rgb";
128
- const pngBase64 = encodePixelsToPNG(img.data, img.width, img.height, channels);
104
/**
 * Returns the smallest box that encloses every box in `boxes`, or `undefined`
 * for an empty list.
 *
 * The extremes are accumulated in a single pass. The earlier
 * `Math.min(...boxes.map(...))` form passed one argument per box, which
 * throws a RangeError once the list is large enough to exceed the engine's
 * argument limit, and it walked the list four times.
 */
var mergeBoundingBoxes = (boxes) => {
  if (boxes.length === 0)
    return;
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const box of boxes) {
    // Pairwise Math.min/Math.max keeps the original NaN propagation.
    left = Math.min(left, box.left);
    bottom = Math.min(bottom, box.bottom);
    right = Math.max(right, box.right);
    top = Math.max(top, box.top);
  }
  return { left, bottom, right, top };
};
138
- var retrieveImageData = async (page, imageName, pageNum) => {
139
- if (imageName.startsWith("g_")) {
140
- try {
141
- const imageData = page.commonObjs.get(imageName);
142
- if (imageData) {
143
- return imageData;
144
- }
145
- } catch (error) {
146
- const message = error instanceof Error ? error.message : String(error);
147
- logger2.warn("Error getting image from commonObjs", { imageName, error: message });
114
/**
 * Reads the page's text content and returns one positioned record per
 * non-blank text item.
 *
 * Each record carries `text`, `x`, `y` (taken from transform[4] / transform[5])
 * and `width`. `height` and `bounding_box` are attached only when a positive
 * height is available. Items with blank text, a transform shorter than six
 * entries, or a missing x/y are skipped.
 */
var extractTextItemsWithPositions = async (page) => {
  const { items: rawItems } = await page.getTextContent();
  const positioned = [];
  for (const textItem of rawItems) {
    if (!textItem.str.trim())
      continue;
    const matrix = textItem.transform;
    if (!matrix || matrix.length < 6)
      continue;
    const x = matrix[4];
    const y = matrix[5];
    if (x === undefined || y === undefined)
      continue;
    // Fall back to the vertical scale entry when the item reports no height.
    const height = textItem.height ?? Math.abs(matrix[3] ?? 0);
    // Rough estimate (6 units per character) when the item reports no width.
    const width = textItem.width ?? textItem.str.length * 6;
    const hasHeight = height > 0;
    positioned.push({
      text: textItem.str,
      x,
      y,
      width,
      ...hasHeight ? { height } : {},
      ...hasHeight ? { bounding_box: buildBoundingBox(x, y, width, height) } : {}
    });
  }
  return positioned;
};
141
/**
 * Groups positioned text items into rows by vertical position.
 *
 * Items are visited from the largest y to the smallest. An item joins the
 * current row when its y is within `tolerance` of that row's first item;
 * otherwise it opens a new row. Every row's items are finally ordered by
 * ascending x. The input array is not mutated.
 */
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
  if (items.length === 0)
    return [];
  const [head, ...rest] = [...items].sort((a, b) => b.y - a.y);
  if (!head)
    return [];
  const rows = [];
  let openRow = { y: head.y, items: [head] };
  for (const candidate of rest) {
    if (!candidate)
      continue;
    // The row's y is fixed by its first item, so rows cannot drift.
    if (Math.abs(openRow.y - candidate.y) <= tolerance) {
      openRow.items.push(candidate);
      continue;
    }
    rows.push(openRow);
    openRow = { y: candidate.y, items: [candidate] };
  }
  rows.push(openRow);
  rows.forEach((row) => {
    row.items.sort((a, b) => a.x - b.x);
  });
  return rows;
};
168
/**
 * Derives column start positions from the x coordinates of all items in
 * `rows`.
 *
 * The x values are sorted ascending. The smallest is always a boundary, and
 * every later value that is at least `gapThreshold` larger than its
 * predecessor starts another column. Returns an empty array when there are
 * no rows or no items.
 */
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
  if (rows.length === 0)
    return [];
  const xs = [];
  for (const { items } of rows) {
    for (const { x } of items) {
      xs.push(x);
    }
  }
  if (xs.length === 0)
    return [];
  xs.sort((a, b) => a - b);
  const leftmost = xs[0];
  if (leftmost === undefined)
    return [];
  const boundaries = [leftmost];
  // Compare each value with its left neighbour in the sorted list.
  xs.slice(1).forEach((current, offset) => {
    const previous = xs[offset];
    if (current === undefined || previous === undefined)
      return;
    if (current - previous >= gapThreshold) {
      boundaries.push(current);
    }
  });
  return boundaries;
};
196
/**
 * Returns the index of the right-most column boundary that lies at or to the
 * left of `item.x`, allowing the item to start up to `tolerance` before the
 * boundary. Falls back to column 0 when no boundary qualifies.
 */
var columnIndexForItem = (item, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
  let index = columnBoundaries.length;
  // Walk from the last boundary towards the first.
  while (index-- > 0) {
    const boundary = columnBoundaries[index];
    if (boundary === undefined)
      continue;
    if (item.x >= boundary - tolerance) {
      return index;
    }
  }
  return 0;
};
226
- var buildWarnings = (invalidPages, totalPages) => {
227
- if (invalidPages.length === 0) {
228
- return [];
205
/**
 * Distributes one row's items over the detected columns.
 *
 * Returns `rowValues` (one string per column, with item texts joined by a
 * single space) and `cells` (one record per column with `text`, `rowIndex`,
 * `colIndex` and, when any contributing item had a box, a merged
 * `bounding_box`). Columns without items yield an empty string.
 */
var assignToTableCells = (row, rowIndex, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  const textsByColumn = Array.from({ length: columnCount }, () => []);
  const boxesByColumn = Array.from({ length: columnCount }, () => []);
  for (const item of row.items) {
    const target = columnIndexForItem(item, columnBoundaries);
    const texts = textsByColumn[target];
    const boxes = boxesByColumn[target];
    // Only reachable when there are no columns at all.
    if (!texts || !boxes)
      continue;
    texts.push(item.text);
    if (item.bounding_box) {
      boxes.push(item.bounding_box);
    }
  }
  const cells = textsByColumn.map((texts, colIndex) => {
    const mergedBox = mergeBoundingBoxes(boxesByColumn[colIndex]);
    return {
      text: texts.join(" "),
      rowIndex,
      colIndex,
      ...mergedBox ? { bounding_box: mergedBox } : {}
    };
  });
  const rowValues = cells.map((cell) => cell.text);
  return { rowValues, cells };
};
234
- var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
235
- const contentItems = [];
236
- try {
237
- const page = await pdfDocument.getPage(pageNum);
238
- const textContent = await page.getTextContent();
239
- const textByY = new Map;
240
- for (const item of textContent.items) {
241
- const textItem = item;
242
- const yCoord = textItem.transform[5];
243
- if (yCoord === undefined)
244
- continue;
245
- const y = Math.round(yCoord);
246
- if (!textByY.has(y)) {
247
- textByY.set(y, []);
231
/**
 * Scores how table-like a region is, as a number in [0, 1].
 *
 * The score is the average of:
 *   - one value per row: the fraction of columns that hold at least one item;
 *   - one value for row-spacing regularity: 1 - stdDev / mean of the vertical
 *     gaps between consecutive rows, floored at 0.
 * Returns 0 when the region has fewer than MIN_ROWS rows or MIN_COLS columns.
 */
var calculateConfidence = (rows, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  if (rows.length < MIN_ROWS || columnCount < MIN_COLS) {
    return 0;
  }
  const halfGap = COLUMN_GAP_THRESHOLD / 2;
  let total = 0;
  let samples = 0;
  for (const row of rows) {
    const occupied = new Set();
    for (const item of row.items) {
      // Right-most boundary the item starts at or after (with tolerance).
      let idx = columnCount - 1;
      while (idx >= 0) {
        const boundary = columnBoundaries[idx];
        if (boundary !== undefined && item.x >= boundary - halfGap) {
          occupied.add(idx);
          break;
        }
        idx--;
      }
    }
    total += occupied.size / columnCount;
    samples++;
  }
  if (rows.length >= 2) {
    const gaps = [];
    for (let i = 1; i < rows.length; i++) {
      const upper = rows[i - 1];
      const lower = rows[i];
      if (upper && lower) {
        gaps.push(Math.abs(upper.y - lower.y));
      }
    }
    if (gaps.length > 0) {
      let gapSum = 0;
      for (const gap of gaps) {
        gapSum = gapSum + gap;
      }
      const meanGap = gapSum / gaps.length;
      let squaredSum = 0;
      for (const gap of gaps) {
        squaredSum = squaredSum + (gap - meanGap) ** 2;
      }
      const stdDev = Math.sqrt(squaredSum / gaps.length);
      total += meanGap > 0 ? Math.max(0, 1 - stdDev / meanGap) : 0;
      samples++;
    }
  }
  return samples > 0 ? Math.min(1, total / samples) : 0;
};
271
/**
 * Finds runs of consecutive rows that look like table rows.
 *
 * Only rows with at least MIN_ROW_ITEMS items are considered. Column
 * boundaries are detected once across all candidate rows. A row continues the
 * current run when at least MIN_COLS - 1 of its items start within
 * COLUMN_GAP_THRESHOLD of some boundary. A run becomes a region once it holds
 * MIN_ROWS rows or more. Each region records its rows, the shared
 * boundaries, and the y of its first and last row.
 */
var identifyTableRegions = (rows) => {
  const regions = [];
  const candidateRows = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
  if (candidateRows.length < MIN_ROWS) {
    return regions;
  }
  const columnBoundaries = detectColumnBoundaries(candidateRows);
  if (columnBoundaries.length < MIN_COLS) {
    return regions;
  }
  let pending = [];
  // Emits the pending run if it is long enough, then starts a fresh one.
  const flushPending = () => {
    if (pending.length >= MIN_ROWS) {
      const firstRow = pending[0];
      const lastRow = pending[pending.length - 1];
      if (firstRow && lastRow) {
        regions.push({
          rows: pending,
          columnBoundaries,
          startY: firstRow.y,
          endY: lastRow.y
        });
      }
    }
    pending = [];
  };
  const isAligned = (item) => columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);
  for (const row of candidateRows) {
    const alignedCount = row.items.filter((item) => isAligned(item)).length;
    if (alignedCount >= MIN_COLS - 1) {
      pending.push(row);
    } else {
      flushPending();
    }
  }
  flushPending();
  return regions;
};
318
/**
 * Detects tables on a single page.
 *
 * Text items are clustered into rows, table regions are identified, and each
 * region is turned into a table record (`rows`, `cells`, optional
 * `bounding_box`, `rowCount`, `colCount`, `confidence` rounded to two
 * decimals). Regions scoring below 0.3 are dropped. Any error is logged as a
 * warning and the tables collected so far are returned.
 */
var extractTablesFromPage = async (page, pageNum) => {
  const tables = [];
  try {
    const textItems = await extractTextItemsWithPositions(page);
    if (textItems.length === 0) {
      return tables;
    }
    const regions = identifyTableRegions(clusterByY(textItems));
    regions.forEach((region, tableIndex) => {
      if (!region)
        return;
      const tableRows = [];
      const tableCells = [];
      region.rows.forEach((row, rowIndex) => {
        if (!row)
          return;
        const { rowValues, cells } = assignToTableCells(row, rowIndex, region.columnBoundaries);
        tableRows.push(rowValues);
        tableCells.push(...cells);
      });
      const confidence = calculateConfidence(region.rows, region.columnBoundaries);
      const cellBoxes = tableCells.map((cell) => cell.bounding_box).filter((box) => box !== undefined);
      const tableBoundingBox = mergeBoundingBoxes(cellBoxes);
      if (confidence < 0.3)
        return;
      tables.push({
        page: pageNum,
        tableIndex,
        rows: tableRows,
        cells: tableCells,
        ...tableBoundingBox ? { bounding_box: tableBoundingBox } : {},
        rowCount: tableRows.length,
        colCount: region.columnBoundaries.length,
        confidence: Math.round(confidence * 100) / 100
      });
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger2.warn("Error extracting tables from page", { pageNum, error: message });
  }
  return tables;
};
362
/**
 * Runs table detection over the given page numbers, one page at a time, and
 * returns all detected tables in page order.
 *
 * A page that cannot be loaded or processed is logged as a warning and
 * skipped; the remaining pages are still processed.
 */
var extractTables = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const found = await extractTablesFromPage(page, pageNum);
      collected.push(...found);
    } catch (error) {
      let message;
      if (error instanceof Error) {
        message = error.message;
      } else {
        message = String(error);
      }
      logger2.warn("Error getting page for table extraction", { pageNum, error: message });
    }
  }
  return collected;
};
376
/**
 * Renders a table's `rows` as a Markdown table, using the first row as the
 * header. Returns "" when there are no rows.
 *
 * Body rows shorter than the header are padded with empty cells, and empty
 * cells render as a single space. Cell text is made safe for a table cell:
 * line breaks become spaces and `|` is escaped as `\|`. Previously either
 * character in extracted text split the cell or the row and corrupted the
 * table.
 */
var tableToMarkdown = (table) => {
  const [headerRow, ...bodyRows] = table.rows;
  if (!headerRow)
    return "";
  const formatCell = (cell) => {
    const safe = cell.replace(/\r\n|\r|\n/g, " ").replace(/\|/g, "\\|").trim();
    return safe || " ";
  };
  const formatRow = (cells) => `| ${cells.map((cell) => formatCell(cell)).join(" | ")} |`;
  const lines = [
    formatRow(headerRow),
    `| ${headerRow.map(() => "---").join(" | ")} |`
  ];
  for (const row of bodyRows) {
    if (!row)
      continue;
    const paddedRow = [...row];
    while (paddedRow.length < headerRow.length) {
      paddedRow.push("");
    }
    lines.push(formatRow(paddedRow));
  }
  return lines.join("\n");
};
398
/**
 * Renders all tables as one Markdown section titled "Extracted Tables".
 * Each table gets a "Page N, Table M" heading (M is 1-based), its confidence
 * as a whole-number percentage, and the table itself. Returns "" when there
 * are no tables.
 */
var tablesToMarkdown = (tables) => {
  if (tables.length === 0)
    return "";
  const out = ["## Extracted Tables", ""];
  for (const table of tables) {
    const heading = `### Page ${table.page}, Table ${table.tableIndex + 1}`;
    const confidenceNote = `*Confidence: ${(table.confidence * 100).toFixed(0)}%*`;
    out.push(heading, confidenceNote, "", tableToMarkdown(table), "");
  }
  return out.join("\n");
};
412
+
413
+ // src/pdf/documentModel.ts
414
// Chunk size limit that buildCitationChunks falls back to when
// options.maxChars is not provided; compared against summed string lengths.
+ var DEFAULT_CHUNK_MAX_CHARS = 1800;
415
/**
 * Builds an element id of the form `p<page>-<type>-<index>`,
 * e.g. `p3-text-7`.
 */
var buildElementId = (page, type, index) => {
  const pagePart = `p${String(page)}`;
  const indexPart = String(index);
  return `${pagePart}-${type}-${indexPart}`;
};
416
/**
 * Returns a shallow copy of `imageData` without its `data` property, so the
 * image payload is not carried into element metadata. The input object is
 * left untouched.
 */
var imageElementMetadata = (imageData) => {
  const metadata = { ...imageData };
  delete metadata.data;
  return metadata;
};
420
/**
 * Summarises text heights on a page: the maximum, the median, and how many
 * text items contributed. Only text items with non-blank content and a
 * truthy height are counted. All three values are 0 when nothing qualifies.
 */
var buildPageTextStats = (items) => {
  const heights = [];
  items.forEach((item) => {
    if (item.type === "text" && item.textContent?.trim() && item.height) {
      heights.push(item.height);
    }
  });
  heights.sort((a, b) => a - b);
  const count = heights.length;
  if (count === 0) {
    return { maxHeight: 0, medianHeight: 0, textItemCount: 0 };
  }
  const mid = Math.floor(count / 2);
  let medianHeight;
  if (count % 2 === 0) {
    // Even count: average the two middle values.
    medianHeight = ((heights[mid - 1] ?? 0) + (heights[mid] ?? 0)) / 2;
  } else {
    medianHeight = heights[mid] ?? 0;
  }
  return {
    maxHeight: heights[count - 1] ?? 0,
    medianHeight,
    textItemCount: count
  };
};
433
/**
 * Guesses a semantic role for a text item from its content and height.
 *
 * - Text starting with "-", "*" or "N." / "N)" followed by whitespace is a
 *   `list_item`.
 * - Short text (at most 120 characters) that does not end in `.`, `!` or `?`
 *   and is clearly larger than the page's typical text (at least 1.3x the
 *   median and 0.8x the maximum height) is a `heading`. Its level is 1, 2 or
 *   3 depending on how much larger than the median it is.
 * - Everything else is a `paragraph`.
 * Returns `undefined` for non-text items and blank text.
 */
var buildSemanticHint = (item, stats) => {
  const trimmed = item.type === "text" ? item.textContent?.trim() : undefined;
  if (!trimmed)
    return;
  if (/^([-*]\s+|\d+[.)]\s+)/.test(trimmed)) {
    return {
      role: "list_item",
      confidence: 0.92,
      signals: ["list-prefix"]
    };
  }
  const height = item.height ?? 0;
  const isLargeText = stats.textItemCount > 1 && height > 0 && stats.medianHeight > 0 && height >= stats.medianHeight * 1.3 && height >= stats.maxHeight * 0.8;
  const looksLikeHeading = isLargeText && trimmed.length <= 120 && !/[.!?]$/.test(trimmed);
  if (!looksLikeHeading) {
    return {
      role: "paragraph",
      confidence: 0.5,
      signals: ["default-text"]
    };
  }
  const ratio = height / stats.medianHeight;
  let level = 3;
  if (ratio >= 1.8) {
    level = 1;
  } else if (ratio >= 1.55) {
    level = 2;
  }
  return {
    role: "heading",
    level,
    confidence: 0.78,
    signals: ["larger-text", "short-line"]
  };
};
464
/**
 * Converts a page content item into a structured element.
 *
 * Text items with non-blank content become `text` elements (with the
 * semantic hint attached when one is supplied). Image items with image data
 * become `image` elements carrying the image metadata without its payload.
 * Anything else yields `undefined`.
 */
var contentItemToElement = (item, page, index, semanticHint) => {
  switch (item.type) {
    case "text": {
      if (!item.textContent?.trim())
        return;
      return {
        id: buildElementId(page, "text", index),
        type: "text",
        page,
        content: item.textContent,
        bounding_box: item.bounding_box,
        provenance: { engine: "pdfjs", source: "text-content" },
        ...semanticHint ? { semantic_hint: semanticHint } : {}
      };
    }
    case "image": {
      if (!item.imageData)
        return;
      return {
        id: buildElementId(page, "image", index),
        type: "image",
        page,
        image: imageElementMetadata(item.imageData),
        bounding_box: item.bounding_box,
        provenance: { engine: "pdfjs", source: "image-xobject" }
      };
    }
    default:
      return;
  }
};
494
/**
 * Flattens page contents and detected tables into one ordered element list.
 *
 * For each page, its text/image elements come first (indexed from 1 per
 * page, counting only items that produce an element), followed by that
 * page's tables ordered by table index. Tables whose page is not among
 * `pageContents` are appended at the end, ordered by page and then table
 * index. Semantic hints are computed only when `includeSemanticHints` is
 * truthy.
 */
var buildStructuredElements = (pageContents, tables, includeSemanticHints) => {
  const elements = [];
  // Group tables by the page they were detected on.
  const tablesByPage = new Map();
  for (const table of tables ?? []) {
    const bucket = tablesByPage.get(table.page);
    if (bucket) {
      bucket.push(table);
    } else {
      tablesByPage.set(table.page, [table]);
    }
  }
  const appendTableElement = (table) => {
    const { page, tableIndex, rows, cells, bounding_box: box, rowCount, colCount, confidence } = table;
    elements.push({
      id: buildElementId(page, "table", tableIndex + 1),
      type: "table",
      page,
      table: {
        rows,
        ...cells ? { cells } : {},
        ...box ? { bounding_box: box } : {},
        rowCount,
        colCount,
        confidence
      },
      bounding_box: box,
      confidence,
      provenance: {
        engine: "pdfjs",
        source: "table-detector"
      }
    });
  };
  for (const { page, items } of pageContents) {
    const stats = includeSemanticHints ? buildPageTextStats(items) : undefined;
    let nextIndex = 1;
    for (const item of items) {
      const hint = stats ? buildSemanticHint(item, stats) : undefined;
      const element = contentItemToElement(item, page, nextIndex, hint);
      if (!element)
        continue;
      elements.push(element);
      nextIndex += 1;
    }
    const pageTables = tablesByPage.get(page);
    if (!pageTables)
      continue;
    pageTables.sort((a, b) => a.tableIndex - b.tableIndex);
    for (const table of pageTables) {
      appendTableElement(table);
    }
    // Mark this page's tables as handled so they are not appended again.
    tablesByPage.delete(page);
  }
  const leftovers = [...tablesByPage.values()].flat();
  leftovers.sort((a, b) => a.page - b.page || a.tableIndex - b.tableIndex);
  for (const table of leftovers) {
    appendTableElement(table);
  }
  return elements;
};
548
/**
 * Renders page contents as Markdown: one "## Page N" section per page with
 * each non-blank text item as its own paragraph and each image as a
 * "[Image i: WxH format]" placeholder (i is 1-based). When tables are given,
 * the extracted-tables section is appended last. Sections are separated by
 * a blank line.
 */
var renderMarkdownFromPageContents = (pageContents, tables) => {
  const sections = [];
  for (const { page, items } of pageContents) {
    const lines = [`## Page ${String(page)}`, ""];
    for (const item of items) {
      const isText = item.type === "text" && item.textContent?.trim();
      if (isText) {
        lines.push(item.textContent.trim(), "");
        continue;
      }
      if (item.type === "image" && item.imageData) {
        const { index, width, height, format } = item.imageData;
        lines.push(`[Image ${String(index + 1)}: ${String(width)}x${String(height)} ${format}]`, "");
      }
    }
    sections.push(lines.join("\n").trimEnd());
  }
  const hasTables = tables && tables.length > 0;
  if (hasTables) {
    sections.push(tablesToMarkdown(tables));
  }
  return sections.join("\n\n").trim();
};
569
/**
 * Escapes the five HTML-significant characters (& < > " ') so `value` can be
 * placed in element content or a quoted attribute.
 */
var escapeHtml = (value) => {
  const entities = new Map([
    ["&", "&amp;"],
    ["<", "&lt;"],
    [">", "&gt;"],
    ['"', "&quot;"],
    ["'", "&#39;"]
  ]);
  // One pass is enough: replacement text is never rescanned.
  return value.replace(/[&<>"']/g, (ch) => entities.get(ch) ?? ch);
};
570
/**
 * Renders each table as an HTML `<table>` string tagged with `data-page` and
 * `data-table-index`, one `<tr>` per row and one escaped `<td>` per cell.
 * Returns an empty array when there are no tables.
 */
var renderTablesToHtml = (tables) => {
  if (!tables || tables.length === 0)
    return [];
  const renderRow = (row) => {
    const cells = row.map((cell) => `<td>${escapeHtml(cell)}</td>`);
    return `<tr>${cells.join("")}</tr>`;
  };
  return tables.map((table) => {
    const openTag = `<table data-page="${String(table.page)}" data-table-index="${String(table.tableIndex)}">`;
    const body = table.rows.map((row) => renderRow(row)).join("\n");
    return `${openTag}\n<tbody>\n${body}\n</tbody>\n</table>`;
  });
};
589
/**
 * Renders page contents as HTML: one `<section data-page>` per page with an
 * `<h2>` title, a `<p>` per non-blank text item and a `<figure>` with caption
 * per image. Tables are rendered after all page sections. Blocks are
 * separated by a blank line.
 */
var renderHtmlFromPageContents = (pageContents, tables) => {
  const renderFigure = (imageData) => {
    const { index, width, height, format } = imageData;
    return [
      `<figure data-image-index="${String(index)}">`,
      `<figcaption>Image ${String(index + 1)}: ${String(width)}x${String(height)} ${escapeHtml(format)}</figcaption>`,
      "</figure>"
    ].join("\n");
  };
  const sections = pageContents.map((pageContent) => {
    const pageLabel = String(pageContent.page);
    const parts = [`<section data-page="${pageLabel}">`, `<h2>Page ${pageLabel}</h2>`];
    for (const item of pageContent.items) {
      const isText = item.type === "text" && item.textContent?.trim();
      if (isText) {
        parts.push(`<p>${escapeHtml(item.textContent.trim())}</p>`);
        continue;
      }
      if (item.type === "image" && item.imageData) {
        parts.push(renderFigure(item.imageData));
      }
    }
    parts.push("</section>");
    return parts.join("\n");
  });
  const blocks = sections.concat(renderTablesToHtml(tables));
  return blocks.join("\n\n").trim();
};
615
/**
 * Returns the plain-text form of an element for chunking: trimmed content
 * for text elements; for tables, rows joined by newlines with cells joined
 * by " | " (or `undefined` when that is empty); `undefined` for other types.
 */
var elementText = (element) => {
  switch (element.type) {
    case "text":
      return element.content.trim();
    case "table": {
      const lines = element.table.rows.map((row) => row.join(" | "));
      const joined = lines.join("\n").trim();
      if (joined.length === 0)
        return;
      return joined;
    }
    default:
      return;
  }
};
625
/**
 * Returns the semantic role attached to a text element, or `undefined` for
 * non-text elements and text elements without a hint.
 */
var elementRole = (element) => {
  if (element.type !== "text") {
    return undefined;
  }
  return element.semantic_hint?.role;
};
626
/**
 * Returns the draft's current text size: the length of every text part plus
 * one per part (for the separator added when the parts are joined).
 */
var chunkTextLength = (draft) => {
  let total = 0;
  for (const part of draft.textParts) {
    total += part.length + 1;
  }
  return total;
};
627
/**
 * Creates an empty chunk draft that starts and ends on the element's page,
 * with the given strategy and optional heading.
 */
var createChunkDraft = (element, strategy, heading) => {
  const { page } = element;
  return {
    pageStart: page,
    pageEnd: page,
    textParts: [],
    elementIds: [],
    boundingBoxes: [],
    strategy,
    heading
  };
};
636
/**
 * Appends an element to a chunk draft in place: extends the draft's last
 * page if needed and records the element's text, its id and, when present,
 * its bounding box.
 */
var addElementToChunk = (draft, element, textValue) => {
  const { page, id, bounding_box: box } = element;
  draft.pageEnd = Math.max(draft.pageEnd, page);
  draft.textParts.push(textValue);
  draft.elementIds.push(id);
  if (box) {
    draft.boundingBoxes.push(box);
  }
};
644
/**
 * Turns a chunk draft into its final chunk record, or `undefined` when the
 * draft holds no non-blank text.
 *
 * The id is `p<page>-chunk-<index>` for single-page chunks and
 * `p<start>-p<end>-chunk-<index>` when the chunk spans pages. `heading` and
 * `bounding_boxes` are included only when present.
 */
var finalizeChunk = (draft, index) => {
  const text = draft.textParts.join("\n").trim();
  if (!text)
    return;
  const { pageStart, pageEnd, heading, boundingBoxes } = draft;
  const pageLabel = pageStart === pageEnd ? `p${String(pageStart)}` : `p${String(pageStart)}-p${String(pageEnd)}`;
  return {
    id: `${pageLabel}-chunk-${String(index)}`,
    page_start: pageStart,
    page_end: pageEnd,
    text,
    element_ids: draft.elementIds,
    strategy: draft.strategy,
    ...heading ? { heading } : {},
    ...boundingBoxes.length > 0 ? { bounding_boxes: boundingBoxes } : {}
  };
};
660
/**
 * Groups structured elements into citation chunks.
 *
 * Elements without text are ignored. A new chunk is started when:
 *   - the element is a heading and `options.useSemanticBoundaries` is set;
 *   - the element is a table (tables always form a chunk of their own);
 *   - adding the element would push the chunk past `options.maxChars`
 *     (default DEFAULT_CHUNK_MAX_CHARS);
 *   - the element is on a different page than the chunk's last page.
 * Each chunk records the strategy that opened it ("semantic", "size", "page"
 * or "table"). Chunk indices are 1-based, in emission order.
 */
var buildCitationChunks = (elements, options) => {
  const sizeLimit = options.maxChars ?? DEFAULT_CHUNK_MAX_CHARS;
  const chunks = [];
  let draft;
  // Finalizes the open draft, if any, and clears it.
  const flush = () => {
    if (draft) {
      const finished = finalizeChunk(draft, chunks.length + 1);
      if (finished) {
        chunks.push(finished);
      }
      draft = undefined;
    }
  };
  for (const element of elements) {
    const textValue = elementText(element);
    if (!textValue)
      continue;
    const isTable = element.type === "table";
    const role = elementRole(element);
    const startsSection = options.useSemanticBoundaries && role === "heading";
    // Both checks refer to the draft as it was before this element.
    const exceedsSize = draft !== undefined && draft.elementIds.length > 0 && chunkTextLength(draft) + textValue.length > sizeLimit;
    const crossesPage = draft !== undefined && draft.pageEnd !== element.page;
    if (startsSection || isTable || exceedsSize || crossesPage) {
      flush();
    }
    if (!draft) {
      let strategy = "page";
      if (startsSection) {
        strategy = "semantic";
      } else if (exceedsSize) {
        strategy = "size";
      }
      const heading = startsSection && element.type === "text" ? element.content.trim() : undefined;
      draft = createChunkDraft(element, strategy, heading);
    }
    if (isTable && draft.elementIds.length === 0) {
      draft.strategy = "table";
    }
    addElementToChunk(draft, element, textValue);
    if (isTable) {
      // Close immediately so following text never joins a table chunk.
      flush();
    }
  }
  flush();
  return chunks;
};
700
// Case-insensitive patterns that buildSafetyFindings tests against each text
// element; any match is reported as a "prompt_injection_pattern" finding.
// None of the patterns use the `g` flag, so `.test()` keeps no state between
// calls.
+ var PROMPT_INJECTION_PATTERNS = [
701
+ /\bignore (all )?(previous|prior|above) instructions\b/i,
702
+ /\bdisregard (previous|prior|above) instructions\b/i,
703
+ /\bsystem prompt\b/i,
704
+ /\bdeveloper (message|instruction)s?\b/i,
705
+ /\bdo not (follow|obey) .*instructions\b/i
706
+ ];
707
/**
 * Produces a one-line preview of `value`: whitespace runs are collapsed to
 * single spaces, the result is trimmed, and anything longer than 160
 * characters is cut to 157 characters plus "...".
 */
var snippetFromText = (value) => {
  const collapsed = value.replace(/\s+/g, " ").trim();
  if (collapsed.length <= 160) {
    return collapsed;
  }
  return `${collapsed.slice(0, 157)}...`;
};
711
/**
 * Reports whether `box` lies entirely outside `viewBox` on at least one
 * side, with one unit of slack. Returns false when either box is missing.
 */
var isOutsideViewBox = (box, viewBox) => {
  if (!box || !viewBox)
    return false;
  const slack = 1;
  const isLeftOf = box.right < viewBox.left - slack;
  const isRightOf = box.left > viewBox.right + slack;
  const isBelow = box.top < viewBox.bottom - slack;
  const isAbove = box.bottom > viewBox.top + slack;
  return isLeftOf || isRightOf || isBelow || isAbove;
};
717
/**
 * Scans page text for content that may be unsafe to pass on unreviewed.
 *
 * For every text element it can report:
 *   - "prompt_injection_pattern" (high): the text matches one of
 *     PROMPT_INJECTION_PATTERNS;
 *   - "tiny_text" (medium): the item height is above 0 and below 2;
 *   - "off_page_text" (medium): the bounding box lies outside the page's
 *     view box taken from `pageGeometry`.
 * Element ids are built with the same per-page indexing as
 * buildStructuredElements, so a finding's `element_id` refers to the
 * matching structured element.
 */
var buildSafetyFindings = (pageContents, pageGeometry) => {
  const findings = [];
  const geometryByPage = new Map(pageGeometry?.map((geometry) => [geometry.page, geometry]));
  for (const { page, items } of pageContents) {
    const geometry = geometryByPage.get(page);
    let elementIndex = 1;
    for (const item of items) {
      const element = contentItemToElement(item, page, elementIndex);
      if (!element) {
        continue;
      }
      if (element.type === "text") {
        const textContent = element.content.trim();
        const snippet = snippetFromText(textContent);
        const box = element.bounding_box;
        const report = (type, severity, message) => {
          findings.push({
            type,
            severity,
            page,
            element_id: element.id,
            message,
            snippet,
            ...box ? { bounding_box: box } : {}
          });
        };
        if (PROMPT_INJECTION_PATTERNS.some((pattern) => pattern.test(textContent))) {
          report("prompt_injection_pattern", "high", "Text matches a common prompt-injection instruction pattern.");
        }
        const isTiny = item.height !== undefined && item.height > 0 && item.height < 2;
        if (isTiny) {
          report("tiny_text", "medium", "Text is unusually small and may be hidden, decorative, or extraction noise.");
        }
        if (isOutsideViewBox(box, geometry?.view_box)) {
          report("off_page_text", "medium", "Text bounding box falls outside the PDF page view box.");
        }
      }
      // Advance only for items that produced an element.
      elementIndex++;
    }
  }
  return findings;
};
770
+
771
+ // src/pdf/extractor.ts
772
+ import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
773
+ import { PNG } from "pngjs";
774
// Component logger for the extractor module.
+ var logger3 = createLogger("Extractor");
775
// NOTE(review): the four layout constants below are not referenced in the
// visible part of this file. The descriptions are inferred from the names
// only and must be confirmed against the code that uses them.
// Presumably the horizontal gap that splits text on one line into separate
// segments — confirm.
+ var TEXT_SEGMENT_GAP_THRESHOLD = 48;
776
// Presumably the minimum empty gap needed to cut the page into columns — confirm.
+ var COLUMN_CUT_MIN_GAP = 48;
777
// Presumably a minimum column width as a fraction of some reference width — confirm.
+ var COLUMN_CUT_MIN_WIDTH_RATIO = 0.12;
778
// Presumably the width fraction above which a segment counts as spanning
// multiple columns — confirm.
+ var SPANNING_WIDTH_RATIO = 0.72;
779
/**
 * Returns the smallest box that encloses every defined box in `boxes`.
 * `undefined` entries are ignored. Returns `undefined` when no defined box
 * remains.
 *
 * The extremes are accumulated in a single pass. The earlier
 * `Math.min(...validBoxes.map(...))` form passed one argument per box, which
 * throws a RangeError once the list is large enough to exceed the engine's
 * argument limit, and it walked the list several times.
 */
var mergeBoundingBoxes2 = (boxes) => {
  let found = false;
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const box of boxes) {
    if (box === undefined)
      continue;
    found = true;
    // Pairwise Math.min/Math.max keeps the original NaN propagation.
    left = Math.min(left, box.left);
    bottom = Math.min(bottom, box.bottom);
    right = Math.max(right, box.right);
    top = Math.max(top, box.top);
  }
  if (!found)
    return;
  return { left, bottom, right, top };
};
790
/**
 * Builds a `{ left, bottom, right, top }` box from an origin and a size.
 * Returns `undefined` when any input is missing or not a finite number.
 * Negative widths/heights are clamped to zero.
 */
var buildBoundingBox2 = (x, y, width, height) => {
  const inputs = [x, y, width, height];
  const hasInvalidInput = inputs.some((value) => value === undefined || !Number.isFinite(value));
  if (hasInvalidInput) {
    return;
  }
  const right = x + Math.max(0, width);
  const top = y + Math.max(0, height);
  return { left: x, bottom: y, right, top };
};
804
/**
 * Converts a `[x1, y1, x2, y2]` rectangle into a `{ left, bottom, right, top }` box,
 * ordering the corners so that `left <= right` and `bottom <= top`.
 *
 * @returns The box, or `undefined` when `rect` is missing, has fewer than four
 *          entries, or one of the first four entries is not a finite number.
 */
var buildRectBoundingBox = (rect) => {
  if (!rect || rect.length < 4) {
    return undefined;
  }
  const [xA, yA, xB, yB] = rect;
  const corners = [xA, yA, xB, yB];
  const hasBadCorner = corners.some((value) => value === undefined || !Number.isFinite(value));
  if (hasBadCorner) {
    return undefined;
  }
  const left = Math.min(xA, xB);
  const right = Math.max(xA, xB);
  const bottom = Math.min(yA, yB);
  const top = Math.max(yA, yB);
  return { left, bottom, right, top };
};
818
/**
 * Tells whether `value` is a number that is neither NaN nor infinite.
 * `Number.isFinite` already rejects every non-number without coercing it.
 */
var finiteNumber = (value) => Number.isFinite(value);
819
/**
 * Picks the text of an annotation field.
 *
 * `direct` wins whenever it is not null/undefined; otherwise `objectValue.str` is used.
 * The chosen text is returned untrimmed, but only if it contains something other
 * than whitespace.
 *
 * @returns The chosen text, or `undefined` when it is empty or whitespace-only.
 */
var textFromAnnotationField = (direct, objectValue) => {
  const candidate = direct ?? objectValue?.str;
  if (!candidate) {
    return undefined;
  }
  return candidate.trim().length > 0 ? candidate : undefined;
};
823
/**
 * Produces a trimmed-down copy of an outline (bookmark) tree.
 *
 * Entries whose title is missing or blank are dropped together with their
 * descendants. For the remaining entries only the title (trimmed) and the optional
 * `bold`, `italic`, `color`, `url`, `dest` and `items` properties are kept; `color`
 * is copied into a plain array and `items` is sanitized recursively and omitted
 * when it ends up empty.
 *
 * @param items Raw outline entries.
 * @returns The sanitized entries in their original order.
 */
var sanitizeOutlineItems = (items) => items.flatMap((entry) => {
  const title = entry.title?.trim();
  if (!title) {
    return [];
  }
  const children = entry.items ? sanitizeOutlineItems(entry.items) : undefined;
  const hasChildren = Boolean(children) && children.length > 0;
  return [
    Object.assign(
      { title },
      entry.bold !== undefined ? { bold: entry.bold } : null,
      entry.italic !== undefined ? { italic: entry.italic } : null,
      entry.color ? { color: Array.from(entry.color) } : null,
      entry.url ? { url: entry.url } : null,
      entry.dest !== undefined ? { dest: entry.dest } : null,
      hasChildren ? { items: children } : null
    )
  ];
});
838
// Lookup from numeric PDF permission flag to the label reported for it.
var PDF_PERMISSION_LABELS = new Map()
  .set(4, "print")
  .set(8, "modify")
  .set(16, "copy")
  .set(32, "annotate")
  .set(256, "fill_forms")
  .set(512, "copy_for_accessibility")
  .set(1024, "assemble")
  .set(2048, "print_high_quality");
/**
 * Translates permission flags into labels, keeping the input order.
 * Flags without an entry in PDF_PERMISSION_LABELS become `unknown:<flag>`.
 */
var permissionLabels = (permissions) => permissions.map((flag) => {
  const label = PDF_PERMISSION_LABELS.get(flag);
  return label ?? `unknown:${String(flag)}`;
});
849
/**
 * Reports the size of an attachment payload.
 *
 * A numeric `byteLength` property is preferred; a numeric `length` property is the
 * fallback.
 *
 * @returns The size, or `undefined` when `content` is falsy or exposes neither property
 *          as a number.
 */
var attachmentSize = (content) => {
  if (!content) {
    return undefined;
  }
  for (const property of ["byteLength", "length"]) {
    if (property in content && typeof content[property] === "number") {
      return content[property];
    }
  }
  return undefined;
};
860
/**
 * Turns one horizontal run of text parts into a single "text" content item.
 *
 * The text is the concatenation of the parts' `text`. When the parts' bounding boxes
 * can be merged, position and size come from the merged box; otherwise x comes from
 * the first part, width is the sum of the part widths and height is the tallest part
 * (never below zero).
 *
 * @param y Line position stored as `yPosition`.
 * @param segment Parts with `text`, `x`, `width`, `height` and optional `bounding_box`.
 * @returns The content item, or `null` when the combined text is blank.
 */
var textSegmentToContentItem = (y, segment) => {
  const textContent = segment.map((part) => part.text).join("");
  if (textContent.trim() === "") {
    return null;
  }
  const boundingBox = mergeBoundingBoxes2(segment.map((part) => part.bounding_box));
  let width;
  let height;
  if (boundingBox !== undefined) {
    width = boundingBox.right - boundingBox.left;
    height = boundingBox.top - boundingBox.bottom;
  } else {
    width = segment.reduce((total, part) => total + part.width, 0);
    height = Math.max(...segment.map((part) => part.height), 0);
  }
  return {
    type: "text",
    yPosition: y,
    xPosition: boundingBox?.left ?? segment[0]?.x,
    width,
    height,
    bounding_box: boundingBox,
    textContent
  };
};
878
/**
 * Groups the text parts of one line into left-to-right segments.
 *
 * Parts are ordered by `x` on a copy of the input. A new segment starts whenever a
 * part begins more than TEXT_SEGMENT_GAP_THRESHOLD to the right of the furthest right
 * edge reached so far.
 *
 * @param parts Text parts with `x` and `width`.
 * @returns An array of segments, each a non-empty array of parts.
 */
var splitTextPartsIntoSegments = (parts) => {
  const ordered = Array.from(parts).sort((first, second) => first.x - second.x);
  const segments = [];
  let open = [];
  let rightEdge;
  for (const part of ordered) {
    const startsNewSegment = rightEdge !== undefined && part.x - rightEdge > TEXT_SEGMENT_GAP_THRESHOLD;
    if (startsNewSegment) {
      if (open.length > 0) {
        segments.push(open);
      }
      open = [];
    }
    open.push(part);
    // Track the furthest right edge so overlapping parts do not shrink it.
    rightEdge = Math.max(rightEdge ?? part.x, part.x + part.width);
  }
  if (open.length > 0) {
    segments.push(open);
  }
  return segments;
};
898
/**
 * Returns a sorted copy of `items`: larger `yPosition` first, ties broken by smaller
 * `xPosition` (a missing `xPosition` counts as 0). The input array is left untouched.
 */
var sortByYThenX = (items) => {
  const ordered = Array.from(items);
  ordered.sort((first, second) => {
    const byY = second.yPosition - first.yPosition;
    if (byY) {
      return byY;
    }
    return (first.xPosition ?? 0) - (second.xPosition ?? 0);
  });
  return ordered;
};
899
/**
 * Searches for a vertical gutter that separates two columns of content.
 *
 * Only items carrying a bounding box are considered, and items at least
 * SPANNING_WIDTH_RATIO of the measured content width wide are ignored. The remaining
 * items are swept from left to right and the widest horizontal gap is chosen. The gap
 * must be at least the larger of COLUMN_CUT_MIN_GAP and COLUMN_CUT_MIN_WIDTH_RATIO of
 * the content width, and at least two items must be centred on each side of it.
 *
 * @param items Content items with an optional `bounding_box`.
 * @returns The x coordinate in the middle of the gutter, or `undefined` when no
 *          acceptable gutter exists.
 */
var findVerticalColumnCut = (items) => {
  const withBoxes = items.filter((item) => item.bounding_box !== undefined);
  if (withBoxes.length < 4) {
    return undefined;
  }
  let minLeft = Infinity;
  let maxRight = -Infinity;
  for (const item of withBoxes) {
    minLeft = Math.min(minLeft, item.bounding_box?.left ?? 0);
    maxRight = Math.max(maxRight, item.bounding_box?.right ?? 0);
  }
  const contentWidth = maxRight - minLeft;
  if (contentWidth <= 0) {
    return undefined;
  }
  const widthLimit = contentWidth * SPANNING_WIDTH_RATIO;
  const narrow = withBoxes.filter((item) => {
    const box = item.bounding_box;
    return box ? box.right - box.left < widthLimit : false;
  });
  if (narrow.length < 4) {
    return undefined;
  }
  const byLeft = Array.from(narrow).sort(
    (first, second) => (first.bounding_box?.left ?? 0) - (second.bounding_box?.left ?? 0)
  );
  let reach = byLeft[0]?.bounding_box?.right;
  if (reach === undefined) {
    return undefined;
  }
  let widestGap = 0;
  let cut;
  for (const item of byLeft.slice(1)) {
    const box = item?.bounding_box;
    if (!box) {
      continue;
    }
    const gap = box.left - reach;
    if (box.left > reach && gap > widestGap) {
      widestGap = gap;
      cut = (box.left + reach) / 2;
    }
    // The sweep edge only ever moves right, so nested boxes cannot open a false gap.
    reach = Math.max(reach, box.right);
  }
  if (cut === undefined) {
    return undefined;
  }
  const requiredGap = Math.max(COLUMN_CUT_MIN_GAP, contentWidth * COLUMN_CUT_MIN_WIDTH_RATIO);
  if (widestGap < requiredGap) {
    return undefined;
  }
  let leftCount = 0;
  for (const item of narrow) {
    const box = item.bounding_box;
    if (box && (box.left + box.right) / 2 < cut) {
      leftCount += 1;
    }
  }
  const rightCount = narrow.length - leftCount;
  if (leftCount < 2 || rightCount < 2) {
    return undefined;
  }
  return cut;
};
949
/**
 * Orders page content items for reading.
 *
 * Without a detectable column gutter the items are simply sorted top-to-bottom, then
 * left-to-right. With a gutter the result is: spanning items at or above the highest
 * column item, the whole left column, the whole right column, then every other
 * spanning item. An item is "spanning" when it has no bounding box or its box
 * straddles the gutter.
 *
 * The spanning items are split in a single exhaustive pass, so an item whose top
 * cannot be compared (for example NaN) is kept in the trailing group instead of being
 * dropped by two complementary filters. The highest column top is folded pairwise
 * rather than spread into `Math.max`, which avoids the argument-count limit on very
 * large pages.
 *
 * @param items Content items with `yPosition` and optional `xPosition` / `bounding_box`.
 * @returns A new array containing every input item in reading order.
 */
var sortPageContentItems = (items) => {
  const cutPosition = findVerticalColumnCut(items);
  if (cutPosition === undefined) {
    return sortByYThenX(items);
  }
  const leftColumn = [];
  const rightColumn = [];
  const spanning = [];
  for (const item of items) {
    const box = item.bounding_box;
    if (!box || (box.left < cutPosition && box.right > cutPosition)) {
      spanning.push(item);
      continue;
    }
    const center = (box.left + box.right) / 2;
    if (center < cutPosition) {
      leftColumn.push(item);
    } else {
      rightColumn.push(item);
    }
  }
  const topOf = (item) => item.bounding_box?.top ?? item.yPosition;
  // Every column item has a bounding box, because box-less items were routed to `spanning`.
  const columnItems = [...leftColumn, ...rightColumn];
  const highestColumnTop = columnItems.length > 0
    ? columnItems.reduce((highest, item) => Math.max(highest, topOf(item)), Number.NEGATIVE_INFINITY)
    : Number.POSITIVE_INFINITY;
  const topSpanning = [];
  const remainingSpanning = [];
  for (const item of spanning) {
    if (topOf(item) >= highestColumnTop) {
      topSpanning.push(item);
    } else {
      remainingSpanning.push(item);
    }
  }
  return [
    ...sortByYThenX(topSpanning),
    ...sortByYThenX(leftColumn),
    ...sortByYThenX(rightColumn),
    ...sortByYThenX(remainingSpanning)
  ];
};
984
/**
 * Encodes raw pixel data as a base64 PNG string.
 *
 * Four-channel data is copied as-is. Three-channel data is expanded to RGBA with full
 * opacity, and one-channel data is replicated into R, G and B with full opacity.
 * Source bytes that are missing are written as 0. Any other channel count leaves the
 * PNG buffer as created by the encoder.
 *
 * @param pixelData Source samples, `channels` values per pixel.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @param channels Number of samples per pixel (1, 3 or 4).
 * @returns The PNG file contents, base64-encoded.
 */
var encodePixelsToPNG = (pixelData, width, height, channels) => {
  const png = new PNG({ width, height });
  const pixelCount = width * height;
  switch (channels) {
    case 4:
      png.data = Buffer.from(pixelData);
      break;
    case 3:
      for (let pixel = 0, src = 0, dst = 0; pixel < pixelCount; pixel++, src += 3, dst += 4) {
        png.data[dst] = pixelData[src] ?? 0;
        png.data[dst + 1] = pixelData[src + 1] ?? 0;
        png.data[dst + 2] = pixelData[src + 2] ?? 0;
        png.data[dst + 3] = 255;
      }
      break;
    case 1:
      for (let pixel = 0, dst = 0; pixel < pixelCount; pixel++, dst += 4) {
        const gray = pixelData[pixel] ?? 0;
        png.data[dst] = gray;
        png.data[dst + 1] = gray;
        png.data[dst + 2] = gray;
        png.data[dst + 3] = 255;
      }
      break;
    default:
      break;
  }
  return PNG.sync.write(png).toString("base64");
};
1010
/**
 * Expands bit-packed 1-bit-per-pixel rows into one byte per pixel.
 *
 * Each row occupies `ceil(width / 8)` bytes with the most significant bit first;
 * a set bit becomes 255 and a clear bit becomes 0. Missing source bytes count as 0.
 */
var unpackOneBitPixels = (packed, width, height) => {
  const rowBytes = (width + 7) >> 3;
  const unpacked = new Uint8Array(width * height);
  for (let row = 0; row < height; row++) {
    const rowStart = row * rowBytes;
    for (let col = 0; col < width; col++) {
      const sourceByte = packed[rowStart + (col >> 3)] ?? 0;
      unpacked[row * width + col] = sourceByte & (128 >> (col & 7)) ? 255 : 0;
    }
  }
  return unpacked;
};
/**
 * Converts a decoded image object into the extracted-image record returned to callers.
 *
 * `kind` selects the layout: 1 is treated as grayscale, 3 as RGBA and anything else
 * as RGB. pdf.js defines kind 1 as GRAYSCALE_1BPP, i.e. bit-packed rows, so when the
 * buffer is too short to hold one byte per pixel but long enough to hold packed rows
 * it is unpacked first. Buffers that already hold one byte per pixel are passed on
 * untouched, which keeps the previous behaviour for such inputs.
 *
 * @param imageData Value obtained from the page object store.
 * @param pageNum Page number stored in the record.
 * @param arrayIndex Index stored in the record.
 * @returns `{ page, index, width, height, format, data }` with `data` holding a
 *          base64 PNG, or `null` when `imageData` is not an object or lacks `data`,
 *          `width` or `height`.
 */
var processImageData = (imageData, pageNum, arrayIndex) => {
  if (!imageData || typeof imageData !== "object") {
    return null;
  }
  const img = imageData;
  if (!img.data || !img.width || !img.height) {
    return null;
  }
  const isGrayscale = img.kind === 1;
  const channels = isGrayscale ? 1 : img.kind === 3 ? 4 : 3;
  const format = isGrayscale ? "grayscale" : img.kind === 3 ? "rgba" : "rgb";
  let pixels = img.data;
  if (isGrayscale) {
    const packedLength = ((img.width + 7) >> 3) * img.height;
    const looksPacked = pixels.length < img.width * img.height && pixels.length >= packedLength;
    if (looksPacked) {
      pixels = unpackOneBitPixels(pixels, img.width, img.height);
    }
  }
  const pngBase64 = encodePixelsToPNG(pixels, img.width, img.height, channels);
  return {
    page: pageNum,
    index: arrayIndex,
    width: img.width,
    height: img.height,
    format,
    data: pngBase64
  };
};
1030
/**
 * Fetches the decoded data of a named image from a page.
 *
 * Lookup order:
 *  1. names starting with "g_" are first looked up in `page.commonObjs`;
 *  2. a synchronous `page.objs.get(name)`;
 *  3. the callback form `page.objs.get(name, callback)`, given up after 10 seconds.
 * Failures in steps 1 and 2 are logged and the next step is tried.
 *
 * @param page Page exposing `objs` and `commonObjs` stores.
 * @param imageName Name of the image object.
 * @param pageNum Page number, used only in the timeout log entry.
 * @returns The image data, or `null` when the callback lookup throws or times out.
 */
var retrieveImageData = async (page, imageName, pageNum) => {
  const describe = (error) => (error instanceof Error ? error.message : String(error));
  if (imageName.startsWith("g_")) {
    try {
      const shared = page.commonObjs.get(imageName);
      if (shared) {
        return shared;
      }
    } catch (error) {
      logger3.warn("Error getting image from commonObjs", { imageName, error: describe(error) });
    }
  }
  try {
    const local = page.objs.get(imageName);
    if (local !== undefined) {
      return local;
    }
  } catch (error) {
    logger3.warn("Sync image get failed, trying async", { imageName, error: describe(error) });
  }
  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    // Marks the promise as decided and cancels the timer; only the first caller wins.
    const claim = () => {
      if (settled) {
        return false;
      }
      settled = true;
      if (timer !== null) {
        clearTimeout(timer);
        timer = null;
      }
      return true;
    };
    timer = setTimeout(() => {
      if (claim()) {
        logger3.warn("Image extraction timeout", { imageName, pageNum });
        resolve(null);
      }
    }, 1e4);
    try {
      page.objs.get(imageName, (imageData) => {
        if (claim()) {
          resolve(imageData);
        }
      });
    } catch (error) {
      if (claim()) {
        logger3.warn("Error in async image get", { imageName, error: describe(error) });
        resolve(null);
      }
    }
  });
};
1087
/**
 * Collects the page count and/or document metadata.
 *
 * `num_pages` is set when `includePageCount` is truthy. When `includeMetadata` is
 * truthy, `info` is copied from the document's metadata result and `metadata` is
 * taken from `getAll()` when that method exists, otherwise from the object's own
 * enumerable properties. Metadata failures are logged and leave the already
 * collected fields in place.
 *
 * @returns An object with any of `num_pages`, `info` and `metadata`.
 */
var extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
  const output = {};
  if (includePageCount) {
    output.num_pages = pdfDocument.numPages;
  }
  if (!includeMetadata) {
    return output;
  }
  try {
    const pdfMetadata = await pdfDocument.getMetadata();
    const infoData = pdfMetadata.info;
    if (infoData !== undefined) {
      output.info = infoData;
    }
    const metadataObj = pdfMetadata.metadata;
    if (metadataObj) {
      if (typeof metadataObj.getAll === "function") {
        output.metadata = metadataObj.getAll();
      } else if (typeof metadataObj === "object") {
        const metadataRecord = {};
        for (const key of Object.keys(metadataObj)) {
          metadataRecord[key] = metadataObj[key];
        }
        output.metadata = metadataRecord;
      }
    }
  } catch (metaError) {
    const message = metaError instanceof Error ? metaError.message : String(metaError);
    logger3.warn("Error extracting metadata", { error: message });
  }
  return output;
};
1118
/**
 * Gathers optional document-level structure: outline, page labels, permissions, mark
 * info, form fields and attachment summaries.
 *
 * Each part is attempted only when its option is set and the document exposes the
 * matching method. A failure in one part is logged and does not affect the others.
 * Empty results are left out of the output.
 *
 * @param pdfDocument Loaded document.
 * @param options Flags `includeOutline`, `includePageLabels`, `includePermissions`,
 *        `includeFormFields` and `includeAttachments`.
 * @returns An object with any of `outline`, `page_labels`, `permissions`, `mark_info`,
 *          `form_fields` and `attachments`.
 */
var extractDocumentStructure = async (pdfDocument, options) => {
  const source = pdfDocument;
  const output = {};
  const supports = (method) => typeof source[method] === "function";
  // Runs one extraction step and turns any failure into a warning.
  const guarded = async (failureMessage, step) => {
    try {
      await step();
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn(failureMessage, { error: message });
    }
  };
  if (options.includeOutline && supports("getOutline")) {
    await guarded("Error extracting outline", async () => {
      const outline = await source.getOutline();
      if (outline && outline.length > 0) {
        output.outline = sanitizeOutlineItems(outline);
      }
    });
  }
  if (options.includePageLabels && supports("getPageLabels")) {
    await guarded("Error extracting page labels", async () => {
      const pageLabels = await source.getPageLabels();
      if (pageLabels && pageLabels.length > 0) {
        output.page_labels = pageLabels;
      }
    });
  }
  if (options.includePermissions && supports("getPermissions")) {
    await guarded("Error extracting permissions", async () => {
      const permissions = await source.getPermissions();
      if (permissions && permissions.length > 0) {
        output.permissions = permissionLabels(permissions);
      }
    });
  }
  // NOTE(review): mark info is gated by `includePermissions`; confirm that is intended.
  if (options.includePermissions && supports("getMarkInfo")) {
    await guarded("Error extracting mark info", async () => {
      const markInfo = await source.getMarkInfo();
      if (markInfo && Object.keys(markInfo).length > 0) {
        output.mark_info = markInfo;
      }
    });
  }
  if (options.includeFormFields && supports("getFieldObjects")) {
    await guarded("Error extracting form fields", async () => {
      const fieldObjects = await source.getFieldObjects();
      if (!fieldObjects) {
        return;
      }
      const fields = [];
      for (const [name, fieldOrFields] of Object.entries(fieldObjects)) {
        // A name may map to a single field or to a list of fields.
        const group = Array.isArray(fieldOrFields) ? fieldOrFields : [fieldOrFields];
        for (const field of group) {
          const normalized = normalizeFormField(name, field);
          if (normalized !== undefined) {
            fields.push(normalized);
          }
        }
      }
      if (fields.length > 0) {
        output.form_fields = fields;
      }
    });
  }
  if (options.includeAttachments && supports("getAttachments")) {
    await guarded("Error extracting attachments", async () => {
      const attachments = await source.getAttachments();
      if (!attachments) {
        return;
      }
      const attachmentSummaries = [];
      for (const [name, attachment] of Object.entries(attachments)) {
        const size = attachmentSize(attachment.content);
        attachmentSummaries.push(Object.assign(
          { name },
          attachment.filename ? { filename: attachment.filename } : null,
          attachment.description ? { description: attachment.description } : null,
          size !== undefined ? { size_bytes: size } : null
        ));
      }
      if (attachmentSummaries.length > 0) {
        output.attachments = attachmentSummaries;
      }
    });
  }
  return output;
};
1206
/**
 * Reduces a raw form-field object to the reported shape.
 *
 * The name is the first available of `field.name`, `field.fieldName` and
 * `fallbackName`, trimmed. `page` is `field.page` when present, otherwise
 * `field.pageIndex + 1`. Optional properties are only emitted when the source
 * provides them.
 * NOTE(review): pdf.js field objects appear to report `page` as a zero-based index;
 * confirm whether it should be offset the same way as `pageIndex`.
 *
 * @returns The normalized field, or `undefined` when the resulting name is empty.
 */
var normalizeFormField = (fallbackName, field) => {
  const name = (field.name ?? field.fieldName ?? fallbackName).trim();
  if (!name) {
    return undefined;
  }
  let page;
  if (field.page !== undefined) {
    page = field.page;
  } else if (field.pageIndex !== undefined) {
    page = field.pageIndex + 1;
  }
  const fieldType = field.type ?? field.fieldType;
  const boundingBox = buildRectBoundingBox(field.rect);
  return Object.assign(
    { name },
    fieldType ? { type: fieldType } : null,
    field.value !== undefined ? { value: field.value } : null,
    field.defaultValue !== undefined ? { default_value: field.defaultValue } : null,
    page !== undefined ? { page } : null,
    field.id ? { id: field.id } : null,
    field.editable !== undefined ? { editable: field.editable } : null,
    field.required !== undefined ? { required: field.required } : null,
    boundingBox ? { bounding_box: boundingBox } : null
  );
};
1225
/**
 * Reduces a raw annotation to the reported shape for the given page.
 *
 * Text fields prefer the direct string and fall back to the `*Obj.str` form; the URL
 * prefers `url` over `unsafeUrl`. Optional properties are only emitted when present.
 *
 * @returns The normalized annotation, or `undefined` when it has no id, subtype,
 *          contents, title, URL or destination.
 */
var normalizeAnnotation = (annotation, pageNum) => {
  const contents = textFromAnnotationField(annotation.contents, annotation.contentsObj);
  const title = textFromAnnotationField(annotation.title, annotation.titleObj);
  const boundingBox = buildRectBoundingBox(annotation.rect);
  const subtype = annotation.subtype?.trim();
  const url = annotation.url ?? annotation.unsafeUrl;
  const hasDest = annotation.dest !== undefined;
  const carriesInformation = annotation.id || subtype || contents || title || url || hasDest;
  if (!carriesInformation) {
    return undefined;
  }
  return Object.assign(
    { page: pageNum },
    annotation.id ? { id: annotation.id } : null,
    subtype ? { subtype } : null,
    contents ? { contents } : null,
    title ? { title } : null,
    url ? { url } : null,
    hasDest ? { dest: annotation.dest } : null,
    boundingBox ? { bounding_box: boundingBox } : null
  );
};
1245
/** Tells whether `value` is a non-null object (arrays included). */
var isRecord = (value) => value !== null && typeof value === "object";
1246
/**
 * Normalizes a leaf entry of a structure tree.
 *
 * `type` and `id` are used only when they are strings, and are trimmed. A missing
 * type is reported as "content"; a missing id is left out.
 *
 * @returns `{ type, id? }`, or `undefined` when both type and id are empty.
 */
var normalizeStructureTreeContent = (rawContent) => {
  const trimmedOrEmpty = (value) => (typeof value === "string" ? value.trim() : "");
  const type = trimmedOrEmpty(rawContent.type);
  const id = trimmedOrEmpty(rawContent.id);
  if (type === "" && id === "") {
    return undefined;
  }
  const content = { type: type === "" ? "content" : type };
  return id === "" ? content : { ...content, id };
};
1256
/**
 * Normalizes one child of a structure-tree node.
 *
 * Objects that have a `role` or `children` property are handled as nested nodes;
 * every other object is handled as leaf content.
 *
 * @returns The normalized child, or `undefined` when `rawChild` is not an object or
 *          the leaf normalizer rejects it.
 */
var normalizeStructureTreeChild = (rawChild) => {
  if (!isRecord(rawChild)) {
    return undefined;
  }
  const looksLikeNode = "role" in rawChild || "children" in rawChild;
  return looksLikeNode
    ? normalizeStructureTreeNode(rawChild)
    : normalizeStructureTreeContent(rawChild);
};
1264
/**
 * Normalizes a structure-tree node and, recursively, its children.
 *
 * The role is the trimmed `role` string, or "Unknown" when it is missing, not a
 * string, or blank. Children that normalize to `undefined` are skipped, and the
 * `children` property is omitted when none remain.
 *
 * @returns `{ role, children? }`.
 */
var normalizeStructureTreeNode = (rawNode) => {
  let role = "Unknown";
  if (typeof rawNode.role === "string") {
    const trimmedRole = rawNode.role.trim();
    if (trimmedRole) {
      role = trimmedRole;
    }
  }
  const children = [];
  if (Array.isArray(rawNode.children)) {
    for (const rawChild of rawNode.children) {
      const child = normalizeStructureTreeChild(rawChild);
      if (child !== undefined) {
        children.push(child);
      }
    }
  }
  return children.length > 0 ? { role, children } : { role };
};
1272
/**
 * Collects normalized annotations for the requested pages.
 *
 * Pages are processed one after another. Pages that expose no `getAnnotations`
 * method, or yield no usable annotation, are left out. A failure on one page is
 * logged and the remaining pages are still processed.
 *
 * @param pdfDocument Loaded document.
 * @param pagesToProcess Page numbers to inspect.
 * @returns An array of `{ page, annotations }` entries.
 */
var extractAnnotations = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      if (typeof page.getAnnotations === "function") {
        const rawAnnotations = await page.getAnnotations({ intent: "display" });
        const kept = rawAnnotations
          .map((raw) => normalizeAnnotation(raw, pageNum))
          .filter((entry) => entry !== undefined);
        if (kept.length > 0) {
          collected.push({ page: pageNum, annotations: kept });
        }
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn("Error extracting annotations from page", { pageNum, error: message });
    }
  }
  return collected;
};
1291
/**
 * Collects the normalized structure tree of each requested page.
 *
 * Pages are processed one after another. Pages without a `getStructTree` method or
 * without a tree are left out. A failure on one page is logged and the remaining
 * pages are still processed.
 *
 * @param pdfDocument Loaded document.
 * @param pagesToProcess Page numbers to inspect.
 * @returns An array of `{ page, tree }` entries.
 */
var extractStructureTrees = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const rawTree = typeof page.getStructTree === "function" ? await page.getStructTree() : undefined;
      if (rawTree) {
        collected.push({ page: pageNum, tree: normalizeStructureTreeNode(rawTree) });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn("Error extracting structure tree", { pageNum, error: message });
    }
  }
  return collected;
};
1312
/**
 * Collects size and orientation information for the requested pages.
 *
 * Width and height come from the scale-1 viewport when finite, otherwise from the
 * page's view rectangle. Pages for which neither source yields finite dimensions are
 * skipped with a warning. `rotation` defaults to 0; `user_unit` and `view_box` are
 * only emitted when available. A failure on one page is logged and the remaining
 * pages are still processed.
 *
 * @param pdfDocument Loaded document.
 * @param pagesToProcess Page numbers to inspect.
 * @returns An array of `{ page, width, height, rotation, user_unit?, view_box? }`.
 */
var extractPageGeometry = async (pdfDocument, pagesToProcess) => {
  const geometry = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const viewBox = buildRectBoundingBox(page.view);
      const viewport = page.getViewport({ scale: 1 });
      let width = viewBox ? viewBox.right - viewBox.left : undefined;
      if (finiteNumber(viewport.width)) {
        width = viewport.width;
      }
      let height = viewBox ? viewBox.top - viewBox.bottom : undefined;
      if (finiteNumber(viewport.height)) {
        height = viewport.height;
      }
      if (finiteNumber(width) && finiteNumber(height)) {
        geometry.push(Object.assign(
          {
            page: pageNum,
            width,
            height,
            rotation: finiteNumber(page.rotate) ? page.rotate : 0
          },
          finiteNumber(page.userUnit) ? { user_unit: page.userUnit } : null,
          viewBox ? { view_box: viewBox } : null
        ));
      } else {
        logger3.warn("Skipping page geometry with invalid dimensions", { pageNum });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger3.warn("Error extracting page geometry", { pageNum, error: message });
    }
  }
  return geometry;
};
1340
/**
 * Builds the warning list for requested pages that do not exist.
 *
 * @param invalidPages Requested page numbers beyond the end of the document.
 * @param totalPages Number of pages in the document.
 * @returns An empty array when there are no invalid pages, otherwise a single message
 *          listing them.
 */
var buildWarnings = (invalidPages, totalPages) => {
  if (invalidPages.length > 0) {
    const listed = invalidPages.join(", ");
    return ["Requested page numbers " + listed + " exceed total pages (" + String(totalPages) + ")."];
  }
  return [];
};
1348
+ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
1349
+ const contentItems = [];
1350
+ try {
1351
+ const page = await pdfDocument.getPage(pageNum);
1352
+ const textContent = await page.getTextContent();
1353
+ const textByY = new Map;
1354
+ for (const item of textContent.items) {
1355
+ const textItem = item;
1356
+ const xCoord = textItem.transform?.[4];
1357
+ const yCoord = textItem.transform?.[5];
1358
+ if (yCoord === undefined)
1359
+ continue;
1360
+ const y = Math.round(yCoord);
1361
+ const width = textItem.width ?? textItem.str.length * 6;
1362
+ const height = textItem.height ?? Math.abs(textItem.transform?.[3] ?? 0);
1363
+ const boundingBox = buildBoundingBox2(xCoord, yCoord, width, height);
1364
+ if (!textByY.has(y)) {
1365
+ textByY.set(y, []);
1366
+ }
1367
+ textByY.get(y)?.push({
1368
+ text: textItem.str,
1369
+ x: xCoord ?? 0,
1370
+ width,
1371
+ height,
1372
+ bounding_box: boundingBox
1373
+ });
1374
+ }
1375
+ for (const [y, textParts] of textByY.entries()) {
1376
+ for (const segment of splitTextPartsIntoSegments(textParts)) {
1377
+ const contentItem = textSegmentToContentItem(y, segment);
1378
+ if (contentItem) {
1379
+ contentItems.push(contentItem);
1380
+ }
1381
+ }
1382
+ }
1383
+ if (includeImages) {
1384
+ const operatorList = await page.getOperatorList();
1385
+ const imageIndices = [];
1386
+ for (let i = 0;i < operatorList.fnArray.length; i++) {
1387
+ const op = operatorList.fnArray[i];
266
1388
  if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
267
1389
  imageIndices.push(i);
268
1390
  }
@@ -273,10 +1395,15 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
273
1395
  return null;
274
1396
  }
275
1397
  const imageName = argsArray[0];
276
- let yPosition = 0;
1398
+ let xPosition;
1399
+ let yPosition;
277
1400
  if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
278
1401
  const transform = argsArray[1];
1402
+ const xCoord = transform[4];
279
1403
  const yCoord = transform[5];
1404
+ if (xCoord !== undefined) {
1405
+ xPosition = Math.round(xCoord);
1406
+ }
280
1407
  if (yCoord !== undefined) {
281
1408
  yPosition = Math.round(yCoord);
282
1409
  }
@@ -284,9 +1411,15 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
284
1411
  const imageData = await retrieveImageData(page, imageName, pageNum);
285
1412
  const extractedImage = processImageData(imageData, pageNum, arrayIndex);
286
1413
  if (extractedImage) {
1414
+ const imageBox = buildBoundingBox2(xPosition, yPosition, extractedImage.width, extractedImage.height);
1415
+ extractedImage.bounding_box = imageBox;
287
1416
  return {
288
1417
  type: "image",
289
- yPosition,
1418
+ yPosition: imageBox?.top ?? yPosition ?? 0,
1419
+ xPosition,
1420
+ width: extractedImage.width,
1421
+ height: extractedImage.height,
1422
+ bounding_box: imageBox,
290
1423
  imageData: extractedImage
291
1424
  };
292
1425
  }
@@ -298,7 +1431,7 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
298
1431
  }
299
1432
  } catch (error) {
300
1433
  const message = error instanceof Error ? error.message : String(error);
301
- logger2.warn("Error extracting page content", {
1434
+ logger3.warn("Error extracting page content", {
302
1435
  pageNum,
303
1436
  sourceDescription,
304
1437
  error: message
@@ -311,7 +1444,7 @@ var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescr
311
1444
  }
312
1445
  ];
313
1446
  }
314
- return contentItems.sort((a, b) => b.yPosition - a.yPosition);
1447
+ return sortPageContentItems(contentItems);
315
1448
  };
316
1449
 
317
1450
  // src/pdf/loader.ts
@@ -527,7 +1660,7 @@ var resolvePath = (userPath) => {
527
1660
  };
528
1661
 
529
1662
  // src/pdf/loader.ts
530
- var logger3 = createLogger("Loader");
1663
+ var logger4 = createLogger("Loader");
531
1664
  var require2 = createRequire(import.meta.url);
532
1665
  var PDFJS_ROOT = require2.resolve("pdfjs-dist/package.json").replace("package.json", "");
533
1666
  var CMAP_URL = `${PDFJS_ROOT}cmaps/`;
@@ -651,7 +1784,7 @@ var fetchUrlBody = async (url, config) => {
651
1784
  throw new PdfError(-32600 /* InvalidRequest */, `URL fetch timed out after ${String(URL_FETCH_TIMEOUT_MS / 1000)}s.`, { cause: err });
652
1785
  }
653
1786
  const message = err instanceof Error ? err.message : String(err);
654
- logger3.warn("URL fetch failed", { url, error: message });
1787
+ logger4.warn("URL fetch failed", { url, error: message });
655
1788
  throw new PdfError(-32600 /* InvalidRequest */, `URL fetch failed for '${url}'.`, {
656
1789
  cause: err instanceof Error ? err : undefined
657
1790
  });
@@ -666,387 +1799,113 @@ var loadPdfDocument = async (source, sourceDescription) => {
666
1799
  if (source.path) {
667
1800
  pdfData = await loadLocalFile(source.path);
668
1801
  } else if (source.url) {
669
- const config = getSecurityConfig();
670
- pdfData = await fetchUrlBody(source.url, config);
671
- } else {
672
- throw new PdfError(-32602 /* InvalidParams */, `Source ${safeSource} missing 'path' or 'url'.`);
673
- }
674
- } catch (err) {
675
- if (err instanceof PdfError) {
676
- throw err;
677
- }
678
- const message = err instanceof Error ? err.message : String(err);
679
- logger3.error("Unexpected error preparing PDF source", {
680
- sourceDescription: safeSource,
681
- error: message
682
- });
683
- throw new PdfError(-32600 /* InvalidRequest */, `Failed to prepare PDF source ${safeSource}.`, {
684
- cause: err instanceof Error ? err : undefined
685
- });
686
- }
687
- const loadingTask = getDocument({
688
- data: pdfData,
689
- cMapUrl: CMAP_URL,
690
- cMapPacked: true,
691
- standardFontDataUrl: STANDARD_FONT_DATA_URL,
692
- wasmUrl: WASM_URL,
693
- iccUrl: ICC_URL
694
- });
695
- try {
696
- return await loadingTask.promise;
697
- } catch (err) {
698
- const message = err instanceof Error ? err.message : String(err);
699
- logger3.error("PDF.js loading error", { sourceDescription: safeSource, error: message });
700
- throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${safeSource}.`, { cause: err instanceof Error ? err : undefined });
701
- }
702
- };
703
-
704
- // src/pdf/parser.ts
705
- var logger4 = createLogger("Parser");
706
- var MAX_RANGE_SIZE = 1e4;
707
- var parseRangePart = (part, pages) => {
708
- const trimmedPart = part.trim();
709
- if (trimmedPart.includes("-")) {
710
- const splitResult = trimmedPart.split("-");
711
- const startStr = splitResult[0] || "";
712
- const endStr = splitResult[1];
713
- const start = parseInt(startStr, 10);
714
- const end = endStr === "" || endStr === undefined ? Infinity : parseInt(endStr, 10);
715
- if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
716
- throw new Error(`Invalid page range values: ${trimmedPart}`);
717
- }
718
- const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
719
- for (let i = start;i <= practicalEnd; i++) {
720
- pages.add(i);
721
- }
722
- if (end === Infinity && practicalEnd === start + MAX_RANGE_SIZE) {
723
- logger4.warn("Open-ended range truncated", { start, practicalEnd });
724
- }
725
- } else {
726
- const page = parseInt(trimmedPart, 10);
727
- if (Number.isNaN(page) || page <= 0) {
728
- throw new Error(`Invalid page number: ${trimmedPart}`);
729
- }
730
- pages.add(page);
731
- }
732
- };
733
- var parsePageRanges = (ranges) => {
734
- const pages = new Set;
735
- const parts = ranges.split(",");
736
- for (const part of parts) {
737
- parseRangePart(part, pages);
738
- }
739
- if (pages.size === 0) {
740
- throw new Error("Page range string resulted in zero valid pages.");
741
- }
742
- return Array.from(pages).sort((a, b) => a - b);
743
- };
744
- var getTargetPages = (sourcePages, sourceDescription) => {
745
- if (!sourcePages) {
746
- return;
747
- }
748
- try {
749
- if (typeof sourcePages === "string") {
750
- return parsePageRanges(sourcePages);
751
- }
752
- if (sourcePages.some((p) => !Number.isInteger(p) || p <= 0)) {
753
- throw new Error("Page numbers in array must be positive integers.");
754
- }
755
- const uniquePages = [...new Set(sourcePages)].sort((a, b) => a - b);
756
- if (uniquePages.length === 0) {
757
- throw new Error("Page specification resulted in an empty set of pages.");
758
- }
759
- return uniquePages;
760
- } catch (error) {
761
- const message = error instanceof Error ? error.message : String(error);
762
- throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${message}`);
763
- }
764
- };
765
- var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
766
- if (targetPages) {
767
- const pagesToProcess = targetPages.filter((p) => p <= totalPages);
768
- const invalidPages = targetPages.filter((p) => p > totalPages);
769
- return { pagesToProcess, invalidPages };
770
- }
771
- if (includeFullText) {
772
- const pagesToProcess = Array.from({ length: totalPages }, (_, i) => i + 1);
773
- return { pagesToProcess, invalidPages: [] };
774
- }
775
- return { pagesToProcess: [], invalidPages: [] };
776
- };
777
-
778
- // src/pdf/tableExtractor.ts
779
- var logger5 = createLogger("TableExtractor");
780
- var Y_TOLERANCE = 5;
781
- var COLUMN_GAP_THRESHOLD = 15;
782
- var MIN_ROWS = 2;
783
- var MIN_COLS = 2;
784
- var MIN_ROW_ITEMS = 2;
785
- var extractTextItemsWithPositions = async (page) => {
786
- const textContent = await page.getTextContent();
787
- const items = [];
788
- for (const item of textContent.items) {
789
- const textItem = item;
790
- if (!textItem.str.trim())
791
- continue;
792
- if (!textItem.transform || textItem.transform.length < 6)
793
- continue;
794
- const x = textItem.transform[4];
795
- const y = textItem.transform[5];
796
- if (x === undefined || y === undefined)
797
- continue;
798
- items.push({
799
- text: textItem.str,
800
- x,
801
- y,
802
- width: textItem.width ?? textItem.str.length * 6
803
- });
804
- }
805
- return items;
806
- };
807
- var clusterByY = (items, tolerance = Y_TOLERANCE) => {
808
- if (items.length === 0)
809
- return [];
810
- const sorted = [...items].sort((a, b) => b.y - a.y);
811
- const firstItem = sorted[0];
812
- if (!firstItem)
813
- return [];
814
- const rows = [];
815
- let currentRow = { y: firstItem.y, items: [firstItem] };
816
- for (let i = 1;i < sorted.length; i++) {
817
- const item = sorted[i];
818
- if (!item)
819
- continue;
820
- const yDiff = Math.abs(currentRow.y - item.y);
821
- if (yDiff <= tolerance) {
822
- currentRow.items.push(item);
823
- } else {
824
- rows.push(currentRow);
825
- currentRow = { y: item.y, items: [item] };
826
- }
827
- }
828
- rows.push(currentRow);
829
- for (const row of rows) {
830
- row.items.sort((a, b) => a.x - b.x);
831
- }
832
- return rows;
833
- };
834
- var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
835
- if (rows.length === 0)
836
- return [];
837
- const allXPositions = [];
838
- for (const row of rows) {
839
- for (const item of row.items) {
840
- allXPositions.push(item.x);
841
- }
842
- }
843
- if (allXPositions.length === 0)
844
- return [];
845
- allXPositions.sort((a, b) => a - b);
846
- const firstX = allXPositions[0];
847
- if (firstX === undefined)
848
- return [];
849
- const boundaries = [firstX];
850
- for (let i = 1;i < allXPositions.length; i++) {
851
- const current = allXPositions[i];
852
- const previous = allXPositions[i - 1];
853
- if (current === undefined || previous === undefined)
854
- continue;
855
- const gap = current - previous;
856
- if (gap >= gapThreshold) {
857
- boundaries.push(current);
1802
+ const config = getSecurityConfig();
1803
+ pdfData = await fetchUrlBody(source.url, config);
1804
+ } else {
1805
+ throw new PdfError(-32602 /* InvalidParams */, `Source ${safeSource} missing 'path' or 'url'.`);
858
1806
  }
859
- }
860
- return boundaries;
861
- };
862
- var assignToColumns = (row, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
863
- const cells = new Array(columnBoundaries.length).fill("");
864
- for (const item of row.items) {
865
- let colIndex = 0;
866
- for (let i = columnBoundaries.length - 1;i >= 0; i--) {
867
- const boundary = columnBoundaries[i];
868
- if (boundary !== undefined && item.x >= boundary - tolerance) {
869
- colIndex = i;
870
- break;
871
- }
1807
+ } catch (err) {
1808
+ if (err instanceof PdfError) {
1809
+ throw err;
872
1810
  }
873
- const current = cells[colIndex];
874
- cells[colIndex] = current ? `${current} ${item.text}` : item.text;
1811
+ const message = err instanceof Error ? err.message : String(err);
1812
+ logger4.error("Unexpected error preparing PDF source", {
1813
+ sourceDescription: safeSource,
1814
+ error: message
1815
+ });
1816
+ throw new PdfError(-32600 /* InvalidRequest */, `Failed to prepare PDF source ${safeSource}.`, {
1817
+ cause: err instanceof Error ? err : undefined
1818
+ });
875
1819
  }
876
- return cells;
877
- };
878
- var calculateConfidence = (rows, columnBoundaries) => {
879
- if (rows.length < MIN_ROWS || columnBoundaries.length < MIN_COLS) {
880
- return 0;
1820
+ const loadingTask = getDocument({
1821
+ data: pdfData,
1822
+ cMapUrl: CMAP_URL,
1823
+ cMapPacked: true,
1824
+ standardFontDataUrl: STANDARD_FONT_DATA_URL,
1825
+ wasmUrl: WASM_URL,
1826
+ iccUrl: ICC_URL
1827
+ });
1828
+ try {
1829
+ return await loadingTask.promise;
1830
+ } catch (err) {
1831
+ const message = err instanceof Error ? err.message : String(err);
1832
+ logger4.error("PDF.js loading error", { sourceDescription: safeSource, error: message });
1833
+ throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${safeSource}.`, { cause: err instanceof Error ? err : undefined });
881
1834
  }
882
- let score = 0;
883
- let checks = 0;
884
- for (const row of rows) {
885
- const itemsPerColumn = new Set;
886
- for (const item of row.items) {
887
- for (let i = columnBoundaries.length - 1;i >= 0; i--) {
888
- const boundary = columnBoundaries[i];
889
- if (boundary !== undefined && item.x >= boundary - COLUMN_GAP_THRESHOLD / 2) {
890
- itemsPerColumn.add(i);
891
- break;
892
- }
893
- }
1835
+ };
1836
+
1837
+ // src/pdf/parser.ts
1838
+ var logger5 = createLogger("Parser");
1839
+ var MAX_RANGE_SIZE = 1e4;
1840
+ var parseRangePart = (part, pages) => {
1841
+ const trimmedPart = part.trim();
1842
+ if (trimmedPart.includes("-")) {
1843
+ const splitResult = trimmedPart.split("-");
1844
+ const startStr = splitResult[0] || "";
1845
+ const endStr = splitResult[1];
1846
+ const start = parseInt(startStr, 10);
1847
+ const end = endStr === "" || endStr === undefined ? Infinity : parseInt(endStr, 10);
1848
+ if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
1849
+ throw new Error(`Invalid page range values: ${trimmedPart}`);
894
1850
  }
895
- score += itemsPerColumn.size / columnBoundaries.length;
896
- checks++;
897
- }
898
- if (rows.length >= 2) {
899
- const spacings = [];
900
- for (let i = 1;i < rows.length; i++) {
901
- const prevRow = rows[i - 1];
902
- const currRow = rows[i];
903
- if (prevRow && currRow) {
904
- spacings.push(Math.abs(prevRow.y - currRow.y));
905
- }
1851
+ const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
1852
+ for (let i = start;i <= practicalEnd; i++) {
1853
+ pages.add(i);
906
1854
  }
907
- if (spacings.length > 0) {
908
- const avgSpacing = spacings.reduce((a, b) => a + b, 0) / spacings.length;
909
- const variance = spacings.reduce((sum, s) => sum + (s - avgSpacing) ** 2, 0) / spacings.length;
910
- const stdDev = Math.sqrt(variance);
911
- const regularityScore = avgSpacing > 0 ? Math.max(0, 1 - stdDev / avgSpacing) : 0;
912
- score += regularityScore;
913
- checks++;
1855
+ if (end === Infinity && practicalEnd === start + MAX_RANGE_SIZE) {
1856
+ logger5.warn("Open-ended range truncated", { start, practicalEnd });
1857
+ }
1858
+ } else {
1859
+ const page = parseInt(trimmedPart, 10);
1860
+ if (Number.isNaN(page) || page <= 0) {
1861
+ throw new Error(`Invalid page number: ${trimmedPart}`);
914
1862
  }
1863
+ pages.add(page);
915
1864
  }
916
- return checks > 0 ? Math.min(1, score / checks) : 0;
917
1865
  };
918
- var identifyTableRegions = (rows) => {
919
- const regions = [];
920
- const candidateRows = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
921
- if (candidateRows.length < MIN_ROWS) {
922
- return regions;
923
- }
924
- const columnBoundaries = detectColumnBoundaries(candidateRows);
925
- if (columnBoundaries.length < MIN_COLS) {
926
- return regions;
927
- }
928
- let currentRegion = [];
929
- for (const row of candidateRows) {
930
- const alignedItems = row.items.filter((item) => {
931
- return columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);
932
- });
933
- if (alignedItems.length >= MIN_COLS - 1) {
934
- currentRegion.push(row);
935
- } else if (currentRegion.length >= MIN_ROWS) {
936
- const firstRow = currentRegion[0];
937
- const lastRow = currentRegion[currentRegion.length - 1];
938
- if (firstRow && lastRow) {
939
- regions.push({
940
- rows: currentRegion,
941
- columnBoundaries,
942
- startY: firstRow.y,
943
- endY: lastRow.y
944
- });
945
- }
946
- currentRegion = [];
947
- } else {
948
- currentRegion = [];
949
- }
1866
+ var parsePageRanges = (ranges) => {
1867
+ const pages = new Set;
1868
+ const parts = ranges.split(",");
1869
+ for (const part of parts) {
1870
+ parseRangePart(part, pages);
950
1871
  }
951
- if (currentRegion.length >= MIN_ROWS) {
952
- const firstRow = currentRegion[0];
953
- const lastRow = currentRegion[currentRegion.length - 1];
954
- if (firstRow && lastRow) {
955
- regions.push({
956
- rows: currentRegion,
957
- columnBoundaries,
958
- startY: firstRow.y,
959
- endY: lastRow.y
960
- });
961
- }
1872
+ if (pages.size === 0) {
1873
+ throw new Error("Page range string resulted in zero valid pages.");
962
1874
  }
963
- return regions;
1875
+ return Array.from(pages).sort((a, b) => a - b);
964
1876
  };
965
- var extractTablesFromPage = async (page, pageNum) => {
966
- const tables = [];
1877
+ var getTargetPages = (sourcePages, sourceDescription) => {
1878
+ if (!sourcePages) {
1879
+ return;
1880
+ }
967
1881
  try {
968
- const textItems = await extractTextItemsWithPositions(page);
969
- if (textItems.length === 0) {
970
- return tables;
1882
+ if (typeof sourcePages === "string") {
1883
+ return parsePageRanges(sourcePages);
971
1884
  }
972
- const rows = clusterByY(textItems);
973
- const tableRegions = identifyTableRegions(rows);
974
- for (let tableIndex = 0;tableIndex < tableRegions.length; tableIndex++) {
975
- const region = tableRegions[tableIndex];
976
- if (!region)
977
- continue;
978
- const tableRows = [];
979
- for (const row of region.rows) {
980
- const cells = assignToColumns(row, region.columnBoundaries);
981
- tableRows.push(cells);
982
- }
983
- const confidence = calculateConfidence(region.rows, region.columnBoundaries);
984
- if (confidence >= 0.3) {
985
- tables.push({
986
- page: pageNum,
987
- tableIndex,
988
- rows: tableRows,
989
- rowCount: tableRows.length,
990
- colCount: region.columnBoundaries.length,
991
- confidence: Math.round(confidence * 100) / 100
992
- });
993
- }
1885
+ if (sourcePages.some((p) => !Number.isInteger(p) || p <= 0)) {
1886
+ throw new Error("Page numbers in array must be positive integers.");
1887
+ }
1888
+ const uniquePages = [...new Set(sourcePages)].sort((a, b) => a - b);
1889
+ if (uniquePages.length === 0) {
1890
+ throw new Error("Page specification resulted in an empty set of pages.");
994
1891
  }
1892
+ return uniquePages;
995
1893
  } catch (error) {
996
1894
  const message = error instanceof Error ? error.message : String(error);
997
- logger5.warn("Error extracting tables from page", { pageNum, error: message });
998
- }
999
- return tables;
1000
- };
1001
- var extractTables = async (pdfDocument, pagesToProcess) => {
1002
- const allTables = [];
1003
- for (const pageNum of pagesToProcess) {
1004
- try {
1005
- const page = await pdfDocument.getPage(pageNum);
1006
- const pageTables = await extractTablesFromPage(page, pageNum);
1007
- allTables.push(...pageTables);
1008
- } catch (error) {
1009
- const message = error instanceof Error ? error.message : String(error);
1010
- logger5.warn("Error getting page for table extraction", { pageNum, error: message });
1011
- }
1895
+ throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${message}`);
1012
1896
  }
1013
- return allTables;
1014
1897
  };
1015
- var tableToMarkdown = (table) => {
1016
- if (table.rows.length === 0)
1017
- return "";
1018
- const lines = [];
1019
- const headerRow = table.rows[0];
1020
- if (!headerRow)
1021
- return "";
1022
- lines.push(`| ${headerRow.map((cell) => cell.trim() || " ").join(" | ")} |`);
1023
- lines.push(`| ${headerRow.map(() => "---").join(" | ")} |`);
1024
- for (let i = 1;i < table.rows.length; i++) {
1025
- const row = table.rows[i];
1026
- if (!row)
1027
- continue;
1028
- const paddedRow = [...row];
1029
- while (paddedRow.length < headerRow.length) {
1030
- paddedRow.push("");
1031
- }
1032
- lines.push(`| ${paddedRow.map((cell) => cell.trim() || " ").join(" | ")} |`);
1898
+ var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
1899
+ if (targetPages) {
1900
+ const pagesToProcess = targetPages.filter((p) => p <= totalPages);
1901
+ const invalidPages = targetPages.filter((p) => p > totalPages);
1902
+ return { pagesToProcess, invalidPages };
1033
1903
  }
1034
- return lines.join(`
1035
- `);
1036
- };
1037
- var tablesToMarkdown = (tables) => {
1038
- if (tables.length === 0)
1039
- return "";
1040
- const sections = ["## Extracted Tables", ""];
1041
- for (const table of tables) {
1042
- sections.push(`### Page ${table.page}, Table ${table.tableIndex + 1}`);
1043
- sections.push(`*Confidence: ${(table.confidence * 100).toFixed(0)}%*`);
1044
- sections.push("");
1045
- sections.push(tableToMarkdown(table));
1046
- sections.push("");
1904
+ if (includeFullText) {
1905
+ const pagesToProcess = Array.from({ length: totalPages }, (_, i) => i + 1);
1906
+ return { pagesToProcess, invalidPages: [] };
1047
1907
  }
1048
- return sections.join(`
1049
- `);
1908
+ return { pagesToProcess: [], invalidPages: [] };
1050
1909
  };
1051
1910
 
1052
1911
  // src/schemas/readPdf.ts
@@ -1075,7 +1934,21 @@ var readPdfArgsSchema = object({
1075
1934
  include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
1076
1935
  include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
1077
1936
  include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
1078
- include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures.")))
1937
+ include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures."))),
1938
+ include_elements: optional(bool(description("Include agent-ready structured document elements with page numbers, stable IDs, provenance, and best-effort bounding boxes."))),
1939
+ include_semantic_hints: optional(bool(description("Include deterministic semantic hints on text elements, such as heading, list item, or paragraph."))),
1940
+ include_markdown: optional(bool(description("Include a Markdown rendering of extracted pages for RAG, summarization, and agent context."))),
1941
+ include_html: optional(bool(description("Include a simple HTML rendering of extracted pages for preview, export, and downstream conversion."))),
1942
+ include_chunks: optional(bool(description("Include page-level citation-ready chunks with text, element IDs, page ranges, and best-effort bounding boxes."))),
1943
+ include_outline: optional(bool(description("Include document outline/bookmark entries when the PDF exposes them."))),
1944
+ include_annotations: optional(bool(description("Include page annotations such as links, notes, and form-related annotations with safe summary fields."))),
1945
+ include_page_labels: optional(bool(description("Include PDF page labels when available, such as roman numerals or section labels."))),
1946
+ include_page_geometry: optional(bool(description("Include page viewport geometry such as width, height, rotation, user unit, and view box."))),
1947
+ include_permissions: optional(bool(description("Include PDF permission and marking signals when exposed by the parser."))),
1948
+ include_form_fields: optional(bool(description("Include PDF form field summaries when AcroForm fields are exposed."))),
1949
+ include_attachments: optional(bool(description("Include embedded attachment metadata such as filename and size. Attachment bytes are not returned."))),
1950
+ include_structure_tree: optional(bool(description("Include best-effort tagged PDF structure trees for selected pages when the PDF exposes them."))),
1951
+ include_safety_findings: optional(bool(description("Include deterministic content safety findings for prompt-injection patterns, tiny text, and off-page text.")))
1079
1952
  });
1080
1953
 
1081
1954
  // src/handlers/readPdf.ts
@@ -1091,41 +1964,63 @@ var processSingleSource = async (source, options) => {
1091
1964
  const totalPages = pdfDocument.numPages;
1092
1965
  const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.includeMetadata, options.includePageCount);
1093
1966
  const output = { ...metadataOutput };
1094
- const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, options.includeFullText);
1967
+ const structureOutput = await extractDocumentStructure(pdfDocument, {
1968
+ includeOutline: options.includeOutline,
1969
+ includePageLabels: options.includePageLabels,
1970
+ includePermissions: options.includePermissions,
1971
+ includeFormFields: options.includeFormFields,
1972
+ includeAttachments: options.includeAttachments
1973
+ });
1974
+ Object.assign(output, structureOutput);
1975
+ const explicitPageContent = options.includeFullText || options.includeElements || options.includeSemanticHints || options.includeMarkdown || options.includeHtml || options.includeChunks || options.includeImages || options.includeSafetyFindings;
1976
+ const pageScopedMetadata = options.includeTables || options.includeAnnotations || options.includePageGeometry || options.includeStructureTree;
1977
+ const includeSelectedPageText = targetPages !== undefined && !explicitPageContent && !pageScopedMetadata;
1978
+ const shouldSelectPages = explicitPageContent || includeSelectedPageText || pageScopedMetadata;
1979
+ const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, shouldSelectPages);
1095
1980
  const warnings = buildWarnings(invalidPages, totalPages);
1096
1981
  if (warnings.length > 0) {
1097
1982
  output.warnings = warnings;
1098
1983
  }
1099
1984
  if (pagesToProcess.length > 0) {
1100
- const MAX_CONCURRENT_PAGES = 5;
1101
- const pageContents = [];
1102
- for (let i = 0;i < pagesToProcess.length; i += MAX_CONCURRENT_PAGES) {
1103
- const batch = pagesToProcess.slice(i, i + MAX_CONCURRENT_PAGES);
1104
- const batchResults = await Promise.all(batch.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
1105
- pageContents.push(...batchResults);
1106
- if (i + MAX_CONCURRENT_PAGES < pagesToProcess.length) {
1107
- await new Promise((resolve) => setImmediate(resolve));
1985
+ const needsPageContent = explicitPageContent || includeSelectedPageText;
1986
+ let pageGeometry;
1987
+ if (options.includePageGeometry || options.includeSafetyFindings) {
1988
+ pageGeometry = await extractPageGeometry(pdfDocument, pagesToProcess);
1989
+ if (pageGeometry.length > 0 && options.includePageGeometry) {
1990
+ output.page_geometry = pageGeometry;
1108
1991
  }
1109
1992
  }
1110
- output.page_contents = pageContents.map((items, idx) => ({
1111
- page: pagesToProcess[idx],
1112
- items
1113
- }));
1114
- const extractedPageTexts = pageContents.map((items, idx) => ({
1115
- page: pagesToProcess[idx],
1116
- text: items.filter((item) => item.type === "text").map((item) => item.textContent).join("")
1117
- }));
1118
- if (targetPages) {
1119
- output.page_texts = extractedPageTexts;
1120
- } else {
1121
- output.full_text = extractedPageTexts.map((p) => p.text).join(`
1993
+ if (needsPageContent) {
1994
+ const MAX_CONCURRENT_PAGES = 5;
1995
+ const pageContents = [];
1996
+ for (let i = 0;i < pagesToProcess.length; i += MAX_CONCURRENT_PAGES) {
1997
+ const batch = pagesToProcess.slice(i, i + MAX_CONCURRENT_PAGES);
1998
+ const batchResults = await Promise.all(batch.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
1999
+ pageContents.push(...batchResults);
2000
+ if (i + MAX_CONCURRENT_PAGES < pagesToProcess.length) {
2001
+ await new Promise((resolve) => setImmediate(resolve));
2002
+ }
2003
+ }
2004
+ output.page_contents = pageContents.map((items, idx) => ({
2005
+ page: pagesToProcess[idx],
2006
+ items
2007
+ }));
2008
+ const extractedPageTexts = pageContents.map((items, idx) => ({
2009
+ page: pagesToProcess[idx],
2010
+ text: items.filter((item) => item.type === "text").map((item) => item.textContent).join("")
2011
+ }));
2012
+ if (targetPages) {
2013
+ output.page_texts = extractedPageTexts;
2014
+ } else if (options.includeFullText) {
2015
+ output.full_text = extractedPageTexts.map((p) => p.text).join(`
1122
2016
 
1123
2017
  `);
1124
- }
1125
- if (options.includeImages) {
1126
- const extractedImages = pageContents.flatMap((items) => items.filter((item) => item.type === "image" && item.imageData)).map((item) => item.imageData).filter((img) => img !== undefined);
1127
- if (extractedImages.length > 0) {
1128
- output.images = extractedImages;
2018
+ }
2019
+ if (options.includeImages) {
2020
+ const extractedImages = pageContents.flatMap((items) => items.filter((item) => item.type === "image" && item.imageData)).map((item) => item.imageData).filter((img) => img !== undefined);
2021
+ if (extractedImages.length > 0) {
2022
+ output.images = extractedImages;
2023
+ }
1129
2024
  }
1130
2025
  }
1131
2026
  if (options.includeTables) {
@@ -1134,6 +2029,40 @@ var processSingleSource = async (source, options) => {
1134
2029
  output.tables = extractedTables;
1135
2030
  }
1136
2031
  }
2032
+ const buildElementsForOutput = () => buildStructuredElements(output.page_contents ?? [], output.tables, options.includeSemanticHints);
2033
+ if ((options.includeElements || options.includeSemanticHints) && output.page_contents) {
2034
+ output.elements = buildElementsForOutput();
2035
+ }
2036
+ if (options.includeMarkdown && output.page_contents) {
2037
+ output.markdown = renderMarkdownFromPageContents(output.page_contents, output.tables);
2038
+ }
2039
+ if (options.includeHtml && output.page_contents) {
2040
+ output.html = renderHtmlFromPageContents(output.page_contents, output.tables);
2041
+ }
2042
+ if (options.includeChunks && output.page_contents) {
2043
+ const chunkElements = output.elements ?? buildElementsForOutput();
2044
+ output.chunks = buildCitationChunks(chunkElements, {
2045
+ useSemanticBoundaries: options.includeSemanticHints
2046
+ });
2047
+ }
2048
+ if (options.includeSafetyFindings && output.page_contents) {
2049
+ const safetyFindings = buildSafetyFindings(output.page_contents, pageGeometry);
2050
+ if (safetyFindings.length > 0) {
2051
+ output.safety_findings = safetyFindings;
2052
+ }
2053
+ }
2054
+ if (options.includeAnnotations) {
2055
+ const annotations = await extractAnnotations(pdfDocument, pagesToProcess);
2056
+ if (annotations.length > 0) {
2057
+ output.annotations = annotations;
2058
+ }
2059
+ }
2060
+ if (options.includeStructureTree) {
2061
+ const structureTrees = await extractStructureTrees(pdfDocument, pagesToProcess);
2062
+ if (structureTrees.length > 0) {
2063
+ output.structure_trees = structureTrees;
2064
+ }
2065
+ }
1137
2066
  }
1138
2067
  individualResult = { ...individualResult, data: output, success: true };
1139
2068
  } catch (error) {
@@ -1152,9 +2081,10 @@ var processSingleSource = async (source, options) => {
1152
2081
  individualResult.success = false;
1153
2082
  individualResult.data = undefined;
1154
2083
  } finally {
1155
- if (pdfDocument && typeof pdfDocument.destroy === "function") {
2084
+ const loadingTask = pdfDocument?.loadingTask;
2085
+ if (loadingTask && typeof loadingTask.destroy === "function") {
1156
2086
  try {
1157
- await pdfDocument.destroy();
2087
+ await loadingTask.destroy();
1158
2088
  } catch (destroyError) {
1159
2089
  const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
1160
2090
  logger6.warn("Error destroying PDF document", { sourceDescription, error: message });
@@ -1170,7 +2100,21 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
1170
2100
  include_metadata,
1171
2101
  include_page_count,
1172
2102
  include_images,
1173
- include_tables
2103
+ include_tables,
2104
+ include_elements,
2105
+ include_semantic_hints,
2106
+ include_markdown,
2107
+ include_html,
2108
+ include_chunks,
2109
+ include_outline,
2110
+ include_annotations,
2111
+ include_page_labels,
2112
+ include_page_geometry,
2113
+ include_permissions,
2114
+ include_form_fields,
2115
+ include_attachments,
2116
+ include_structure_tree,
2117
+ include_safety_findings
1174
2118
  } = input;
1175
2119
  const MAX_CONCURRENT_SOURCES = 3;
1176
2120
  const results = [];
@@ -1179,7 +2123,21 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
1179
2123
  includeMetadata: include_metadata ?? true,
1180
2124
  includePageCount: include_page_count ?? true,
1181
2125
  includeImages: include_images ?? false,
1182
- includeTables: include_tables ?? false
2126
+ includeTables: include_tables ?? false,
2127
+ includeElements: include_elements ?? false,
2128
+ includeSemanticHints: include_semantic_hints ?? false,
2129
+ includeMarkdown: include_markdown ?? false,
2130
+ includeHtml: include_html ?? false,
2131
+ includeChunks: include_chunks ?? false,
2132
+ includeOutline: include_outline ?? false,
2133
+ includeAnnotations: include_annotations ?? false,
2134
+ includePageLabels: include_page_labels ?? false,
2135
+ includePageGeometry: include_page_geometry ?? false,
2136
+ includePermissions: include_permissions ?? false,
2137
+ includeFormFields: include_form_fields ?? false,
2138
+ includeAttachments: include_attachments ?? false,
2139
+ includeStructureTree: include_structure_tree ?? false,
2140
+ includeSafetyFindings: include_safety_findings ?? false
1183
2141
  };
1184
2142
  for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES) {
1185
2143
  const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
@@ -1211,6 +2169,8 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
1211
2169
  tableIndex: tbl.tableIndex,
1212
2170
  rowCount: tbl.rowCount,
1213
2171
  colCount: tbl.colCount,
2172
+ cellCount: tbl.cells?.length ?? tbl.rowCount * tbl.colCount,
2173
+ bounding_box: tbl.bounding_box,
1214
2174
  confidence: tbl.confidence
1215
2175
  }));
1216
2176
  }