markit-ai 0.1.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,654 @@
1
+ /**
2
+ * Table grid detection from vector segments and text boxes.
3
+ *
4
+ * Ported from @oharato/pdf2md-ts with TypeScript types and without
5
+ * CJK-specific borderless table heuristics. The core algorithm:
6
+ *
7
+ * 1. Classify segments as horizontal or vertical lines
8
+ * 2. Group horizontal Y-lines into table groups (split by vertical gaps)
9
+ * 3. For each group:
10
+ * a. Full grid (H+V lines): build cells from grid intersections,
11
+ * place text via raycasting
12
+ * b. H-line only (no V lines): infer columns from text X positions
13
+ * 4. Prune empty rows/cols
14
+ *
15
+ * Coordinate system: PDF native (bottom-left origin, Y increases upward).
16
+ */
17
/**
 * Finds, for the centre point of a text box, the nearest axis-aligned segment
 * in each of the four directions.
 *
 * A segment counts as horizontal when |y1 - y2| < 0.5 and as vertical when
 * |x1 - x2| < 0.5 (a very short segment can be both). A horizontal segment is
 * only considered when the box centre's X lies within the segment's X extent;
 * a vertical segment only when the centre's Y lies within its Y extent.
 * "up" means larger Y and "right" means larger X. A segment at distance 0
 * qualifies for both opposite directions.
 *
 * @param textBox  Object with `bounds.{left,right,top,bottom}`.
 * @param segments Iterable of `{ id, x1, y1, x2, y2 }`.
 * @returns `[up, down, left, right]`, each `{ direction, segmentId, distance }`;
 *          a direction with no hit has `segmentId: null` and `distance: Infinity`.
 */
function castRaysForTextBox(textBox, segments) {
  const { left: boxLeft, right: boxRight, top: boxTop, bottom: boxBottom } = textBox.bounds;
  const centerX = (boxLeft + boxRight) / 2;
  const centerY = (boxTop + boxBottom) / 2;

  const nearest = {
    up: { direction: "up", segmentId: null, distance: Infinity },
    down: { direction: "down", segmentId: null, distance: Infinity },
    left: { direction: "left", segmentId: null, distance: Infinity },
    right: { direction: "right", segmentId: null, distance: Infinity },
  };

  // Records a hit when it lies on the ray's side (distance >= 0) and is
  // strictly closer than the best hit seen so far for that direction.
  const consider = (direction, segmentId, distance) => {
    if (distance >= 0 && distance < nearest[direction].distance) {
      nearest[direction] = { direction, segmentId, distance };
    }
  };

  for (const segment of segments) {
    const horizontal = Math.abs(segment.y1 - segment.y2) < 0.5;
    const vertical = Math.abs(segment.x1 - segment.x2) < 0.5;

    if (horizontal) {
      const spanStart = Math.min(segment.x1, segment.x2);
      const spanEnd = Math.max(segment.x1, segment.x2);
      if (centerX >= spanStart && centerX <= spanEnd) {
        consider("up", segment.id, segment.y1 - centerY);
        consider("down", segment.id, centerY - segment.y1);
      }
    }

    if (vertical) {
      const spanStart = Math.min(segment.y1, segment.y2);
      const spanEnd = Math.max(segment.y1, segment.y2);
      if (centerY >= spanStart && centerY <= spanEnd) {
        consider("left", segment.id, centerX - segment.x1);
        consider("right", segment.id, segment.x1 - centerX);
      }
    }
  }

  return [nearest.up, nearest.down, nearest.left, nearest.right];
}
58
+ // ---------------------------------------------------------------------------
59
+ // Utility
60
+ // ---------------------------------------------------------------------------
61
/** Largest |x1 - x2| (or |y1 - y2|) for a segment to be classified as vertical (or horizontal) in resolveTableGrids. */
const AXIS_EPSILON = 0.8;
/** Padding added around the text extent when resolveTableGrids filters segments. */
const PAGE_MARGIN = 20;
/**
 * Returns the values in ascending order with near-duplicates collapsed.
 *
 * A value is kept only when it differs by more than 1 from the previously
 * kept value. The input is not mutated.
 *
 * @param values Iterable of numbers.
 * @returns New ascending array of retained values.
 */
function uniqueSorted(values) {
  const ascending = [...values];
  ascending.sort((lhs, rhs) => lhs - rhs);

  const distinct = [];
  for (let i = 0; i < ascending.length; i++) {
    const candidate = ascending[i];
    const isFirst = distinct.length === 0;
    if (isFirst || Math.abs(distinct[distinct.length - 1] - candidate) > 1) {
      distinct.push(candidate);
    }
  }
  return distinct;
}
72
+ // ---------------------------------------------------------------------------
73
+ // Y-line group splitting
74
+ // ---------------------------------------------------------------------------
75
/**
 * Tests whether a set of 1-D intervals chains together to span from
 * `lowerY` up to `upperY`.
 *
 * Intervals are walked in ascending `min` order. The walk fails as soon as an
 * interval starts more than `eps` beyond the point reached so far, and
 * succeeds once the reached point is within `eps` of `upperY`.
 *
 * @param intervals Array of `{ min, max }`; not mutated.
 * @param lowerY    Start of the range to cover.
 * @param upperY    End of the range to cover.
 * @param eps       Tolerance for gaps between intervals and at the upper end.
 * @returns true when the chain reaches `upperY - eps`.
 */
function chainCoversRange(intervals, lowerY, upperY, eps) {
  const ordered = [...intervals];
  ordered.sort((p, q) => p.min - q.min);

  const target = upperY - eps;
  let reach = lowerY;
  for (let i = 0; i < ordered.length; i++) {
    const { min, max } = ordered[i];
    if (min > reach + eps) {
      // Gap too wide to bridge; later intervals start even further away.
      return false;
    }
    if (max > reach) {
      reach = max;
    }
    if (reach >= target) {
      return true;
    }
  }
  return false;
}
88
/**
 * Counts the distinct (rounded) X positions whose vertical segments chain
 * together to span the gap between two horizontal lines.
 *
 * Delegates to bridgingXSet, which performs the grouping and coverage test;
 * the count is the size of that set, so the two functions always agree.
 *
 * @param upperY    Y of the upper horizontal line.
 * @param lowerY    Y of the lower horizontal line.
 * @param verticals Vertical segments `{ x1, y1, y2 }`.
 * @returns Number of bridging X positions.
 */
function countBridgingVLineCols(upperY, lowerY, verticals) {
  return bridgingXSet(upperY, lowerY, verticals).size;
}
106
/**
 * Collects the rounded X positions at which vertical segments chain together
 * to span the gap between two horizontal lines.
 *
 * Segments are grouped by `Math.round(x1)`; each group's Y intervals are then
 * checked with chainCoversRange using a tolerance of 1.5.
 *
 * @param upperY    Y of the upper horizontal line.
 * @param lowerY    Y of the lower horizontal line.
 * @param verticals Vertical segments `{ x1, y1, y2 }`.
 * @returns Set of rounded X positions that bridge the gap.
 */
function bridgingXSet(upperY, lowerY, verticals) {
  const tolerance = 1.5;

  const spansByColumn = new Map();
  for (const { x1, y1, y2 } of verticals) {
    const columnX = Math.round(x1);
    const span = { min: Math.min(y1, y2), max: Math.max(y1, y2) };
    const spans = spansByColumn.get(columnX);
    if (spans === undefined) {
      spansByColumn.set(columnX, [span]);
    } else {
      spans.push(span);
    }
  }

  const bridging = new Set();
  spansByColumn.forEach((spans, columnX) => {
    if (chainCoversRange(spans, lowerY, upperY, tolerance)) {
      bridging.add(columnX);
    }
  });
  return bridging;
}
124
/** Number of bridging vertical-line columns at or above which a row gap counts as "rich". */
const MIN_RICH_BRIDGING_COLS = 3;
/**
 * Splits an ordered list of horizontal-line Y positions into table groups.
 *
 * Consecutive lines are examined pairwise (`yLines[i - 1]` is treated as the
 * upper line, `yLines[i]` as the lower one):
 *  - no vertical column bridges the pair → the current group ends and a new
 *    one starts at the lower line;
 *  - the previous pair was rich (>= MIN_RICH_BRIDGING_COLS bridging columns)
 *    and this one is not, and the bridging columns are not just the outermost
 *    X positions → the group ends and a new one starts that shares the upper
 *    line as its first boundary;
 *  - otherwise the lower line joins the current group.
 *
 * @param yLines    Y positions of horizontal lines, in the order they should
 *                  be walked (the caller in this file passes them descending).
 * @param verticals Vertical segments `{ x1, y1, y2 }`.
 * @returns Array of groups, each an array of Y positions.
 */
function splitYLinesIntoGroups(yLines, verticals) {
  if (yLines.length === 0)
    return [];
  const eps = 1.5;

  // Fold the extremes in one pass rather than Math.min(...xs)/Math.max(...xs):
  // spreading a very large array into call arguments can throw a RangeError.
  // With no verticals both extremes stay at 0, as before.
  let globalXMin = 0;
  let globalXMax = 0;
  if (verticals.length > 0) {
    globalXMin = Infinity;
    globalXMax = -Infinity;
    for (const seg of verticals) {
      const rx = Math.round(seg.x1);
      globalXMin = Math.min(globalXMin, rx);
      globalXMax = Math.max(globalXMax, rx);
    }
  }

  const groups = [];
  let currentGroup = [yLines[0]];
  // -1 means "no previous pair in this group yet".
  let prevBridgingCols = -1;
  for (let i = 1; i < yLines.length; i++) {
    const upperY = yLines[i - 1];
    const lowerY = yLines[i];
    const cols = countBridgingVLineCols(upperY, lowerY, verticals);
    if (cols === 0) {
      // Nothing connects the two lines: hard break between tables.
      groups.push(currentGroup);
      currentGroup = [yLines[i]];
      prevBridgingCols = -1;
      continue;
    }
    if (prevBridgingCols >= MIN_RICH_BRIDGING_COLS &&
      cols < MIN_RICH_BRIDGING_COLS) {
      const bxs = bridgingXSet(upperY, lowerY, verticals);
      const isOuterFrameOnly = [...bxs].every((x) => Math.abs(x - globalXMin) <= eps || Math.abs(x - globalXMax) <= eps);
      if (!isOuterFrameOnly) {
        // Column structure changed: start a new group that reuses the upper
        // line as its top boundary.
        groups.push(currentGroup);
        currentGroup = [yLines[i - 1], yLines[i]];
        prevBridgingCols = cols;
        continue;
      }
    }
    currentGroup.push(yLines[i]);
    prevBridgingCols = cols;
  }
  groups.push(currentGroup);
  return groups;
}
162
+ // ---------------------------------------------------------------------------
163
+ // Sub-row Y-cluster expansion
164
+ // ---------------------------------------------------------------------------
165
/** A new Y cluster starts when a mid-Y lies more than this far below the current cluster's anchor value. */
const Y_CLUSTER_GAP = 10;
/** A row is only split into sub-rows when at least this many columns have a box in the top cluster. */
const MIN_COLS_IN_TOP_CLUSTER = 2;
/**
 * Returns the index of the cluster value closest to `y`.
 *
 * Ties go to the lowest index, because a later cluster only wins when it is
 * strictly closer.
 *
 * @param y        Y coordinate to classify.
 * @param clusters Array of cluster anchor Y values.
 * @returns Index into `clusters` (0 when nothing is strictly closer than the first).
 */
function assignToYCluster(y, clusters) {
  let bestIndex = 0;
  let bestGap = Math.abs(y - clusters[0]);
  clusters.forEach((anchor, index) => {
    if (index === 0) {
      return;
    }
    const gap = Math.abs(y - anchor);
    if (gap < bestGap) {
      bestGap = gap;
      bestIndex = index;
    }
  });
  return bestIndex;
}
179
/**
 * Splits grid rows that visibly contain several stacked text rows into
 * separate sub-rows, based on clusters of the text boxes' vertical centres.
 *
 * For each original row, the mid-Y values of all boxes in the row (rounded to
 * one decimal) are clustered from top to bottom; a new cluster starts when a
 * value lies more than Y_CLUSTER_GAP below the current cluster's anchor.
 * The row is split only when all of the following hold:
 *  - there are at least two clusters;
 *  - at least MIN_COLS_IN_TOP_CLUSTER columns have a box in the top cluster;
 *  - not every populated column has a box in the top cluster;
 *  - some column outside the top cluster holds more than one box.
 *
 * When a row is split, `cells` and `cellBoxes` are mutated in place: cells
 * below the row are shifted down, empty cells are appended for each new
 * sub-row, and each cell's boxes are redistributed by nearest cluster.
 *
 * @param originalRows Number of rows before expansion.
 * @param cols         Number of columns.
 * @param cells        Array of cells `{ row, col, text, rowSpan, colSpan }`; mutated.
 * @param cellBoxes    Map from cell object to its text boxes; mutated.
 * @returns Row count after expansion.
 */
function expandSubRowsByYClusters(originalRows, cols, cells, cellBoxes) {
  const centerY = (box) => (box.bounds.top + box.bounds.bottom) / 2;
  const cellAt = (row, col) => cells.find((c) => c.row === row && c.col === col);

  let insertedRows = 0;
  for (let sourceRow = 0; sourceRow < originalRows; sourceRow++) {
    // Index of this row after earlier splits pushed it down.
    const rowIndex = sourceRow + insertedRows;

    const populated = [];
    for (let col = 0; col < cols; col++) {
      const cell = cellAt(rowIndex, col);
      const boxes = cell ? cellBoxes.get(cell) : undefined;
      if (boxes && boxes.length > 0) {
        populated.push({ cell, col, boxes });
      }
    }
    if (populated.length === 0) {
      continue;
    }

    // Distinct rounded centres, highest first.
    const roundedYs = new Set();
    for (const { boxes } of populated) {
      for (const box of boxes) {
        roundedYs.add(Math.round(centerY(box) * 10) / 10);
      }
    }
    const descendingYs = Array.from(roundedYs).sort((a, b) => b - a);

    const clusters = [descendingYs[0]];
    for (const y of descendingYs.slice(1)) {
      if (clusters[clusters.length - 1] - y > Y_CLUSTER_GAP) {
        clusters.push(y);
      }
    }
    if (clusters.length < 2) {
      continue;
    }

    const topClusterCols = new Set();
    const populatedCols = new Set();
    populated.forEach(({ col, boxes }) => {
      populatedCols.add(col);
      if (boxes.some((box) => assignToYCluster(centerY(box), clusters) === 0)) {
        topClusterCols.add(col);
      }
    });
    const tooFewInTop = topClusterCols.size < MIN_COLS_IN_TOP_CLUSTER;
    const everyColInTop = topClusterCols.size >= populatedCols.size;
    if (tooFewInTop || everyColInTop) {
      continue;
    }
    const hasStackedSparseCol = populated.some(({ col, boxes }) => boxes.length > 1 && !topClusterCols.has(col));
    if (!hasStackedSparseCol) {
      continue;
    }

    const subRowCount = clusters.length;
    const extraRows = subRowCount - 1;

    // Make room below the current row, then add blank cells for the new sub-rows.
    cells.forEach((cell) => {
      if (cell.row > rowIndex) {
        cell.row += extraRows;
      }
    });
    for (let offset = 1; offset <= extraRows; offset++) {
      for (let col = 0; col < cols; col++) {
        cells.push({
          row: rowIndex + offset,
          col,
          text: "",
          rowSpan: 1,
          colSpan: 1,
        });
      }
    }

    // Redistribute each populated cell's boxes across the sub-rows.
    for (const { cell: sourceCell, col, boxes } of populated) {
      const buckets = Array.from({ length: subRowCount }, () => []);
      boxes.forEach((box) => {
        buckets[assignToYCluster(centerY(box), clusters)].push(box);
      });
      if (buckets[0].length === 0) {
        cellBoxes.delete(sourceCell);
      } else {
        cellBoxes.set(sourceCell, buckets[0]);
      }
      for (let offset = 1; offset < subRowCount; offset++) {
        if (buckets[offset].length === 0) {
          continue;
        }
        const target = cellAt(rowIndex + offset, col);
        if (target) {
          cellBoxes.set(target, buckets[offset]);
        }
      }
    }
    insertedRows += extraRows;
  }
  return originalRows + insertedRows;
}
260
+ // ---------------------------------------------------------------------------
261
+ // Full grid table (H + V lines)
262
+ // ---------------------------------------------------------------------------
263
/**
 * Creates the empty cells of a `rows` × `cols` grid in row-major order.
 *
 * @param rows Number of rows.
 * @param cols Number of columns.
 * @returns Array of `{ row, col, text: "", rowSpan: 1, colSpan: 1 }`.
 */
function buildCells(rows, cols) {
  const grid = [];
  let row = 0;
  while (row < rows) {
    let col = 0;
    while (col < cols) {
      grid.push({ row, col, text: "", rowSpan: 1, colSpan: 1 });
      col += 1;
    }
    row += 1;
  }
  return grid;
}
272
/**
 * Builds a table grid for one group of ruled lines and fills it with text.
 *
 * Steps:
 *  1. Create an empty (yLines.length - 1) × (xLines.length - 1) grid.
 *  2. If any text box is centred within 20 units above the top line and
 *     within the grid's X extent, prepend a header row and place those boxes
 *     by column.
 *  3. Assign every text box centred inside the grid to the row/column band
 *     containing its centre, provided at least one ray cast from it hits a
 *     segment.
 *  4. Let expandSubRowsByYClusters split rows that contain stacked text.
 *  5. Merge each cell's boxes into text: boxes whose tops are within 5 units
 *     of the line's first box form one line; boxes on a line are joined
 *     left-to-right with spaces, and lines are joined with "<br>".
 *  6. Prune empty rows and columns.
 *
 * @param pageNumber       Page number stored on the resulting grid.
 * @param yLines           Horizontal-line Y positions; index 0 is used as the
 *                         top (largest Y) and the last entry as the bottom.
 * @param xLines           Vertical-line X positions; index 0 is used as the
 *                         left edge and the last entry as the right edge.
 * @param filteredSegments Segments used for ray casting.
 * @param textBoxes        Text boxes `{ id, text, bounds }`.
 * @returns `{ grid, consumedIds }`, where `consumedIds` lists the ids of the
 *          text boxes placed into the grid.
 */
function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes) {
  const gridRows = yLines.length - 1;
  const cols = xLines.length - 1;
  let rows = gridRows;
  const cells = buildCells(rows, cols);
  const consumedIds = [];
  const yMin = yLines[yLines.length - 1];
  const yMax = yLines[0];
  const xMin = xLines[0];
  const xMax = xLines[xLines.length - 1];

  const centerX = (tb) => (tb.bounds.left + tb.bounds.right) / 2;
  const centerY = (tb) => (tb.bounds.top + tb.bounds.bottom) / 2;
  // Index of the column band [xLines[i], xLines[i + 1]] containing x, or -1.
  const columnAt = (x) => xLines.findIndex((lineX, idx) => {
    const next = xLines[idx + 1];
    return next !== undefined && x >= lineX && x <= next;
  });
  // Index of the row band [yLines[i + 1], yLines[i]] containing y, or -1.
  const rowAt = (y) => yLines.findIndex((lineY, idx) => {
    const next = yLines[idx + 1];
    return next !== undefined && y <= lineY && y >= next;
  });
  const cellAt = (row, col) => cells.find((c) => c.row === row && c.col === col);

  // Look for header text boxes just above the grid
  const headerBoxes = textBoxes.filter((tb) => {
    const cy = centerY(tb);
    const cx = centerX(tb);
    return cy > yMax && cy <= yMax + 20 && cx >= xMin && cx <= xMax;
  });
  const hasHeaderRow = headerBoxes.length > 0;
  if (hasHeaderRow) {
    rows += 1;
    for (const cell of cells)
      cell.row += 1;
    for (let col = 0; col < cols; col++) {
      cells.push({ row: 0, col, text: "", rowSpan: 1, colSpan: 1 });
    }
    for (const tb of headerBoxes) {
      const col = columnAt(centerX(tb));
      if (col < 0 || col >= cols)
        continue;
      const cell = cellAt(0, col);
      if (!cell)
        continue;
      cell.text = cell.text.length === 0 ? tb.text : `${cell.text} ${tb.text}`;
      consumedIds.push(tb.id);
    }
  }

  // Grid rows are shifted down by one when a header row was prepended.
  const rowOffset = hasHeaderRow ? 1 : 0;
  const cellBoxes = new Map();
  for (const tb of textBoxes) {
    const cx = centerX(tb);
    const cy = centerY(tb);
    if (cy < yMin || cy > yMax || cx < xMin || cx > xMax)
      continue;
    const gridRow = rowAt(cy);
    if (gridRow < 0 || gridRow >= gridRows)
      continue;
    const col = columnAt(cx);
    if (col < 0 || col >= cols)
      continue;
    // Ray casting scans every segment, so run it only for boxes that already
    // passed the cheap row/column checks.
    const rayConfidence = castRaysForTextBox(tb, filteredSegments).filter((r) => r.segmentId !== null).length;
    if (rayConfidence === 0)
      continue;
    const cell = cellAt(gridRow + rowOffset, col);
    if (!cell)
      continue;
    const boxes = cellBoxes.get(cell);
    if (boxes)
      boxes.push(tb);
    else
      cellBoxes.set(cell, [tb]);
    consumedIds.push(tb.id);
  }

  rows = expandSubRowsByYClusters(rows, cols, cells, cellBoxes);

  // Merge text boxes within each cell into cell text
  for (const [cell, boxes] of cellBoxes.entries()) {
    if (boxes.length === 0)
      continue;
    boxes.sort((a, b) => b.bounds.top - a.bounds.top);
    const lines = [];
    let lineBoxes = [];
    let lineTop = boxes[0].bounds.top;
    for (const box of boxes) {
      if (Math.abs(box.bounds.top - lineTop) > 5) {
        lines.push(lineBoxes);
        lineBoxes = [box];
        lineTop = box.bounds.top;
      }
      else {
        lineBoxes.push(box);
      }
    }
    lines.push(lineBoxes);
    // Within one visual line, order by X so that a box whose top is slightly
    // lower than its neighbours is not moved to the end of the line.
    cell.text = lines
      .map((line) => line
        .sort((a, b) => a.bounds.left - b.bounds.left)
        .map((box) => box.text)
        .join(" "))
      .join("<br>");
  }

  const grid = pruneEmptyRowsAndCols({
    pageNumber,
    rows,
    cols,
    cells,
    warnings: [],
    topY: yLines[0],
    isBorderless: false,
  });
  return { grid, consumedIds };
}
374
+ // ---------------------------------------------------------------------------
375
+ // H-line-only table (inferred columns)
376
+ // ---------------------------------------------------------------------------
377
+ const COL_GAP_THRESHOLD = 20;
378
+ const HONLY_ROW_GAP = 30;
379
+ const HONLY_ROW_TOLERANCE = 8;
380
+ const MIN_TABLE_HEIGHT = 24;
381
+ const MIN_LEFT_SPREAD = 50;
382
+ function inferXLinesFromBoxes(textBoxes, xMin, xMax) {
383
+ const centers = textBoxes
384
+ .map((tb) => (tb.bounds.left + tb.bounds.right) / 2)
385
+ .sort((a, b) => a - b);
386
+ if (centers.length === 0)
387
+ return [xMin, xMax];
388
+ const boundaries = [xMin];
389
+ for (let i = 1; i < centers.length; i++) {
390
+ if (centers[i] - centers[i - 1] >= COL_GAP_THRESHOLD) {
391
+ boundaries.push((centers[i - 1] + centers[i]) / 2);
392
+ }
393
+ }
394
+ boundaries.push(xMax);
395
+ return boundaries;
396
+ }
397
/**
 * Builds a table for a group of horizontal lines that has no usable vertical
 * lines, inferring the columns from where the text sits.
 *
 * Candidate boxes are those whose id is not in `alreadyConsumed`. The table
 * takes (a) boxes whose edges lie within 30 units of [xMin, xMax] and whose
 * centre Y is between the bottom and top line, plus (b) boxes centred within
 * [xMin, xMax] below the bottom line, walked downward until the vertical step
 * from the previous accepted box exceeds HONLY_ROW_GAP. Boxes are grouped
 * into visual rows (mid-Y within HONLY_ROW_TOLERANCE of the row's first box)
 * and into columns via inferXLinesFromBoxes.
 *
 * @param pageNumber      Page number stored on the resulting grid.
 * @param yLines          Horizontal-line Y positions; index 0 is used as the
 *                        top and the last entry as the bottom.
 * @param xMin            Left extent of the horizontal lines.
 * @param xMax            Right extent of the horizontal lines.
 * @param textBoxes       Text boxes `{ id, text, bounds }`.
 * @param alreadyConsumed Collection with `has(id)` of ids to skip.
 * @returns `{ grid, consumedIds }`, or null when no boxes qualify or their
 *          left edges spread less than MIN_LEFT_SPREAD.
 */
function buildHLineOnlyTable(pageNumber, yLines, xMin, xMax, textBoxes, alreadyConsumed) {
  const BOX_LEFT_TOLERANCE = 30;
  const midX = (tb) => (tb.bounds.left + tb.bounds.right) / 2;
  const midY = (tb) => (tb.bounds.top + tb.bounds.bottom) / 2;
  const yMax = yLines[0];
  const yMin = yLines[yLines.length - 1];

  // Single pass over the unconsumed boxes, sorting them into the two pools.
  const inRange = [];
  const belowYMin = [];
  for (const tb of textBoxes) {
    if (alreadyConsumed.has(tb.id)) {
      continue;
    }
    const cx = midX(tb);
    const cy = midY(tb);
    const withinEdges = tb.bounds.left >= xMin - BOX_LEFT_TOLERANCE &&
      tb.bounds.right <= xMax + BOX_LEFT_TOLERANCE;
    if (withinEdges && cy >= yMin && cy <= yMax) {
      inRange.push(tb);
    }
    if (cx >= xMin && cx <= xMax && cy < yMin) {
      belowYMin.push(tb);
    }
  }

  // Extend downward below yMin, nearest boxes first.
  belowYMin.sort((a, b) => midY(b) - midY(a));
  const extensionBoxes = [];
  let frontierY = yMin;
  for (const tb of belowYMin) {
    const cy = midY(tb);
    if (frontierY - cy > HONLY_ROW_GAP) {
      break;
    }
    extensionBoxes.push(tb);
    frontierY = cy;
  }

  const tableBoxes = inRange.concat(extensionBoxes);
  if (tableBoxes.length === 0) {
    return null;
  }
  const leftEdges = tableBoxes.map((tb) => tb.bounds.left);
  const leftSpread = Math.max(...leftEdges) - Math.min(...leftEdges);
  if (leftSpread < MIN_LEFT_SPREAD) {
    return null;
  }
  const xLines = inferXLinesFromBoxes(tableBoxes, xMin, xMax);
  if (xLines.length < 2) {
    return null;
  }
  const cols = xLines.length - 1;

  // Build visual rows: top to bottom, and left to right within ~the same Y.
  const ordered = tableBoxes.slice().sort((a, b) => {
    const ya = midY(a);
    const yb = midY(b);
    return Math.abs(ya - yb) > 0.5 ? yb - ya : a.bounds.left - b.bounds.left;
  });
  const visualRows = [];
  for (const box of ordered) {
    const cy = midY(box);
    const current = visualRows[visualRows.length - 1];
    if (current && Math.abs(current.midY - cy) <= HONLY_ROW_TOLERANCE) {
      current.boxes.push(box);
    } else {
      visualRows.push({ midY: cy, boxes: [box] });
    }
  }
  if (visualRows.length === 0) {
    return null;
  }

  const cells = [];
  const consumedIds = [];
  visualRows.forEach((vrow, rowIdx) => {
    const boxesByCol = new Map();
    for (const box of vrow.boxes) {
      const cx = midX(box);
      const col = xLines.findIndex((lineX, idx) => {
        const next = xLines[idx + 1];
        return next !== undefined && cx >= lineX && cx <= next;
      });
      if (col < 0 || col >= cols) {
        continue;
      }
      const bucket = boxesByCol.get(col);
      if (bucket) {
        bucket.push(box);
      } else {
        boxesByCol.set(col, [box]);
      }
    }
    for (let c = 0; c < cols; c++) {
      const colBoxes = boxesByCol.get(c) ?? [];
      colBoxes.sort((a, b) => a.bounds.left - b.bounds.left);
      cells.push({
        row: rowIdx,
        col: c,
        text: colBoxes.map((b) => b.text).join(" "),
        rowSpan: 1,
        colSpan: 1,
      });
      for (const b of colBoxes) {
        consumedIds.push(b.id);
      }
    }
  });

  const grid = pruneEmptyRowsAndCols({
    pageNumber,
    rows: visualRows.length,
    cols,
    cells,
    warnings: [],
    // The top of the content is the first visual row's mid-Y.
    topY: visualRows[0].midY,
    isBorderless: false,
  });
  return { grid, consumedIds };
}
499
+ // ---------------------------------------------------------------------------
500
+ // Pruning
501
+ // ---------------------------------------------------------------------------
502
+ function pruneEmptyRowsAndCols(table) {
503
+ const occupiedRows = new Set(table.cells.filter((c) => c.text.trim().length > 0).map((c) => c.row));
504
+ const occupiedCols = new Set(table.cells.filter((c) => c.text.trim().length > 0).map((c) => c.col));
505
+ if (occupiedRows.size === 0)
506
+ return table;
507
+ const rowMap = new Map();
508
+ let newRow = 0;
509
+ for (let r = 0; r < table.rows; r++) {
510
+ if (occupiedRows.has(r))
511
+ rowMap.set(r, newRow++);
512
+ }
513
+ const colMap = new Map();
514
+ let newCol = 0;
515
+ for (let c = 0; c < table.cols; c++) {
516
+ if (occupiedCols.has(c))
517
+ colMap.set(c, newCol++);
518
+ }
519
+ const prunedCells = table.cells
520
+ .filter((c) => occupiedRows.has(c.row) && occupiedCols.has(c.col))
521
+ .map((c) => ({
522
+ ...c,
523
+ row: rowMap.get(c.row) ?? c.row,
524
+ col: colMap.get(c.col) ?? c.col,
525
+ }));
526
+ return { ...table, rows: newRow, cols: newCol, cells: prunedCells };
527
+ }
528
+ // ---------------------------------------------------------------------------
529
+ // Diagram vs table discrimination
530
+ // ---------------------------------------------------------------------------
531
+ /** Maximum column count for a plausible data table. */
532
+ const MAX_TABLE_COLS = 25;
533
+ /**
534
+ * Returns true if a grid looks like a vector diagram rather than a data table.
535
+ *
536
+ * Heuristics (any match → diagram):
537
+ * 1. Column count > 25 (diagrams create many X-lines from box edges)
538
+ * 2. Fill ratio < 25% (most cells empty — scattered boxes)
539
+ * 3. Fill < 50% AND duplicate text ratio > 30% (repeating labels in a
540
+ * diagram layout, e.g. "Hash", "Transaction" appearing in each column)
541
+ * 4. Fill < 50% AND cols >= 6 (moderate sparseness with wide grid)
542
+ */
543
+ function isDiagram(grid) {
544
+ const totalCells = grid.rows * grid.cols;
545
+ if (totalCells === 0)
546
+ return true;
547
+ const filled = grid.cells.filter((c) => c.text.trim().length > 0);
548
+ const fillRatio = filled.length / totalCells;
549
+ // Very high column count
550
+ if (grid.cols > MAX_TABLE_COLS)
551
+ return true;
552
+ // Very sparse
553
+ if (fillRatio < 0.25)
554
+ return true;
555
+ // Compute duplicate text ratio among non-trivial cells.
556
+ // Exclude short values (≤3 chars) like "—", "V", "YES", "NO" which
557
+ // naturally repeat in real data tables.
558
+ const substantive = filled.filter((c) => c.text.trim().length > 3);
559
+ const uniqueTexts = new Set(substantive.map((c) => c.text.trim())).size;
560
+ const dupRatio = substantive.length > 2 ? 1 - uniqueTexts / substantive.length : 0;
561
+ // Sparse + highly duplicated substantive text → repeating diagram
562
+ if (fillRatio < 0.5 && dupRatio > 0.3)
563
+ return true;
564
+ // High duplication + wide grid → repeating diagram even at moderate fill
565
+ if (dupRatio > 0.4 && grid.cols >= 6)
566
+ return true;
567
+ // Sparse + wide grid with no substantive text to judge
568
+ if (fillRatio < 0.4 && grid.cols >= 6)
569
+ return true;
570
+ return false;
571
+ }
572
/**
 * Detect all table grids on a single page from its text boxes and segments.
 *
 * Pipeline:
 *  1. Classify segments as vertical / horizontal (coordinate delta within
 *     AXIS_EPSILON).
 *  2. Drop segments lying outside the padded bounding area of the text.
 *  3. Collect the distinct horizontal-line Y positions (top first) and split
 *     them into groups with splitYLinesIntoGroups.
 *  4. For each group at least MIN_TABLE_HEIGHT tall: build a full grid when
 *     the group has at least two distinct vertical-line X positions,
 *     otherwise try an H-line-only table on the not-yet-consumed text boxes.
 *  5. Discard grids that isDiagram flags; their text box ids are not reported
 *     as consumed.
 *
 * Horizontal segments are handled independently of endpoint order: their left
 * and right X are taken as min/max of x1 and x2.
 *
 * @param pageNumber Page number stored on each resulting grid.
 * @param textBoxes  Text boxes `{ id, text, bounds: { left, right, top, bottom } }`.
 * @param segments   Line segments `{ id, x1, y1, x2, y2 }`.
 * @returns `{ grids, consumedIds }`: the accepted grids and the ids of the
 *          text boxes placed in them.
 */
export function resolveTableGrids(pageNumber, textBoxes, segments) {
  // Folds instead of Math.min(...xs)/Math.max(...xs): spreading a very large
  // array into call arguments can throw a RangeError.
  const minOf = (values) => values.reduce((acc, v) => Math.min(acc, v), Infinity);
  const maxOf = (values) => values.reduce((acc, v) => Math.max(acc, v), -Infinity);
  const leftX = (s) => Math.min(s.x1, s.x2);
  const rightX = (s) => Math.max(s.x1, s.x2);

  const vertical = segments.filter((s) => Math.abs(s.x1 - s.x2) <= AXIS_EPSILON);
  const horizontal = segments.filter((s) => Math.abs(s.y1 - s.y2) <= AXIS_EPSILON);

  // Filter segments to the text's visible area
  const textYValues = textBoxes.flatMap((t) => [t.bounds.bottom, t.bounds.top]);
  const textYMin = textYValues.length > 0 ? minOf(textYValues) - PAGE_MARGIN : -Infinity;
  const textYMax = textYValues.length > 0 ? maxOf(textYValues) + PAGE_MARGIN : Infinity;
  const textXValues = textBoxes.flatMap((t) => [t.bounds.left, t.bounds.right]);
  const textXMin = textXValues.length > 0 ? minOf(textXValues) - 100 : -Infinity;
  const textXMax = textXValues.length > 0 ? maxOf(textXValues) + 100 : Infinity;

  const filteredH = horizontal.filter((s) => s.y1 >= textYMin &&
    s.y1 <= textYMax &&
    leftX(s) <= textXMax &&
    rightX(s) >= textXMin);
  const hMaxX2 = filteredH.length > 0 ? maxOf(filteredH.map((s) => rightX(s))) : textXMax;
  // Vertical lines may sit slightly right of the text if the horizontal
  // lines extend that far.
  const vSegXMax = Math.max(textXMax, hMaxX2 + PAGE_MARGIN);
  const filteredV = vertical.filter((s) => {
    const segMin = Math.min(s.y1, s.y2);
    const segMax = Math.max(s.y1, s.y2);
    return (segMax >= textYMin &&
      segMin <= textYMax &&
      s.x1 >= textXMin &&
      s.x1 <= vSegXMax);
  });

  // Distinct horizontal-line Y positions, largest (top) first.
  const allYLines = uniqueSorted(filteredH.flatMap((s) => [s.y1, s.y2])).sort((a, b) => b - a);
  if (allYLines.length < 2) {
    return { grids: [], consumedIds: [] };
  }

  const filteredSegments = [...filteredH, ...filteredV];
  const yGroups = splitYLinesIntoGroups(allYLines, filteredV);
  const grids = [];
  // gridConsumedIds[i] holds the ids consumed by grids[i].
  const gridConsumedIds = [];
  // Ids consumed by any grid so far; consulted by H-line-only tables.
  const consumedSoFar = new Set();
  const recordResult = (result) => {
    grids.push(result.grid);
    gridConsumedIds.push(result.consumedIds);
    for (const id of result.consumedIds)
      consumedSoFar.add(id);
  };

  for (const yLines of yGroups) {
    if (yLines.length < 2)
      continue;
    const yMin = yLines[yLines.length - 1];
    const yMax = yLines[0];
    // Both table kinds require a minimum height.
    if (yMax - yMin < MIN_TABLE_HEIGHT)
      continue;
    // Verticals that overlap the group's Y range by more than the tolerance.
    const groupVerticals = filteredV.filter((s) => {
      const segMin = Math.min(s.y1, s.y2);
      const segMax = Math.max(s.y1, s.y2);
      return segMin < yMax - 1.5 && segMax > yMin + 1.5;
    });
    const groupXLines = uniqueSorted(groupVerticals.flatMap((s) => [s.x1, s.x2]));
    if (groupXLines.length < 2) {
      const groupHoriz = filteredH.filter((s) => s.y1 >= yMin - 1.5 && s.y1 <= yMax + 1.5);
      if (groupHoriz.length === 0)
        continue;
      const hxMin = minOf(groupHoriz.map((s) => leftX(s)));
      const hxMax = maxOf(groupHoriz.map((s) => rightX(s)));
      const result = buildHLineOnlyTable(pageNumber, yLines, hxMin, hxMax, textBoxes, consumedSoFar);
      if (result)
        recordResult(result);
      continue;
    }
    recordResult(buildTableGrid(pageNumber, yLines, groupXLines, filteredSegments, textBoxes));
  }

  // Filter out grids that look like vector diagrams, not data tables.
  // Their consumed text box IDs are released so the text becomes free text.
  const filteredGrids = [];
  const filteredConsumedIds = [];
  for (let i = 0; i < grids.length; i++) {
    if (isDiagram(grids[i]))
      continue;
    filteredGrids.push(grids[i]);
    filteredConsumedIds.push(...gridConsumedIds[i]);
  }
  return { grids: filteredGrids, consumedIds: filteredConsumedIds };
}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Running header/footer detection and removal.
3
+ *
4
+ * Many PDFs have repeated text at the top or bottom of every page:
5
+ * document titles, chapter names, page numbers, copyright notices.
6
+ * These pollute the markdown output as false headings or noise.
7
+ *
8
+ * Algorithm:
9
+ * 1. For each page, bucket text boxes by Y position (top/bottom zones)
10
+ * 2. Collect the text content at each zone across all pages
11
+ * 3. Text appearing on >20% of pages OR 8+ consecutive pages is a
12
+ * running header/footer
13
+ * 4. Remove matching text boxes before further processing
14
+ */
15
+ import type { PageContent } from "./types.js";
16
+ /**
17
+ * Detect and remove running headers and footers from all pages.
18
+ * Mutates the pages array in place, removing header/footer text boxes.
19
+ *
20
+ * Uses two strategies:
21
+ * 1. Global frequency: text appearing on > 20% of all pages
22
+ * 2. Consecutive runs: text appearing on 8+ consecutive pages
23
+ *
+ * @param pages - All pages of the document; header/footer text boxes are removed from these in place.
+ */
24
+ export declare function stripHeadersFooters(pages: PageContent[]): void;