markit-ai 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -258,6 +258,122 @@ function expandSubRowsByYClusters(originalRows, cols, cells, cellBoxes) {
258
258
  return originalRows + addedRows;
259
259
  }
260
260
  // ---------------------------------------------------------------------------
261
+ // Cross-column text box splitting
262
+ // ---------------------------------------------------------------------------
263
/**
 * Locate the grid column that contains a horizontal coordinate.
 *
 * Column `i` is the closed interval [xLines[i], xLines[i + 1]]. Intervals are
 * tested in ascending index order, so a coordinate lying exactly on a shared
 * boundary resolves to the lower-indexed column.
 *
 * @param {number} x - Horizontal position to look up.
 * @param {number[]} xLines - Column boundary positions.
 * @returns {number} Index of the first column containing `x`, or -1 if none does.
 */
function findCol(x, xLines) {
    const lastBoundary = xLines.length - 1;
    // The final entry only closes the last column; it never opens one.
    return xLines.findIndex((lowerEdge, idx) => idx < lastBoundary && x >= lowerEdge && x <= xLines[idx + 1]);
}
274
/**
 * Split text boxes that straddle vertical column boundaries into one virtual
 * text box per column.
 *
 * A box is considered to span columns when its left edge (moved inward by
 * MARGIN) and its right edge (moved inward by MARGIN) fall into different
 * columns according to `findCol`. The words of such a box are distributed
 * left to right: each non-final column receives the longest run of whole
 * words whose character count does not exceed that column's proportional
 * share of the text (always at least one word), and the final column receives
 * whatever is left. Words are never cut in half.
 *
 * Boxes are returned unchanged (same object) when they lie outside the grid,
 * sit in a single column, are too narrow for the inward-moved edges to be
 * ordered left-to-right, have non-positive width, or contain at most one word.
 *
 * Each generated piece copies every property of the source box and overrides
 * `id` (`<id>-split<col>`), `text`, and `bounds.left` / `bounds.right`.
 *
 * @param {Array<{id: *, text: string, bounds: {left: number, right: number}}>} textBoxes
 *   Text boxes to examine.
 * @param {number[]} xLines - Column boundary positions (as consumed by `findCol`).
 * @returns {Array<object>} Text boxes with column-spanning ones replaced by their pieces.
 */
function splitCrossColumnBoxes(textBoxes, xLines) {
    const MARGIN = 5; // allow small overlap before considering it cross-column
    const result = [];
    for (const tb of textBoxes) {
        const { left, right } = tb.bounds;
        const leftCol = findCol(left + MARGIN, xLines);
        const rightCol = findCol(right - MARGIN, xLines);
        // Keep as-is when outside the grid or not spanning. `>=` (not `===`)
        // matters: for a box narrower than 2 * MARGIN that straddles a boundary
        // the inward-moved edges cross over, giving leftCol > rightCol. Such a
        // box must be kept, otherwise the split loop below would never run and
        // the box would silently disappear from the output.
        if (leftCol < 0 || rightCol < 0 || leftCol >= rightCol) {
            result.push(tb);
            continue;
        }
        const totalWidth = right - left;
        if (totalWidth <= 0) {
            result.push(tb);
            continue;
        }
        // Trim first so leading/trailing whitespace does not yield empty "words"
        // (which would produce empty or space-prefixed pieces).
        const words = tb.text.trim().split(/\s+/);
        if (words.length <= 1) {
            // A single word (or blank text) cannot be split at a word boundary.
            result.push(tb);
            continue;
        }
        let remainingWords = words;
        let currentLeft = left;
        for (let col = leftCol; col <= rightCol && remainingWords.length > 0; col++) {
            const isLastCol = col === rightCol;
            // The last column's piece always extends to the box's right edge.
            const segmentRight = isLastCol ? right : Math.min(xLines[col + 1], right);
            // Last column takes everything that is left.
            let takeCount = remainingWords.length;
            if (!isLastCol) {
                // Character budget for this column, proportional to its share of the width.
                const fractionOfTotal = (segmentRight - currentLeft) / totalWidth;
                const approxChars = Math.round(fractionOfTotal * tb.text.length);
                let charCount = 0;
                takeCount = 0;
                for (const [w, word] of remainingWords.entries()) {
                    // Every word after the first costs one extra char for its separating space.
                    const nextCount = charCount + word.length + (w > 0 ? 1 : 0);
                    // The first word is always taken, even when it exceeds the budget.
                    if (nextCount > approxChars && takeCount > 0) {
                        break;
                    }
                    charCount = nextCount;
                    takeCount = w + 1;
                }
            }
            result.push({
                ...tb,
                id: `${tb.id}-split${col}`,
                text: remainingWords.slice(0, takeCount).join(" "),
                bounds: {
                    ...tb.bounds,
                    left: currentLeft,
                    right: segmentRight,
                },
            });
            remainingWords = remainingWords.slice(takeCount);
            currentLeft = segmentRight;
        }
    }
    return result;
}
376
+ // ---------------------------------------------------------------------------
261
377
  // Full grid table (H + V lines)
262
378
  // ---------------------------------------------------------------------------
263
379
  function buildCells(rows, cols) {
@@ -278,11 +394,26 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
278
394
  const yMax = yLines[0];
279
395
  const xMin = xLines[0];
280
396
  const xMax = xLines[xLines.length - 1];
281
- // Look for header text boxes just above the grid
397
+ // Split text boxes that span multiple columns before placement
398
+ const splitBoxes = splitCrossColumnBoxes(textBoxes, xLines);
399
+ // Track which split piece IDs get placed in cells, so we can consume
400
+ // the original (unsplit) text box IDs too.
401
+ const placedSplitIds = new Set();
402
+ // Look for header text boxes just above the grid.
403
+ // Use the ORIGINAL (unsplit) text boxes for header detection so that
404
+ // wide paragraph text isn't falsely split into column-sized header chunks.
405
+ // Reject boxes wider than 1.5 columns — those are paragraph text, not headers.
406
+ const avgColWidth = (xMax - xMin) / cols;
407
+ const maxHeaderBoxWidth = avgColWidth * 1.5;
282
408
  const headerBoxes = textBoxes.filter((tb) => {
283
409
  const cy = (tb.bounds.top + tb.bounds.bottom) / 2;
284
410
  const cx = (tb.bounds.left + tb.bounds.right) / 2;
285
- return cy > yMax && cy <= yMax + 20 && cx >= xMin && cx <= xMax;
411
+ const boxWidth = tb.bounds.right - tb.bounds.left;
412
+ return (cy > yMax &&
413
+ cy <= yMax + 20 &&
414
+ cx >= xMin &&
415
+ cx <= xMax &&
416
+ boxWidth <= maxHeaderBoxWidth);
286
417
  });
287
418
  if (headerBoxes.length > 0) {
288
419
  rows += 1;
@@ -308,7 +439,7 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
308
439
  }
309
440
  }
310
441
  const cellBoxes = new Map();
311
- for (const tb of textBoxes) {
442
+ for (const tb of splitBoxes) {
312
443
  const cx = (tb.bounds.left + tb.bounds.right) / 2;
313
444
  const cy = (tb.bounds.top + tb.bounds.bottom) / 2;
314
445
  if (cy < yMin || cy > yMax || cx < xMin || cx > xMax)
@@ -338,6 +469,8 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
338
469
  cellBoxes.set(cell, []);
339
470
  cellBoxes.get(cell)?.push(tb);
340
471
  consumedIds.push(tb.id);
472
+ if (tb.id.includes("-split"))
473
+ placedSplitIds.add(tb.id);
341
474
  }
342
475
  rows = expandSubRowsByYClusters(rows, cols, cells, cellBoxes);
343
476
  // Merge text boxes within each cell into cell text
@@ -369,6 +502,14 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
369
502
  topY: yLines[0],
370
503
  isBorderless: false,
371
504
  });
505
+ // Also consume the original (unsplit) text box IDs when any of their
506
+ // split pieces were placed in a cell.
507
+ for (const splitId of placedSplitIds) {
508
+ const origId = splitId.replace(/-split\d+$/, "");
509
+ if (!consumedIds.includes(origId)) {
510
+ consumedIds.push(origId);
511
+ }
512
+ }
372
513
  return { grid, consumedIds };
373
514
  }
374
515
  // ---------------------------------------------------------------------------
@@ -80,8 +80,28 @@ export class PdfConverter {
80
80
  });
81
81
  }
82
82
  }
83
- // Detect column layout
83
+ // Detect column layout.
84
+ // If the page has vertical segments (tables), suppress column detection
85
+ // when one detected column is very narrow — that's a table's first column,
86
+ // not a page layout column.
84
87
  const layout = detectColumns(page.textBoxes);
88
+ if (layout.columnCount > 1 &&
89
+ page.segments.some((s) => Math.abs(s.x1 - s.x2) <= 0.8)) {
90
+ const pageXMin = Math.min(...page.textBoxes.map((tb) => tb.bounds.left));
91
+ const pageXMax = Math.max(...page.textBoxes.map((tb) => tb.bounds.right));
92
+ const pageWidth = pageXMax - pageXMin;
93
+ const minColFraction = 0.3;
94
+ const tooNarrow = layout.columns.some((col) => {
95
+ const colXMin = Math.min(...col.map((tb) => tb.bounds.left));
96
+ const colXMax = Math.max(...col.map((tb) => tb.bounds.right));
97
+ return (colXMax - colXMin) / pageWidth < minColFraction;
98
+ });
99
+ if (tooNarrow) {
100
+ layout.columnCount = 1;
101
+ layout.columns = [page.textBoxes];
102
+ layout.boundaries = [];
103
+ }
104
+ }
85
105
  if (layout.columnCount === 1) {
86
106
  // Single column — process normally
87
107
  const md = processColumn(page.pageNumber, page.textBoxes, page.segments, imageBlocks);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "markit-ai",
3
- "version": "0.5.0",
3
+ "version": "0.5.1",
4
4
  "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",