npm - markit-ai - Versions diffs - 0.5.0 → 0.5.2 - Mend

markit-ai 0.5.0 → 0.5.2

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/converters/pdf/grid.js +144 -3
package/dist/converters/pdf/index.js +21 -1
package/dist/markit.d.ts +15 -0
package/dist/markit.js +90 -7
package/package.json +1 -1

package/dist/converters/pdf/grid.js CHANGED Viewed

@@ -258,6 +258,122 @@ function expandSubRowsByYClusters(originalRows, cols, cells, cellBoxes) {
     return originalRows + addedRows;
 }
 // ---------------------------------------------------------------------------
+// Cross-column text box splitting
+// ---------------------------------------------------------------------------
+/**
+ * Find which column a horizontal position falls into.
+ * Returns -1 if outside the grid.
+ */
+function findCol(x, xLines) {
+    for (let i = 0; i < xLines.length - 1; i++) {
+        if (x >= xLines[i] && x <= xLines[i + 1])
+            return i;
+    }
+    return -1;
+}
+/**
+ * When a text box spans across one or more vertical column boundaries,
+ * split it into multiple virtual text boxes — one per column — with the
+ * text divided proportionally by width.
+ *
+ * We split at word boundaries closest to the proportional split point
+ * so we don't chop words in half.
+ */
+function splitCrossColumnBoxes(textBoxes, xLines) {
+    const result = [];
+    const MARGIN = 5; // allow small overlap before considering it cross-column
+    for (const tb of textBoxes) {
+        const leftCol = findCol(tb.bounds.left + MARGIN, xLines);
+        const rightCol = findCol(tb.bounds.right - MARGIN, xLines);
+        // Not spanning columns, or outside grid — keep as-is
+        if (leftCol < 0 || rightCol < 0 || leftCol === rightCol) {
+            result.push(tb);
+            continue;
+        }
+        // Text box spans from leftCol to rightCol — split it
+        const totalWidth = tb.bounds.right - tb.bounds.left;
+        if (totalWidth <= 0) {
+            result.push(tb);
+            continue;
+        }
+        const words = tb.text.split(/\s+/);
+        if (words.length <= 1) {
+            // Single word spanning columns — just assign to whichever col has more overlap
+            result.push(tb);
+            continue;
+        }
+        // For each column boundary crossing, find the best word-boundary split
+        let remainingWords = [...words];
+        let currentLeft = tb.bounds.left;
+        for (let col = leftCol; col <= rightCol && remainingWords.length > 0; col++) {
+            const colRight = col < xLines.length - 1 ? xLines[col + 1] : tb.bounds.right;
+            const segmentRight = Math.min(colRight, tb.bounds.right);
+            if (col === rightCol) {
+                // Last column — take all remaining words
+                result.push({
+                    ...tb,
+                    id: `${tb.id}-split${col}`,
+                    text: remainingWords.join(" "),
+                    bounds: {
+                        ...tb.bounds,
+                        left: currentLeft,
+                        right: tb.bounds.right,
+                    },
+                });
+                remainingWords = [];
+            }
+            else {
+                // Find how many words fit in this column segment proportionally
+                const segmentWidth = segmentRight - currentLeft;
+                const fractionOfTotal = segmentWidth / totalWidth;
+                const approxChars = Math.round(fractionOfTotal * tb.text.length);
+                // Walk words to find the split closest to the proportional point
+                let charCount = 0;
+                let splitIdx = 0;
+                for (let w = 0; w < remainingWords.length; w++) {
+                    const nextCount = charCount + remainingWords[w].length + (w > 0 ? 1 : 0);
+                    if (nextCount > approxChars && splitIdx > 0)
+                        break;
+                    charCount = nextCount;
+                    splitIdx = w + 1;
+                }
+                if (splitIdx === 0)
+                    splitIdx = 1; // take at least one word
+                if (splitIdx >= remainingWords.length) {
+                    // All remaining words fit here
+                    result.push({
+                        ...tb,
+                        id: `${tb.id}-split${col}`,
+                        text: remainingWords.join(" "),
+                        bounds: {
+                            ...tb.bounds,
+                            left: currentLeft,
+                            right: segmentRight,
+                        },
+                    });
+                    remainingWords = [];
+                }
+                else {
+                    const partWords = remainingWords.slice(0, splitIdx);
+                    result.push({
+                        ...tb,
+                        id: `${tb.id}-split${col}`,
+                        text: partWords.join(" "),
+                        bounds: {
+                            ...tb.bounds,
+                            left: currentLeft,
+                            right: segmentRight,
+                        },
+                    });
+                    remainingWords = remainingWords.slice(splitIdx);
+                    currentLeft = segmentRight;
+                }
+            }
+        }
+    }
+    return result;
+}
+// ---------------------------------------------------------------------------
 // Full grid table (H + V lines)
 // ---------------------------------------------------------------------------
 function buildCells(rows, cols) {
@@ -278,11 +394,26 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
     const yMax = yLines[0];
     const xMin = xLines[0];
     const xMax = xLines[xLines.length - 1];
-    // Look for header text boxes just above the grid
+    // Split text boxes that span multiple columns before placement
+    const splitBoxes = splitCrossColumnBoxes(textBoxes, xLines);
+    // Track which split piece IDs get placed in cells, so we can consume
+    // the original (unsplit) text box IDs too.
+    const placedSplitIds = new Set();
+    // Look for header text boxes just above the grid.
+    // Use the ORIGINAL (unsplit) text boxes for header detection so that
+    // wide paragraph text isn't falsely split into column-sized header chunks.
+    // Reject boxes wider than 1.5 columns — those are paragraph text, not headers.
+    const avgColWidth = (xMax - xMin) / cols;
+    const maxHeaderBoxWidth = avgColWidth * 1.5;
     const headerBoxes = textBoxes.filter((tb) => {
         const cy = (tb.bounds.top + tb.bounds.bottom) / 2;
         const cx = (tb.bounds.left + tb.bounds.right) / 2;
-        return cy > yMax && cy <= yMax + 20 && cx >= xMin && cx <= xMax;
+        const boxWidth = tb.bounds.right - tb.bounds.left;
+        return (cy > yMax &&
+            cy <= yMax + 20 &&
+            cx >= xMin &&
+            cx <= xMax &&
+            boxWidth <= maxHeaderBoxWidth);
     });
     if (headerBoxes.length > 0) {
         rows += 1;
@@ -308,7 +439,7 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
         }
     }
     const cellBoxes = new Map();
-    for (const tb of textBoxes) {
+    for (const tb of splitBoxes) {
         const cx = (tb.bounds.left + tb.bounds.right) / 2;
         const cy = (tb.bounds.top + tb.bounds.bottom) / 2;
         if (cy < yMin || cy > yMax || cx < xMin || cx > xMax)
@@ -338,6 +469,8 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
             cellBoxes.set(cell, []);
         cellBoxes.get(cell)?.push(tb);
         consumedIds.push(tb.id);
+        if (tb.id.includes("-split"))
+            placedSplitIds.add(tb.id);
     }
     rows = expandSubRowsByYClusters(rows, cols, cells, cellBoxes);
     // Merge text boxes within each cell into cell text
@@ -369,6 +502,14 @@ function buildTableGrid(pageNumber, yLines, xLines, filteredSegments, textBoxes)
         topY: yLines[0],
         isBorderless: false,
     });
+    // Also consume the original (unsplit) text box IDs when any of their
+    // split pieces were placed in a cell.
+    for (const splitId of placedSplitIds) {
+        const origId = splitId.replace(/-split\d+$/, "");
+        if (!consumedIds.includes(origId)) {
+            consumedIds.push(origId);
+        }
+    }
     return { grid, consumedIds };
 }
 // ---------------------------------------------------------------------------

package/dist/converters/pdf/index.js CHANGED Viewed

@@ -80,8 +80,28 @@ export class PdfConverter {
                     });
                 }
             }
-            // Detect column layout
+            // Detect column layout.
+            // If the page has vertical segments (tables), suppress column detection
+            // when one detected column is very narrow — that's a table's first column,
+            // not a page layout column.
             const layout = detectColumns(page.textBoxes);
+            if (layout.columnCount > 1 &&
+                page.segments.some((s) => Math.abs(s.x1 - s.x2) <= 0.8)) {
+                const pageXMin = Math.min(...page.textBoxes.map((tb) => tb.bounds.left));
+                const pageXMax = Math.max(...page.textBoxes.map((tb) => tb.bounds.right));
+                const pageWidth = pageXMax - pageXMin;
+                const minColFraction = 0.3;
+                const tooNarrow = layout.columns.some((col) => {
+                    const colXMin = Math.min(...col.map((tb) => tb.bounds.left));
+                    const colXMax = Math.max(...col.map((tb) => tb.bounds.right));
+                    return (colXMax - colXMin) / pageWidth < minColFraction;
+                });
+                if (tooNarrow) {
+                    layout.columnCount = 1;
+                    layout.columns = [page.textBoxes];
+                    layout.boundaries = [];
+                }
+            }
             if (layout.columnCount === 1) {
                 // Single column — process normally
                 const md = processColumn(page.pageNumber, page.textBoxes, page.segments, imageBlocks);

package/dist/markit.d.ts CHANGED Viewed

@@ -12,8 +12,23 @@ export declare class Markit {
      * Convert a URL to markdown.
      */
     convertUrl(url: string): Promise<ConversionResult>;
+    /**
+     * Inspect an HTML response for a discoverable markdown source URL.
+     * If found, fetch and convert the raw markdown instead.
+     */
+    private tryMarkdownSource;
     /**
      * Convert a buffer with stream info to markdown.
      */
     convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
 }
+/**
+ * Try to discover a raw markdown source URL from an HTML response.
+ * Checks multiple patterns:
+ *   1. <link rel="alternate" type="text/markdown" href="..."> tag
+ *   2. VitePress markers → append .md to the URL
+ *   3. llms.txt convention → try url.md or url.html.md
+ *
+ * @internal Exported for testing.
+ */
+export declare function discoverMarkdownSource(html: string, url: string, ext: string): string | null;

package/dist/markit.js CHANGED Viewed

@@ -19,6 +19,7 @@ import { XlsxConverter } from "./converters/xlsx.js";
 import { XmlConverter } from "./converters/xml.js";
 import { YamlConverter } from "./converters/yaml.js";
 import { ZipConverter } from "./converters/zip.js";
+const USER_AGENT = "markit/0.1.0";
 export class Markit {
     converters = [];
     options;
@@ -89,25 +90,72 @@ export class Markit {
         const response = await fetch(url, {
             headers: {
                 Accept: "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1",
-                "User-Agent": "mill/0.1.0",
+                "User-Agent": USER_AGENT,
             },
         });
         if (!response.ok) {
             throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
         }
         const contentType = response.headers.get("content-type") || "";
-        const [mimetype] = contentType.split(";");
-        // Derive extension from URL path
+        const mimetype = contentType.split(";")[0].trim();
         const urlPath = new URL(url).pathname;
         const ext = extname(urlPath).toLowerCase();
+        // Content negotiation worked — server returned markdown directly
+        if (mimetype === "text/markdown") {
+            const buffer = Buffer.from(await response.arrayBuffer());
+            return this.convert(buffer, {
+                url,
+                mimetype: "text/markdown",
+                extension: ".md",
+                filename: basename(urlPath) || undefined,
+            });
+        }
         const buffer = Buffer.from(await response.arrayBuffer());
-        const fetchedInfo = {
+        // For HTML responses, try to discover a raw markdown source.
+        // Patterns: <link rel="alternate">, VitePress .md files, llms.txt convention.
+        if (mimetype === "text/html") {
+            const result = await this.tryMarkdownSource(buffer, url, ext);
+            if (result)
+                return result;
+        }
+        return this.convert(buffer, {
             url,
-            mimetype: mimetype.trim(),
+            mimetype,
             extension: ext || undefined,
             filename: basename(urlPath) || undefined,
-        };
-        return this.convert(buffer, fetchedInfo);
+        });
+    }
+    /**
+     * Inspect an HTML response for a discoverable markdown source URL.
+     * If found, fetch and convert the raw markdown instead.
+     */
+    async tryMarkdownSource(htmlBuffer, url, ext) {
+        const html = htmlBuffer.toString("utf-8", 0, Math.min(htmlBuffer.length, 50_000));
+        const mdSourceUrl = discoverMarkdownSource(html, url, ext);
+        if (!mdSourceUrl)
+            return null;
+        try {
+            const response = await fetch(mdSourceUrl, {
+                headers: { "User-Agent": USER_AGENT },
+            });
+            if (!response.ok)
+                return null;
+            const ct = (response.headers.get("content-type") || "")
+                .split(";")[0]
+                .trim();
+            if (!ct.includes("markdown") && !ct.includes("text/plain"))
+                return null;
+            const mdBuffer = Buffer.from(await response.arrayBuffer());
+            return this.convert(mdBuffer, {
+                url: mdSourceUrl,
+                mimetype: "text/markdown",
+                extension: ".md",
+                filename: basename(new URL(mdSourceUrl).pathname),
+            });
+        }
+        catch {
+            return null;
+        }
     }
     /**
      * Convert a buffer with stream info to markdown.
@@ -136,3 +184,38 @@ export class Markit {
         throw new Error(`Unsupported format: ${streamInfo.extension || streamInfo.mimetype || "unknown"}`);
     }
 }
+/**
+ * Try to discover a raw markdown source URL from an HTML response.
+ * Checks multiple patterns:
+ *   1. <link rel="alternate" type="text/markdown" href="..."> tag
+ *   2. VitePress markers → append .md to the URL
+ *   3. llms.txt convention → try url.md or url.html.md
+ *
+ * @internal Exported for testing.
+ */
+export function discoverMarkdownSource(html, url, ext) {
+    // 1. Look for <link rel="alternate" type="text/markdown" href="...">
+    const linkMatch = html.match(/<link[^>]+rel=["']alternate["'][^>]+type=["']text\/markdown["'][^>]+href=["']([^"']+)["']/i) ??
+        html.match(/<link[^>]+type=["']text\/markdown["'][^>]+rel=["']alternate["'][^>]+href=["']([^"']+)["']/i);
+    if (linkMatch?.[1]) {
+        try {
+            return new URL(linkMatch[1], url).href;
+        }
+        catch {
+            /* ignore malformed URLs */
+        }
+    }
+    // 2. VitePress detection — serves .md alongside HTML
+    const isVitePress = html.includes("__VP_HASH_MAP__") ||
+        html.includes("VPContent") ||
+        html.includes("vitepress");
+    // 3. llms.txt convention: try url.md for extensionless URLs
+    const hasLlmsTxt = html.includes("llms.txt");
+    if (!ext && (isVitePress || hasLlmsTxt)) {
+        return appendMdExtension(url);
+    }
+    return null;
+}
+function appendMdExtension(url) {
+    return url.endsWith("/") ? `${url.slice(0, -1)}.md` : `${url}.md`;
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "markit-ai",
-  "version": "0.5.0",
+  "version": "0.5.2",
   "description": "Convert anything to markdown. PDF, DOCX, PPTX, XLSX, HTML, EPUB, Jupyter, RSS, images, audio, URLs, and more. Pluggable converters, built-in LLM providers for image description and audio transcription. Works as a CLI and as a library.",
   "type": "module",
   "main": "dist/index.js",