npm - markit-ai - Versions diffs - 0.1.3 → 0.3.0 - Mend

markit-ai 0.1.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/dist/converters/docx.js +3 -6
package/dist/converters/epub.js +3 -6
package/dist/converters/html.js +3 -6
package/dist/converters/pdf/columns.d.ts +35 -0
package/dist/converters/pdf/columns.js +93 -0
package/dist/converters/pdf/extract.d.ts +19 -0
package/dist/converters/pdf/extract.js +513 -0
package/dist/converters/pdf/grid.d.ts +25 -0
package/dist/converters/pdf/grid.js +654 -0
package/dist/converters/pdf/headers.d.ts +24 -0
package/dist/converters/pdf/headers.js +108 -0
package/dist/converters/pdf/index.d.ts +19 -0
package/dist/converters/pdf/index.js +116 -0
package/dist/converters/pdf/render.d.ts +24 -0
package/dist/converters/pdf/render.js +513 -0
package/dist/converters/pdf/types.d.ts +75 -0
package/dist/converters/pdf/types.js +1 -0
package/dist/converters/rss.js +3 -3
package/dist/converters/wikipedia.js +2 -5
package/dist/index.d.ts +1 -1
package/dist/index.js +1 -1
package/dist/markit.js +1 -1
package/dist/types.d.ts +2 -0
package/dist/utils/turndown.d.ts +8 -0
package/dist/utils/turndown.js +64 -0
package/package.json +4 -3
package/dist/converters/pdf.d.ts +0 -6
package/dist/converters/pdf.js +0 -29

package/dist/converters/docx.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import mammoth from "mammoth";
-import TurndownService from "turndown";
+import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
 const EXTENSIONS = [".docx"];
 const MIMETYPES = [
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -18,11 +18,8 @@ export class DocxConverter {
     }
     async convert(input, _streamInfo) {
         const { value: html } = await mammoth.convertToHtml({ buffer: input });
-        const turndown = new TurndownService({
-            headingStyle: "atx",
-            codeBlockStyle: "fenced",
-        });
-        const markdown = turndown.turndown(html);
+        const turndown = createTurndown();
+        const markdown = turndown.turndown(normalizeTablesHtml(html));
         return { markdown: markdown.trim() };
     }
 }

package/dist/converters/epub.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import { XMLParser } from "fast-xml-parser";
 import JSZip from "jszip";
-import TurndownService from "turndown";
+import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
 const EXTENSIONS = [".epub"];
 const MIMETYPES = [
     "application/epub",
@@ -75,10 +75,7 @@ export class EpubConverter {
         const basePath = opfPath.includes("/")
             ? opfPath.substring(0, opfPath.lastIndexOf("/"))
             : "";
-        const turndown = new TurndownService({
-            headingStyle: "atx",
-            codeBlockStyle: "fenced",
-        });
+        const turndown = createTurndown();
         const sections = [];
         // Add metadata header
         const metaLines = [];
@@ -101,7 +98,7 @@ export class EpubConverter {
             const cleaned = html
                 .replace(/<script[\s\S]*?<\/script>/gi, "")
                 .replace(/<style[\s\S]*?<\/style>/gi, "");
-            const md = turndown.turndown(cleaned).trim();
+            const md = turndown.turndown(normalizeTablesHtml(cleaned)).trim();
             if (md)
                 sections.push(md);
         }

package/dist/converters/html.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import TurndownService from "turndown";
+import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
 const EXTENSIONS = [".html", ".htm"];
 const MIMETYPES = ["text/html", "application/xhtml"];
 export class HtmlConverter {
@@ -16,15 +16,12 @@ export class HtmlConverter {
     async convert(input, streamInfo) {
         const charset = streamInfo.charset || "utf-8";
         const html = new TextDecoder(charset).decode(input);
-        const turndown = new TurndownService({
-            headingStyle: "atx",
-            codeBlockStyle: "fenced",
-        });
+        const turndown = createTurndown();
         // Remove script and style tags before converting
         const cleaned = html
             .replace(/<script[\s\S]*?<\/script>/gi, "")
             .replace(/<style[\s\S]*?<\/style>/gi, "");
-        const markdown = turndown.turndown(cleaned);
+        const markdown = turndown.turndown(normalizeTablesHtml(cleaned));
         // Try to extract title
         const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
         const title = titleMatch ? titleMatch[1].trim() : undefined;

package/dist/converters/pdf/columns.d.ts ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * Multi-column layout detection and text box reordering.
+ *
+ * Many PDFs (legal documents, datasheets, academic papers) use two-column
+ * layouts. Without column detection, text boxes are ordered by Y position
+ * only, interleaving left and right column content.
+ *
+ * Algorithm:
+ *   1. Collect left edges of all text boxes on the page
+ *   2. Find the largest horizontal gap between consecutive left edges
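+ *   3. If the gap is at least MIN_GAP_RATIO of the text width and at least
+ *      MIN_GAP_PTS wide, split the page at the midpoint of the gap
+ *   4. Assign each text box to a column based on its center X; if either
+ *      side ends up with too few boxes, fall back to a single column
+ *   5. Return columns left-to-right; boxes within each column keep their
+ *      input order (no vertical sorting is done here)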
+ *
+ * This only detects the column structure. The caller is responsible for
+ * processing each column's text boxes independently (table detection,
+ * rendering, etc.).
+ */
+import type { TextBox } from "./types.js";
+export interface ColumnLayout { // Result of detectColumns(): a page's text boxes partitioned into columns.
+    /**
+     * Number of columns detected (1 = single column, 2+ = multi-column).
+     * The implementation in columns.js only ever reports 1 or 2.
+     */
+    columnCount: number;
+    /**
+     * Text boxes grouped by column, with columns ordered left to right.
+     * Holds one array per detected column, so its length equals `columnCount`.
+     */
+    columns: TextBox[][];
+    /**
+     * X positions of column boundaries (between columns).
+     * Empty for a single-column result; for a two-column result it holds the
+     * single split X used to separate the left and right columns.
+     */
+    boundaries: number[];
+}
+/**
+ * Detect column layout and return text boxes grouped by column.
+ *
+ * For single-column pages, returns all boxes in one group.
+ * For multi-column pages, returns boxes split by column, with the columns
+ * ordered left to right. Boxes inside a column keep the order they had in
+ * `textBoxes`; no vertical sorting is applied by this function.
+ *
+ * @param textBoxes - Text boxes of one page; only their horizontal bounds
+ *   (`bounds.left` / `bounds.right`) are read to decide the split.
+ * @returns The detected layout: column count, boxes per column, and the X
+ *   positions separating the columns (empty when there is a single column).
+ */
+export declare function detectColumns(textBoxes: TextBox[]): ColumnLayout;

package/dist/converters/pdf/columns.js ADDED Viewed

@@ -0,0 +1,93 @@
+/**
+ * Multi-column layout detection and text box reordering.
+ *
+ * Many PDFs (legal documents, datasheets, academic papers) use two-column
+ * layouts. Without column detection, text boxes are ordered by Y position
+ * only, interleaving left and right column content.
+ *
+ * Algorithm:
+ *   1. Collect left edges of all text boxes on the page
+ *   2. Find the largest horizontal gap between consecutive left edges
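+ *   3. If the gap is at least MIN_GAP_RATIO of the text width and at least
+ *      MIN_GAP_PTS wide, split the page at the midpoint of the gap
+ *   4. Assign each text box to a column based on its center X; if either
+ *      side ends up with too few boxes, fall back to a single column
+ *   5. Return columns left-to-right; boxes within each column keep their
+ *      input order (no vertical sorting is done here)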
+ *
+ * This only detects the column structure. The caller is responsible for
+ * processing each column's text boxes independently (table detection,
+ * rendering, etc.).
+ */
+/**
+ * Minimum gap as a fraction of the total text width to consider a column
+ * boundary. A two-column layout typically has ~50% gap; we use a lower
+ * threshold to catch asymmetric columns.
+ *
+ * In detectColumns the "gap" is the largest distance between two consecutive
+ * distinct (rounded) left edges, and the text width is the largest rounded
+ * right edge minus the smallest rounded left edge. A gap whose ratio is below
+ * this value yields a single-column result.
+ */
+const MIN_GAP_RATIO = 0.15;
+/**
+ * Minimum number of text boxes on each side of the gap.
+ * Pages with fewer than twice this many boxes are never split, and a split
+ * is discarded when either side ends up with fewer boxes than this.
+ */
+const MIN_BOXES_PER_COLUMN = 4;
+/**
+ * Minimum gap in absolute points to avoid splitting on small whitespace.
+ * Checked together with MIN_GAP_RATIO: the gap must satisfy both thresholds
+ * for the page to be split. (Units are those of the text box bounds.)
+ */
+const MIN_GAP_PTS = 40;
+/**
+ * Detect column layout and return text boxes grouped by column.
+ *
+ * For single-column pages, returns all boxes in one group.
+ * For multi-column pages, returns boxes split by column, with the columns
+ * ordered left to right. Boxes inside a column keep the order they had in
+ * `textBoxes`; this function does not sort them vertically.
+ *
+ * The page is split into at most two columns. The split is made at the
+ * midpoint of the widest gap between consecutive distinct left edges, and
+ * only when that gap passes both MIN_GAP_RATIO and MIN_GAP_PTS and each side
+ * receives at least MIN_BOXES_PER_COLUMN boxes. In every other case the
+ * single-column result is returned, whose only group is the `textBoxes`
+ * array itself (the same reference, not a copy).
+ *
+ * The input array is not modified.
+ *
+ * @param textBoxes - Text boxes of one page; each must expose numeric
+ *   `bounds.left` and `bounds.right`.
+ * @returns `{ columnCount, columns, boundaries }` where `columnCount` is 1 or
+ *   2, `columns` holds one array of boxes per column, and `boundaries` is
+ *   `[]` for one column or `[splitX]` for two.
+ */
+export function detectColumns(textBoxes) {
+    if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) { // too few boxes to fill two columns
+        return { columnCount: 1, columns: [textBoxes], boundaries: [] };
+    }
+    // Collect unique left edges (rounded to avoid float noise), sorted ascending
+    const lefts = [
+        ...new Set(textBoxes.map((tb) => Math.round(tb.bounds.left))),
+    ].sort((a, b) => a - b);
+    if (lefts.length < 2) { // every box starts at the same X: no gap to measure
+        return { columnCount: 1, columns: [textBoxes], boundaries: [] };
+    }
+    const textXMin = lefts[0]; // smallest rounded left edge
+    const textXMax = Math.max(...textBoxes.map((tb) => Math.round(tb.bounds.right))); // largest rounded right edge
+    const textWidth = textXMax - textXMin;
+    if (textWidth <= 0) { // degenerate extent; also guards the division below
+        return { columnCount: 1, columns: [textBoxes], boundaries: [] };
+    }
+    // Find the largest gap between consecutive left-edge positions (strict ">" keeps the leftmost gap on ties)
+    let maxGap = 0;
+    let gapLeft = 0;
+    let gapRight = 0;
+    for (let i = 1; i < lefts.length; i++) {
+        const gap = lefts[i] - lefts[i - 1];
+        if (gap > maxGap) {
+            maxGap = gap;
+            gapLeft = lefts[i - 1];
+            gapRight = lefts[i];
+        }
+    }
+    const gapRatio = maxGap / textWidth;
+    if (gapRatio < MIN_GAP_RATIO || maxGap < MIN_GAP_PTS) { // gap must pass both the relative and the absolute threshold
+        return { columnCount: 1, columns: [textBoxes], boundaries: [] };
+    }
+    // Split point is the midpoint of the gap (between the two left edges that bound it)
+    const splitX = (gapLeft + gapRight) / 2;
+    // Assign boxes to columns based on center X; a center exactly on splitX goes to the right column
+    const leftCol = [];
+    const rightCol = [];
+    for (const tb of textBoxes) {
+        const cx = (tb.bounds.left + tb.bounds.right) / 2;
+        if (cx < splitX) {
+            leftCol.push(tb);
+        }
+        else {
+            rightCol.push(tb);
+        }
+    }
+    // Validate both columns have enough content; otherwise discard the split
+    if (leftCol.length < MIN_BOXES_PER_COLUMN ||
+        rightCol.length < MIN_BOXES_PER_COLUMN) {
+        return { columnCount: 1, columns: [textBoxes], boundaries: [] };
+    }
+    return {
+        columnCount: 2,
+        columns: [leftCol, rightCol],
+        boundaries: [splitX],
+    };
+}

package/dist/converters/pdf/extract.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * PDF content extraction using mupdf.
+ *
+ * Extracts text boxes (with position, font size, bold) and vector line
+ * segments (table borders) from each page. Uses mupdf's native WASM
+ * engine for fast parsing, and reads raw content streams for vector graphics.
+ *
+ * Coordinate system: PDF native (origin = bottom-left, Y increases upward).
+ */
+import type { ImageRegion, PageContent } from "./types.js";
+/**
+ * Render an image region from a PDF page as a PNG buffer.
+ * Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
+ *
+ * NOTE(review): only this declaration is visible here; the DrawDevice and
+ * 2x-resolution details are taken from the package's own description and
+ * should be confirmed against extract.js.
+ *
+ * @param input - PDF data as bytes (presumably the whole document — confirm).
+ * @param region - The image region to render; see `ImageRegion` in ./types.js.
+ * @returns The rendered image as bytes (PNG, per the description above).
+ *   Returned synchronously, unlike `extractPages`, which returns a Promise.
+ */
+export declare function renderImageRegion(input: Uint8Array, region: ImageRegion): Uint8Array;
+/**
+ * Extract text boxes and vector segments from all pages of a PDF buffer.
+ *
+ * @param input - PDF data as bytes.
+ * @returns A Promise resolving to an array of `PageContent` (presumably one
+ *   entry per page, in page order — confirm against extract.js).
+ */
+export declare function extractPages(input: Uint8Array): Promise<PageContent[]>;