markit-ai 0.1.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import mammoth from "mammoth";
2
- import TurndownService from "turndown";
2
+ import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
3
3
  const EXTENSIONS = [".docx"];
4
4
  const MIMETYPES = [
5
5
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -18,11 +18,8 @@ export class DocxConverter {
18
18
  }
19
19
  async convert(input, _streamInfo) {
20
20
  const { value: html } = await mammoth.convertToHtml({ buffer: input });
21
- const turndown = new TurndownService({
22
- headingStyle: "atx",
23
- codeBlockStyle: "fenced",
24
- });
25
- const markdown = turndown.turndown(html);
21
+ const turndown = createTurndown();
22
+ const markdown = turndown.turndown(normalizeTablesHtml(html));
26
23
  return { markdown: markdown.trim() };
27
24
  }
28
25
  }
@@ -1,6 +1,6 @@
1
1
  import { XMLParser } from "fast-xml-parser";
2
2
  import JSZip from "jszip";
3
- import TurndownService from "turndown";
3
+ import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
4
4
  const EXTENSIONS = [".epub"];
5
5
  const MIMETYPES = [
6
6
  "application/epub",
@@ -75,10 +75,7 @@ export class EpubConverter {
75
75
  const basePath = opfPath.includes("/")
76
76
  ? opfPath.substring(0, opfPath.lastIndexOf("/"))
77
77
  : "";
78
- const turndown = new TurndownService({
79
- headingStyle: "atx",
80
- codeBlockStyle: "fenced",
81
- });
78
+ const turndown = createTurndown();
82
79
  const sections = [];
83
80
  // Add metadata header
84
81
  const metaLines = [];
@@ -101,7 +98,7 @@ export class EpubConverter {
101
98
  const cleaned = html
102
99
  .replace(/<script[\s\S]*?<\/script>/gi, "")
103
100
  .replace(/<style[\s\S]*?<\/style>/gi, "");
104
- const md = turndown.turndown(cleaned).trim();
101
+ const md = turndown.turndown(normalizeTablesHtml(cleaned)).trim();
105
102
  if (md)
106
103
  sections.push(md);
107
104
  }
@@ -1,4 +1,4 @@
1
- import TurndownService from "turndown";
1
+ import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
2
2
  const EXTENSIONS = [".html", ".htm"];
3
3
  const MIMETYPES = ["text/html", "application/xhtml"];
4
4
  export class HtmlConverter {
@@ -16,15 +16,12 @@ export class HtmlConverter {
16
16
  async convert(input, streamInfo) {
17
17
  const charset = streamInfo.charset || "utf-8";
18
18
  const html = new TextDecoder(charset).decode(input);
19
- const turndown = new TurndownService({
20
- headingStyle: "atx",
21
- codeBlockStyle: "fenced",
22
- });
19
+ const turndown = createTurndown();
23
20
  // Remove script and style tags before converting
24
21
  const cleaned = html
25
22
  .replace(/<script[\s\S]*?<\/script>/gi, "")
26
23
  .replace(/<style[\s\S]*?<\/style>/gi, "");
27
- const markdown = turndown.turndown(cleaned);
24
+ const markdown = turndown.turndown(normalizeTablesHtml(cleaned));
28
25
  // Try to extract title
29
26
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
30
27
  const title = titleMatch ? titleMatch[1].trim() : undefined;
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Multi-column layout detection and text box reordering.
3
+ *
4
+ * Many PDFs (legal documents, datasheets, academic papers) use two-column
5
+ * layouts. Without column detection, text boxes are ordered by Y position
6
+ * only, interleaving left and right column content.
7
+ *
8
+ * Algorithm:
9
+ * 1. Collect left edges of all text boxes on the page
10
+ * 2. Find the largest horizontal gap between consecutive left edges
11
+ * 3. If gap > MIN_GAP_RATIO of the text width and both sides have
12
+ * enough boxes → multi-column detected
13
+ * 4. Assign each text box to a column based on its center X
14
+ * 5. Return columns left-to-right; boxes within each column keep their input order
15
+ *
16
+ * This only detects the column structure. The caller is responsible for
17
+ * processing each column's text boxes independently (table detection,
18
+ * rendering, etc.).
19
+ */
20
+ import type { TextBox } from "./types.js";
21
+ export interface ColumnLayout {
22
+ /** Number of columns detected (1 = single column, 2+ = multi-column). */
23
+ columnCount: number;
24
+ /** Text boxes grouped by column, in reading order (left to right). */
25
+ columns: TextBox[][];
26
+ /** X positions of column boundaries (between columns). */
27
+ boundaries: number[];
28
+ }
29
+ /**
30
+ * Detect column layout and return text boxes grouped by column.
31
+ *
32
+ * For single-column pages, returns all boxes in one group.
33
+ * For multi-column pages, returns boxes split by column in reading order.
34
+ */
35
+ export declare function detectColumns(textBoxes: TextBox[]): ColumnLayout;
@@ -0,0 +1,93 @@
1
+ /**
2
+ * Multi-column layout detection and text box reordering.
3
+ *
4
+ * Many PDFs (legal documents, datasheets, academic papers) use two-column
5
+ * layouts. Without column detection, text boxes are ordered by Y position
6
+ * only, interleaving left and right column content.
7
+ *
8
+ * Algorithm:
9
+ * 1. Collect left edges of all text boxes on the page
10
+ * 2. Find the largest horizontal gap between consecutive left edges
11
+ * 3. If gap > MIN_GAP_RATIO of the text width and both sides have
12
+ * enough boxes → multi-column detected
13
+ * 4. Assign each text box to a column based on its center X
14
+ * 5. Return columns left-to-right; boxes within each column keep their input order
15
+ *
16
+ * This only detects the column structure. The caller is responsible for
17
+ * processing each column's text boxes independently (table detection,
18
+ * rendering, etc.).
19
+ */
20
/**
 * Minimum gap as a fraction of the total text width to consider a column
 * boundary. A two-column layout typically has ~50% gap; we use a lower
 * threshold to catch asymmetric columns.
 */
const MIN_GAP_RATIO = 0.15;
/** Minimum number of text boxes on each side of the gap. */
const MIN_BOXES_PER_COLUMN = 4;
/** Minimum gap in absolute points to avoid splitting on small whitespace. */
const MIN_GAP_PTS = 40;
/**
 * Detect column layout and return text boxes grouped by column.
 *
 * The page is reported as two columns only when ALL of the following hold:
 *   - there are at least `2 * MIN_BOXES_PER_COLUMN` boxes,
 *   - the boxes have at least two distinct (rounded) left edges,
 *   - the largest gap between consecutive left edges is at least
 *     `MIN_GAP_RATIO` of the text width and at least `MIN_GAP_PTS`,
 *   - splitting at the midpoint of that gap (by each box's center X) leaves
 *     at least `MIN_BOXES_PER_COLUMN` boxes on each side.
 *
 * Otherwise a single-column layout is returned whose only group is the input
 * array itself (not a copy). This function never reports more than two
 * columns, and it does not sort: boxes keep their input order inside each
 * column.
 *
 * @param textBoxes Text boxes of one page; only `bounds.left` and
 *   `bounds.right` are read.
 * @returns `{ columnCount, columns, boundaries }` where `boundaries` is empty
 *   for a single column and `[splitX]` for two columns.
 */
export function detectColumns(textBoxes) {
  // Every fallback path returns the same shape; build it fresh per call so
  // no two results share the wrapper object or the `boundaries` array.
  const singleColumn = () => ({
    columnCount: 1,
    columns: [textBoxes],
    boundaries: [],
  });
  if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) {
    return singleColumn();
  }
  // Collect unique left edges (rounded to avoid float noise)
  const lefts = [
    ...new Set(textBoxes.map((tb) => Math.round(tb.bounds.left))),
  ].sort((a, b) => a - b);
  if (lefts.length < 2) {
    return singleColumn();
  }
  const textXMin = lefts[0];
  // Fold instead of `Math.max(...array)`: spreading a very large array into
  // call arguments can exceed the engine's argument limit and throw a
  // RangeError. Pairwise Math.max yields the same value (NaN included).
  const textXMax = textBoxes.reduce(
    (max, tb) => Math.max(max, Math.round(tb.bounds.right)),
    -Infinity,
  );
  const textWidth = textXMax - textXMin;
  if (textWidth <= 0) {
    return singleColumn();
  }
  // Find the largest gap between consecutive left-edge positions
  let maxGap = 0;
  let gapLeft = 0;
  let gapRight = 0;
  for (let i = 1; i < lefts.length; i++) {
    const gap = lefts[i] - lefts[i - 1];
    if (gap > maxGap) {
      maxGap = gap;
      gapLeft = lefts[i - 1];
      gapRight = lefts[i];
    }
  }
  // The gap must be large both relative to the text width and in absolute
  // points; failing either test means the page is treated as one column.
  const gapRatio = maxGap / textWidth;
  if (gapRatio < MIN_GAP_RATIO || maxGap < MIN_GAP_PTS) {
    return singleColumn();
  }
  // Split point is the midpoint of the gap
  const splitX = (gapLeft + gapRight) / 2;
  // Assign boxes to columns based on center X (input order is preserved)
  const leftCol = [];
  const rightCol = [];
  for (const tb of textBoxes) {
    const cx = (tb.bounds.left + tb.bounds.right) / 2;
    if (cx < splitX) {
      leftCol.push(tb);
    }
    else {
      rightCol.push(tb);
    }
  }
  // Validate both columns have enough content
  if (leftCol.length < MIN_BOXES_PER_COLUMN ||
    rightCol.length < MIN_BOXES_PER_COLUMN) {
    return singleColumn();
  }
  return {
    columnCount: 2,
    columns: [leftCol, rightCol],
    boundaries: [splitX],
  };
}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * PDF content extraction using mupdf.
3
+ *
4
+ * Extracts text boxes (with position, font size, bold) and vector line
5
+ * segments (table borders) from each page. Uses mupdf's native WASM
6
+ * engine for fast parsing, and reads raw content streams for vector graphics.
7
+ *
8
+ * Coordinate system: PDF native (origin = bottom-left, Y increases upward).
9
+ */
10
+ import type { ImageRegion, PageContent } from "./types.js";
11
+ /**
12
+ * Render an image region from a PDF page as a PNG buffer.
13
+ * Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
14
+ */
15
+ export declare function renderImageRegion(input: Uint8Array, region: ImageRegion): Uint8Array;
16
+ /**
17
+ * Extract text boxes and vector segments from all pages of a PDF buffer.
18
+ */
19
+ export declare function extractPages(input: Uint8Array): Promise<PageContent[]>;