markit-ai 0.1.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/converters/docx.js +3 -6
- package/dist/converters/epub.js +3 -6
- package/dist/converters/html.js +3 -6
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/rss.js +3 -3
- package/dist/converters/wikipedia.js +2 -5
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/markit.js +1 -1
- package/dist/types.d.ts +2 -0
- package/dist/utils/turndown.d.ts +8 -0
- package/dist/utils/turndown.js +64 -0
- package/package.json +4 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
package/dist/converters/docx.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import mammoth from "mammoth";
|
|
2
|
-
import TurndownService from "turndown";
|
|
2
|
+
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
3
3
|
const EXTENSIONS = [".docx"];
|
|
4
4
|
const MIMETYPES = [
|
|
5
5
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
@@ -18,11 +18,8 @@ export class DocxConverter {
|
|
|
18
18
|
}
|
|
19
19
|
async convert(input, _streamInfo) {
|
|
20
20
|
const { value: html } = await mammoth.convertToHtml({ buffer: input });
|
|
21
|
-
const turndown = new TurndownService({
|
|
22
|
-
|
|
23
|
-
codeBlockStyle: "fenced",
|
|
24
|
-
});
|
|
25
|
-
const markdown = turndown.turndown(html);
|
|
21
|
+
const turndown = createTurndown();
|
|
22
|
+
const markdown = turndown.turndown(normalizeTablesHtml(html));
|
|
26
23
|
return { markdown: markdown.trim() };
|
|
27
24
|
}
|
|
28
25
|
}
|
package/dist/converters/epub.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { XMLParser } from "fast-xml-parser";
|
|
2
2
|
import JSZip from "jszip";
|
|
3
|
-
import TurndownService from "turndown";
|
|
3
|
+
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
4
4
|
const EXTENSIONS = [".epub"];
|
|
5
5
|
const MIMETYPES = [
|
|
6
6
|
"application/epub",
|
|
@@ -75,10 +75,7 @@ export class EpubConverter {
|
|
|
75
75
|
const basePath = opfPath.includes("/")
|
|
76
76
|
? opfPath.substring(0, opfPath.lastIndexOf("/"))
|
|
77
77
|
: "";
|
|
78
|
-
const turndown = new TurndownService({
|
|
79
|
-
headingStyle: "atx",
|
|
80
|
-
codeBlockStyle: "fenced",
|
|
81
|
-
});
|
|
78
|
+
const turndown = createTurndown();
|
|
82
79
|
const sections = [];
|
|
83
80
|
// Add metadata header
|
|
84
81
|
const metaLines = [];
|
|
@@ -101,7 +98,7 @@ export class EpubConverter {
|
|
|
101
98
|
const cleaned = html
|
|
102
99
|
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
103
100
|
.replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
104
|
-
const md = turndown.turndown(cleaned).trim();
|
|
101
|
+
const md = turndown.turndown(normalizeTablesHtml(cleaned)).trim();
|
|
105
102
|
if (md)
|
|
106
103
|
sections.push(md);
|
|
107
104
|
}
|
package/dist/converters/html.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import TurndownService from "turndown";
|
|
1
|
+
import { createTurndown, normalizeTablesHtml } from "../utils/turndown.js";
|
|
2
2
|
const EXTENSIONS = [".html", ".htm"];
|
|
3
3
|
const MIMETYPES = ["text/html", "application/xhtml"];
|
|
4
4
|
export class HtmlConverter {
|
|
@@ -16,15 +16,12 @@ export class HtmlConverter {
|
|
|
16
16
|
async convert(input, streamInfo) {
|
|
17
17
|
const charset = streamInfo.charset || "utf-8";
|
|
18
18
|
const html = new TextDecoder(charset).decode(input);
|
|
19
|
-
const turndown = new TurndownService({
|
|
20
|
-
headingStyle: "atx",
|
|
21
|
-
codeBlockStyle: "fenced",
|
|
22
|
-
});
|
|
19
|
+
const turndown = createTurndown();
|
|
23
20
|
// Remove script and style tags before converting
|
|
24
21
|
const cleaned = html
|
|
25
22
|
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
26
23
|
.replace(/<style[\s\S]*?<\/style>/gi, "");
|
|
27
|
-
const markdown = turndown.turndown(cleaned);
|
|
24
|
+
const markdown = turndown.turndown(normalizeTablesHtml(cleaned));
|
|
28
25
|
// Try to extract title
|
|
29
26
|
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
|
30
27
|
const title = titleMatch ? titleMatch[1].trim() : undefined;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-column layout detection and text box reordering.
|
|
3
|
+
*
|
|
4
|
+
* Many PDFs (legal documents, datasheets, academic papers) use two-column
|
|
5
|
+
* layouts. Without column detection, text boxes are ordered by Y position
|
|
6
|
+
* only, interleaving left and right column content.
|
|
7
|
+
*
|
|
8
|
+
* Algorithm:
|
|
9
|
+
* 1. Collect left edges of all text boxes on the page
|
|
10
|
+
* 2. Find the largest horizontal gap between consecutive left edges
|
|
11
|
+
* 3. If gap >= MIN_GAP_RATIO of the text width (and at least MIN_GAP_PTS) and both sides have
|
|
12
|
+
* enough boxes → multi-column detected
|
|
13
|
+
* 4. Assign each text box to a column based on its center X
|
|
14
|
+
* 5. Return columns left-to-right; boxes inside each column keep their input order
|
|
15
|
+
*
|
|
16
|
+
* This only detects the column structure. The caller is responsible for
|
|
17
|
+
* processing each column's text boxes independently (table detection,
|
|
18
|
+
* rendering, etc.).
|
|
19
|
+
*/
|
|
20
|
+
import type { TextBox } from "./types.js";
|
|
21
|
+
export interface ColumnLayout {
|
|
22
|
+
/** Number of columns detected (1 = single column, 2+ = multi-column); detectColumns currently reports 1 or 2. */
|
|
23
|
+
columnCount: number;
|
|
24
|
+
/** Text boxes grouped by column, in reading order (left to right). */
|
|
25
|
+
columns: TextBox[][];
|
|
26
|
+
/** X positions of column boundaries (between columns). */
|
|
27
|
+
boundaries: number[];
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Detect column layout and return text boxes grouped by column.
|
|
31
|
+
*
|
|
32
|
+
* For single-column pages, returns all boxes in one group.
|
|
33
|
+
* For multi-column pages, returns boxes split by column in reading order.
|
|
34
|
+
*/
|
|
35
|
+
export declare function detectColumns(textBoxes: TextBox[]): ColumnLayout;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-column layout detection and text box reordering.
|
|
3
|
+
*
|
|
4
|
+
* Many PDFs (legal documents, datasheets, academic papers) use two-column
|
|
5
|
+
* layouts. Without column detection, text boxes are ordered by Y position
|
|
6
|
+
* only, interleaving left and right column content.
|
|
7
|
+
*
|
|
8
|
+
* Algorithm:
|
|
9
|
+
* 1. Collect left edges of all text boxes on the page
|
|
10
|
+
* 2. Find the largest horizontal gap between consecutive left edges
|
|
11
|
+
* 3. If gap >= MIN_GAP_RATIO of the text width (and at least MIN_GAP_PTS) and both sides have
|
|
12
|
+
* enough boxes → multi-column detected
|
|
13
|
+
* 4. Assign each text box to a column based on its center X
|
|
14
|
+
* 5. Return columns left-to-right; boxes inside each column keep their input order
|
|
15
|
+
*
|
|
16
|
+
* This only detects the column structure. The caller is responsible for
|
|
17
|
+
* processing each column's text boxes independently (table detection,
|
|
18
|
+
* rendering, etc.).
|
|
19
|
+
*/
|
|
20
|
+
/**
|
|
21
|
+
* Minimum gap as a fraction of the total text width to consider a column
|
|
22
|
+
* boundary. A two-column layout typically has ~50% gap; we use a lower
|
|
23
|
+
* threshold to catch asymmetric columns.
|
|
24
|
+
*/
|
|
25
|
+
const MIN_GAP_RATIO = 0.15;
|
|
26
|
+
/** Minimum number of text boxes on each side of the gap. */
|
|
27
|
+
const MIN_BOXES_PER_COLUMN = 4;
|
|
28
|
+
/** Minimum gap in absolute points to avoid splitting on small whitespace. */
|
|
29
|
+
const MIN_GAP_PTS = 40;
|
|
30
|
+
/**
|
|
31
|
+
* Detect column layout and return text boxes grouped by column.
|
|
32
|
+
*
|
|
33
|
+
* For single-column pages, returns all boxes in one group.
|
|
34
|
+
* For multi-column pages, returns boxes split by column in reading order.
|
|
35
|
+
*/
|
|
36
|
+
export function detectColumns(textBoxes) {
|
|
37
|
+
if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) {
|
|
38
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
39
|
+
}
|
|
40
|
+
// Collect unique left edges (rounded to avoid float noise)
|
|
41
|
+
const lefts = [
|
|
42
|
+
...new Set(textBoxes.map((tb) => Math.round(tb.bounds.left))),
|
|
43
|
+
].sort((a, b) => a - b);
|
|
44
|
+
if (lefts.length < 2) {
|
|
45
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
46
|
+
}
|
|
47
|
+
const textXMin = lefts[0];
|
|
48
|
+
const textXMax = Math.max(...textBoxes.map((tb) => Math.round(tb.bounds.right)));
|
|
49
|
+
const textWidth = textXMax - textXMin;
|
|
50
|
+
if (textWidth <= 0) {
|
|
51
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
52
|
+
}
|
|
53
|
+
// Find the largest gap between consecutive left-edge positions
|
|
54
|
+
let maxGap = 0;
|
|
55
|
+
let gapLeft = 0;
|
|
56
|
+
let gapRight = 0;
|
|
57
|
+
for (let i = 1; i < lefts.length; i++) {
|
|
58
|
+
const gap = lefts[i] - lefts[i - 1];
|
|
59
|
+
if (gap > maxGap) {
|
|
60
|
+
maxGap = gap;
|
|
61
|
+
gapLeft = lefts[i - 1];
|
|
62
|
+
gapRight = lefts[i];
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
const gapRatio = maxGap / textWidth;
|
|
66
|
+
if (gapRatio < MIN_GAP_RATIO || maxGap < MIN_GAP_PTS) {
|
|
67
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
68
|
+
}
|
|
69
|
+
// Split point is the midpoint of the gap
|
|
70
|
+
const splitX = (gapLeft + gapRight) / 2;
|
|
71
|
+
// Assign boxes to columns based on center X
|
|
72
|
+
const leftCol = [];
|
|
73
|
+
const rightCol = [];
|
|
74
|
+
for (const tb of textBoxes) {
|
|
75
|
+
const cx = (tb.bounds.left + tb.bounds.right) / 2;
|
|
76
|
+
if (cx < splitX) {
|
|
77
|
+
leftCol.push(tb);
|
|
78
|
+
}
|
|
79
|
+
else {
|
|
80
|
+
rightCol.push(tb);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
// Validate both columns have enough content
|
|
84
|
+
if (leftCol.length < MIN_BOXES_PER_COLUMN ||
|
|
85
|
+
rightCol.length < MIN_BOXES_PER_COLUMN) {
|
|
86
|
+
return { columnCount: 1, columns: [textBoxes], boundaries: [] };
|
|
87
|
+
}
|
|
88
|
+
return {
|
|
89
|
+
columnCount: 2,
|
|
90
|
+
columns: [leftCol, rightCol],
|
|
91
|
+
boundaries: [splitX],
|
|
92
|
+
};
|
|
93
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF content extraction using mupdf.
|
|
3
|
+
*
|
|
4
|
+
* Extracts text boxes (with position, font size, bold) and vector line
|
|
5
|
+
* segments (table borders) from each page. Uses mupdf's native WASM
|
|
6
|
+
* engine for fast parsing, and reads raw content streams for vector graphics.
|
|
7
|
+
*
|
|
8
|
+
* Coordinate system: PDF native (origin = bottom-left, Y increases upward).
|
|
9
|
+
*/
|
|
10
|
+
import type { ImageRegion, PageContent } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Render an image region from a PDF page as a PNG buffer.
|
|
13
|
+
* Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
|
|
14
|
+
*/
|
|
15
|
+
export declare function renderImageRegion(input: Uint8Array, region: ImageRegion): Uint8Array;
|
|
16
|
+
/**
|
|
17
|
+
* Extract text boxes and vector segments from all pages of a PDF buffer.
|
|
18
|
+
*/
|
|
19
|
+
export declare function extractPages(input: Uint8Array): Promise<PageContent[]>;
|