markit-ai 0.1.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/converters/docx.js +3 -6
- package/dist/converters/epub.js +3 -6
- package/dist/converters/html.js +3 -6
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/rss.js +3 -3
- package/dist/converters/wikipedia.js +2 -5
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/markit.js +1 -1
- package/dist/types.d.ts +2 -0
- package/dist/utils/turndown.d.ts +8 -0
- package/dist/utils/turndown.js +64 -0
- package/package.json +4 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Running header/footer detection and removal.
|
|
3
|
+
*
|
|
4
|
+
* Many PDFs have repeated text at the top or bottom of every page:
|
|
5
|
+
* document titles, chapter names, page numbers, copyright notices.
|
|
6
|
+
* These pollute the markdown output as false headings or noise.
|
|
7
|
+
*
|
|
8
|
+
* Algorithm:
|
|
9
|
+
* 1. For each page, bucket text boxes by Y position (top/bottom zones)
|
|
10
|
+
* 2. Collect the text content at each zone across all pages
|
|
11
|
+
* 3. Text appearing on >20% of pages OR 8+ consecutive pages is a
|
|
12
|
+
* running header/footer
|
|
13
|
+
* 4. Remove matching text boxes before further processing
|
|
14
|
+
*/
|
|
15
|
+
/** Minimum number of pages to enable header/footer detection. */
|
|
16
|
+
const MIN_PAGES = 5;
|
|
17
|
+
/** Minimum Y position for top zone (from bottom of page in PDF coords). */
|
|
18
|
+
const TOP_ZONE_MIN_Y = 700;
|
|
19
|
+
/** Maximum Y position for bottom zone. */
|
|
20
|
+
const BOTTOM_ZONE_MAX_Y = 80;
|
|
21
|
+
/**
|
|
22
|
+
* Minimum consecutive pages a text must appear on to be considered a
|
|
23
|
+
* running header/footer. Catches both document-wide headers (appearing
|
|
24
|
+
* on every page) and chapter-specific headers (appearing on 4+ consecutive
|
|
25
|
+
* pages within a chapter).
|
|
26
|
+
*/
|
|
27
|
+
const MIN_CONSECUTIVE_PAGES = 8;
|
|
28
|
+
/**
|
|
29
|
+
* Detect and remove running headers and footers from all pages.
|
|
30
|
+
* Mutates the pages array in place, removing header/footer text boxes.
|
|
31
|
+
*
|
|
32
|
+
* Uses two strategies:
|
|
33
|
+
* 1. Global frequency: text appearing on > 20% of all pages
|
|
34
|
+
* 2. Consecutive runs: text appearing on 8+ consecutive pages
|
|
35
|
+
*/
|
|
36
|
+
export function stripHeadersFooters(pages) {
|
|
37
|
+
if (pages.length < MIN_PAGES)
|
|
38
|
+
return;
|
|
39
|
+
// Step 1: Build per-page zone text sets
|
|
40
|
+
const pageZoneTexts = [];
|
|
41
|
+
for (const page of pages) {
|
|
42
|
+
const zoneTexts = new Set();
|
|
43
|
+
for (const tb of page.textBoxes) {
|
|
44
|
+
const midY = (tb.bounds.top + tb.bounds.bottom) / 2;
|
|
45
|
+
if (midY >= TOP_ZONE_MIN_Y || midY <= BOTTOM_ZONE_MAX_Y) {
|
|
46
|
+
const key = tb.text.trim().replace(/\s+/g, " ");
|
|
47
|
+
if (key.length > 0)
|
|
48
|
+
zoneTexts.add(key);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
pageZoneTexts.push(zoneTexts);
|
|
52
|
+
}
|
|
53
|
+
// Step 2: Count global frequency AND longest consecutive run for each text
|
|
54
|
+
const globalCount = new Map();
|
|
55
|
+
const maxConsecutive = new Map();
|
|
56
|
+
// Collect all unique zone texts
|
|
57
|
+
const allTexts = new Set();
|
|
58
|
+
for (const zts of pageZoneTexts) {
|
|
59
|
+
for (const t of zts)
|
|
60
|
+
allTexts.add(t);
|
|
61
|
+
}
|
|
62
|
+
for (const text of allTexts) {
|
|
63
|
+
let total = 0;
|
|
64
|
+
let consecutive = 0;
|
|
65
|
+
let maxRun = 0;
|
|
66
|
+
for (const zts of pageZoneTexts) {
|
|
67
|
+
if (zts.has(text)) {
|
|
68
|
+
total++;
|
|
69
|
+
consecutive++;
|
|
70
|
+
if (consecutive > maxRun)
|
|
71
|
+
maxRun = consecutive;
|
|
72
|
+
}
|
|
73
|
+
else {
|
|
74
|
+
consecutive = 0;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
globalCount.set(text, total);
|
|
78
|
+
maxConsecutive.set(text, maxRun);
|
|
79
|
+
}
|
|
80
|
+
// Step 3: Identify running headers/footers
|
|
81
|
+
const globalThreshold = Math.max(3, Math.floor(pages.length * 0.2));
|
|
82
|
+
const repeatedTexts = new Set();
|
|
83
|
+
for (const text of allTexts) {
|
|
84
|
+
const gc = globalCount.get(text) ?? 0;
|
|
85
|
+
const mc = maxConsecutive.get(text) ?? 0;
|
|
86
|
+
// Global: appears on 20%+ of pages
|
|
87
|
+
if (gc >= globalThreshold) {
|
|
88
|
+
repeatedTexts.add(text);
|
|
89
|
+
continue;
|
|
90
|
+
}
|
|
91
|
+
// Consecutive: appears on 8+ consecutive pages (chapter-level headers)
|
|
92
|
+
if (mc >= MIN_CONSECUTIVE_PAGES) {
|
|
93
|
+
repeatedTexts.add(text);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
if (repeatedTexts.size === 0)
|
|
97
|
+
return;
|
|
98
|
+
// Step 4: Remove matching text boxes from each page
|
|
99
|
+
for (const page of pages) {
|
|
100
|
+
page.textBoxes = page.textBoxes.filter((tb) => {
|
|
101
|
+
const midY = (tb.bounds.top + tb.bounds.bottom) / 2;
|
|
102
|
+
if (midY < TOP_ZONE_MIN_Y && midY > BOTTOM_ZONE_MAX_Y)
|
|
103
|
+
return true;
|
|
104
|
+
const normalized = tb.text.trim().replace(/\s+/g, " ");
|
|
105
|
+
return !repeatedTexts.has(normalized);
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * PDF to Markdown converter.
 *
 * Uses mupdf (native WASM) for fast PDF parsing and a custom pipeline for
 * table detection via vector line extraction + raycasting.
 *
 * Pipeline:
 * 1. Extract text boxes + vector segments + image regions per page (mupdf)
 * 2. Detect column layout (single vs multi-column)
 * 3. Per column: detect table grids from segments (grid detection + raycasting)
 * 4. Render diagrams as PNG files (if output directory provided)
 * 5. Render tables as markdown tables, free text as paragraphs/headings
 */
import type { ConversionResult, Converter, StreamInfo } from "../../types.js";
/**
 * Converter implementation for PDF input (see pipeline overview above).
 */
export declare class PdfConverter implements Converter {
    /** Converter identifier; set to "pdf" by the implementation. */
    name: string;
    /**
     * True when the stream's extension is ".pdf" or its mimetype starts
     * with "application/pdf" / "application/x-pdf".
     */
    accepts(streamInfo: StreamInfo): boolean;
    /**
     * Parse the PDF buffer and resolve with the generated markdown.
     * When `streamInfo.imageDir` is set, page images are also rendered
     * to PNG files in that directory.
     */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF to Markdown converter.
|
|
3
|
+
*
|
|
4
|
+
* Uses mupdf (native WASM) for fast PDF parsing and a custom pipeline for
|
|
5
|
+
* table detection via vector line extraction + raycasting.
|
|
6
|
+
*
|
|
7
|
+
* Pipeline:
|
|
8
|
+
* 1. Extract text boxes + vector segments + image regions per page (mupdf)
|
|
9
|
+
* 2. Detect column layout (single vs multi-column)
|
|
10
|
+
* 3. Per column: detect table grids from segments (grid detection + raycasting)
|
|
11
|
+
* 4. Render diagrams as PNG files (if output directory provided)
|
|
12
|
+
* 5. Render tables as markdown tables, free text as paragraphs/headings
|
|
13
|
+
*/
|
|
14
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
15
|
+
import { join } from "node:path";
|
|
16
|
+
import { detectColumns } from "./columns.js";
|
|
17
|
+
import { extractPages, renderImageRegion } from "./extract.js";
|
|
18
|
+
import { resolveTableGrids } from "./grid.js";
|
|
19
|
+
import { stripHeadersFooters } from "./headers.js";
|
|
20
|
+
import { renderPageContent } from "./render.js";
|
|
21
|
+
/** File extensions this converter accepts. */
const EXTENSIONS = [".pdf"];
/** MIME type prefixes this converter accepts. */
const MIMETYPES = ["application/pdf", "application/x-pdf"];
/**
 * Process a set of text boxes (one column or full page): run table grid
 * detection, separate the boxes the tables consumed from free-flowing
 * text, and render everything to markdown.
 */
function processColumn(pageNumber, textBoxes, segments, imageBlocks) {
    const detection = resolveTableGrids(pageNumber, textBoxes, segments);
    const usedByTables = new Set(detection.consumedIds);
    // Boxes not absorbed into a table grid flow through as paragraphs/headings.
    const freeText = [];
    for (const box of textBoxes) {
        if (!usedByTables.has(box.id)) {
            freeText.push(box);
        }
    }
    return renderPageContent(freeText, detection.grids, imageBlocks, textBoxes);
}
|
|
33
|
+
export class PdfConverter {
    name = "pdf";
    /**
     * Accept streams whose extension is ".pdf" or whose mimetype starts
     * with "application/pdf" / "application/x-pdf".
     */
    accepts(streamInfo) {
        if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
            return true;
        }
        if (streamInfo.mimetype &&
            MIMETYPES.some((m) => streamInfo.mimetype?.startsWith(m))) {
            return true;
        }
        return false;
    }
    /**
     * Convert a PDF buffer to markdown.
     *
     * Extracts pages via mupdf, strips running headers/footers, renders
     * images to PNG files when `streamInfo.imageDir` is set (otherwise
     * emits placeholder HTML comments), processes single- and multi-column
     * layouts, and joins the per-page markdown with blank lines.
     */
    async convert(input, streamInfo) {
        const pdfBytes = new Uint8Array(input);
        const pages = await extractPages(pdfBytes);
        // Remove running headers/footers before processing
        stripHeadersFooters(pages);
        const imageDir = streamInfo.imageDir;
        if (imageDir) {
            mkdirSync(imageDir, { recursive: true });
        }
        const pageMarkdowns = [];
        for (const page of pages) {
            // Build image blocks for this page
            const imageBlocks = [];
            if (imageDir && page.images.length > 0) {
                for (const img of page.images) {
                    const filename = `${img.id}.png`;
                    const filepath = join(imageDir, filename);
                    try {
                        const png = renderImageRegion(pdfBytes, img);
                        writeFileSync(filepath, png);
                        imageBlocks.push({
                            topY: img.topY,
                            // FIX: this was an empty string, so rendered images
                            // were written to disk but never referenced in the
                            // markdown output. Emit a standard image link.
                            // NOTE(review): confirm whether the link should be
                            // relative to the markdown output rather than the
                            // full filepath.
                            markdown: `![${img.id}](${filepath})`,
                        });
                    }
                    catch {
                        // Image rendering failed — skip this image entirely
                    }
                }
            }
            else if (page.images.length > 0) {
                // No output directory: leave an HTML comment placeholder so the
                // image's presence and dimensions remain discoverable.
                for (const img of page.images) {
                    imageBlocks.push({
                        topY: img.topY,
                        markdown: `<!-- image: ${img.id} (page ${img.pageNumber}, ${img.bbox.w}x${img.bbox.h}pt) -->`,
                    });
                }
            }
            // Detect column layout
            const layout = detectColumns(page.textBoxes);
            if (layout.columnCount === 1) {
                // Single column — process normally
                const md = processColumn(page.pageNumber, page.textBoxes, page.segments, imageBlocks);
                if (md.length > 0)
                    pageMarkdowns.push(md);
            }
            else {
                // Multi-column — process each column independently, then join
                const columnMarkdowns = [];
                for (const colBoxes of layout.columns) {
                    // Filter segments to those within this column's X range
                    const colXMin = Math.min(...colBoxes.map((tb) => tb.bounds.left));
                    const colXMax = Math.max(...colBoxes.map((tb) => tb.bounds.right));
                    const margin = 10;
                    const colSegments = page.segments.filter((seg) => {
                        const segXMin = Math.min(seg.x1, seg.x2);
                        const segXMax = Math.max(seg.x1, seg.x2);
                        return segXMax >= colXMin - margin && segXMin <= colXMax + margin;
                    });
                    // Images go with the first column only (no X info to split by)
                    const md = processColumn(page.pageNumber, colBoxes, colSegments, columnMarkdowns.length === 0 ? imageBlocks : []);
                    if (md.length > 0)
                        columnMarkdowns.push(md);
                }
                const joined = columnMarkdowns.join("\n\n");
                if (joined.length > 0)
                    pageMarkdowns.push(joined);
            }
        }
        return { markdown: pageMarkdowns.join("\n\n") };
    }
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * Markdown rendering for PDF pages.
 *
 * Converts table grids and free text boxes into markdown, handling:
 * - Table grid → markdown table (`| col | col |`)
 * - Free text → paragraphs with heading detection (by font size)
 * - Content ordering (top-to-bottom via Y coordinate)
 * - Paragraph wrap merging (lines broken across PDF line boundaries)
 * - Page number removal
 *
 * Ported from @oharato/pdf2md-ts, stripped of CJK/TDnet-specific logic.
 */
import type { TableGrid, TextBox } from "./types.js";
/**
 * Render a TableGrid as a markdown table.
 *
 * @param table - Detected table grid to serialize.
 * @returns Markdown table text (`| col | col |` rows).
 */
export declare function renderTableToMarkdown(table: TableGrid): string;
/**
 * Render one page's content: free text and tables interleaved top-to-bottom.
 *
 * @param freeTextBoxes - Text boxes not consumed by any detected table.
 * @param tables - Table grids detected on this page/column.
 * @param imageBlocks - Optional pre-rendered image markdown snippets,
 *   each anchored at a Y position (`topY`) for interleaving.
 * @param allTextBoxes - Optional full box set including table-consumed
 *   boxes; presumably used for page-wide statistics such as heading
 *   font-size detection — confirm against render.js.
 * @returns Markdown for the page/column.
 */
export declare function renderPageContent(freeTextBoxes: TextBox[], tables: TableGrid[], imageBlocks?: Array<{
    topY: number;
    markdown: string;
}>, allTextBoxes?: TextBox[]): string;
|