markit-ai 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Running header/footer detection and removal.
3
+ *
4
+ * Many PDFs have repeated text at the top or bottom of every page:
5
+ * document titles, chapter names, page numbers, copyright notices.
6
+ * These pollute the markdown output as false headings or noise.
7
+ *
8
+ * Algorithm:
9
+ * 1. For each page, bucket text boxes by Y position (top/bottom zones)
10
+ * 2. Collect the text content at each zone across all pages
11
+ * 3. Text appearing on at least max(3, floor(20% of pages)) pages OR on 8+ consecutive pages is a
12
+ * running header/footer
13
+ * 4. Remove matching text boxes before further processing
14
+ */
15
+ /** Minimum number of pages to enable header/footer detection. */
16
+ const MIN_PAGES = 5;
17
+ /** Minimum Y position for top zone (from bottom of page in PDF coords). */
18
+ const TOP_ZONE_MIN_Y = 700;
19
+ /** Maximum Y position for bottom zone. */
20
+ const BOTTOM_ZONE_MAX_Y = 80;
21
+ /**
22
+ * Minimum consecutive pages a text must appear on to be considered a
23
+ * running header/footer. Catches both document-wide headers (appearing
24
+ * on every page) and chapter-specific headers (appearing on 4+ consecutive
25
+ * pages within a chapter).
26
+ */
27
+ const MIN_CONSECUTIVE_PAGES = 8;
28
+ /**
29
+ * Detect and remove running headers and footers from all pages.
30
+ * Mutates the pages array in place, removing header/footer text boxes.
31
+ *
32
+ * Uses two strategies:
33
+ * 1. Global frequency: text appearing on > 20% of all pages
34
+ * 2. Consecutive runs: text appearing on 8+ consecutive pages
35
+ */
36
+ export function stripHeadersFooters(pages) {
37
+ if (pages.length < MIN_PAGES)
38
+ return;
39
+ // Step 1: Build per-page zone text sets
40
+ const pageZoneTexts = [];
41
+ for (const page of pages) {
42
+ const zoneTexts = new Set();
43
+ for (const tb of page.textBoxes) {
44
+ const midY = (tb.bounds.top + tb.bounds.bottom) / 2;
45
+ if (midY >= TOP_ZONE_MIN_Y || midY <= BOTTOM_ZONE_MAX_Y) {
46
+ const key = tb.text.trim().replace(/\s+/g, " ");
47
+ if (key.length > 0)
48
+ zoneTexts.add(key);
49
+ }
50
+ }
51
+ pageZoneTexts.push(zoneTexts);
52
+ }
53
+ // Step 2: Count global frequency AND longest consecutive run for each text
54
+ const globalCount = new Map();
55
+ const maxConsecutive = new Map();
56
+ // Collect all unique zone texts
57
+ const allTexts = new Set();
58
+ for (const zts of pageZoneTexts) {
59
+ for (const t of zts)
60
+ allTexts.add(t);
61
+ }
62
+ for (const text of allTexts) {
63
+ let total = 0;
64
+ let consecutive = 0;
65
+ let maxRun = 0;
66
+ for (const zts of pageZoneTexts) {
67
+ if (zts.has(text)) {
68
+ total++;
69
+ consecutive++;
70
+ if (consecutive > maxRun)
71
+ maxRun = consecutive;
72
+ }
73
+ else {
74
+ consecutive = 0;
75
+ }
76
+ }
77
+ globalCount.set(text, total);
78
+ maxConsecutive.set(text, maxRun);
79
+ }
80
+ // Step 3: Identify running headers/footers
81
+ const globalThreshold = Math.max(3, Math.floor(pages.length * 0.2));
82
+ const repeatedTexts = new Set();
83
+ for (const text of allTexts) {
84
+ const gc = globalCount.get(text) ?? 0;
85
+ const mc = maxConsecutive.get(text) ?? 0;
86
+ // Global: appears on 20%+ of pages
87
+ if (gc >= globalThreshold) {
88
+ repeatedTexts.add(text);
89
+ continue;
90
+ }
91
+ // Consecutive: appears on 8+ consecutive pages (chapter-level headers)
92
+ if (mc >= MIN_CONSECUTIVE_PAGES) {
93
+ repeatedTexts.add(text);
94
+ }
95
+ }
96
+ if (repeatedTexts.size === 0)
97
+ return;
98
+ // Step 4: Remove matching text boxes from each page
99
+ for (const page of pages) {
100
+ page.textBoxes = page.textBoxes.filter((tb) => {
101
+ const midY = (tb.bounds.top + tb.bounds.bottom) / 2;
102
+ if (midY < TOP_ZONE_MIN_Y && midY > BOTTOM_ZONE_MAX_Y)
103
+ return true;
104
+ const normalized = tb.text.trim().replace(/\s+/g, " ");
105
+ return !repeatedTexts.has(normalized);
106
+ });
107
+ }
108
+ }
@@ -0,0 +1,19 @@
1
+ /**
2
+ * PDF to Markdown converter.
3
+ *
4
+ * Uses mupdf (native WASM) for fast PDF parsing and a custom pipeline for
5
+ * table detection via vector line extraction + raycasting.
6
+ *
7
+ * Pipeline:
8
+ * 1. Extract text boxes + vector segments + image regions per page (mupdf)
9
+ * 2. Detect column layout (single vs multi-column)
10
+ * 3. Per column: detect table grids from segments (grid detection + raycasting)
11
+ * 4. Render diagrams as PNG files (if output directory provided)
12
+ * 5. Render tables as markdown tables, free text as paragraphs/headings
13
+ */
14
+ import type { ConversionResult, Converter, StreamInfo } from "../../types.js";
15
/**
 * Converter that turns PDF input into markdown.
 */
export declare class PdfConverter implements Converter {
    /** Name identifying this converter. */
    name: string;
    /**
     * Report whether this converter handles the described stream.
     *
     * @param streamInfo Description of the input stream.
     * @returns `true` when the stream is accepted.
     */
    accepts(streamInfo: StreamInfo): boolean;
    /**
     * Convert a PDF held in memory.
     *
     * @param input Raw PDF bytes.
     * @param streamInfo Description of the input stream.
     * @returns A promise resolving to the conversion result.
     */
    convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult>;
}
@@ -0,0 +1,116 @@
1
+ /**
2
+ * PDF to Markdown converter.
3
+ *
4
+ * Uses mupdf (native WASM) for fast PDF parsing and a custom pipeline for
5
+ * table detection via vector line extraction + raycasting.
6
+ *
7
+ * Pipeline:
8
+ * 1. Extract text boxes + vector segments + image regions per page (mupdf)
9
+ * 2. Detect column layout (single vs multi-column)
10
+ * 3. Per column: detect table grids from segments (grid detection + raycasting)
11
+ * 4. Render diagrams as PNG files (if output directory provided)
12
+ * 5. Render tables as markdown tables, free text as paragraphs/headings
13
+ */
14
+ import { mkdirSync, writeFileSync } from "node:fs";
15
+ import { join } from "node:path";
16
+ import { detectColumns } from "./columns.js";
17
+ import { extractPages, renderImageRegion } from "./extract.js";
18
+ import { resolveTableGrids } from "./grid.js";
19
+ import { stripHeadersFooters } from "./headers.js";
20
+ import { renderPageContent } from "./render.js";
21
/** File extensions (with leading dot) that identify PDF input. */
const EXTENSIONS = [".pdf"];
/** MIME type prefixes that identify PDF input. */
const MIMETYPES = ["application/pdf", "application/x-pdf"];
23
/**
 * Render one group of text boxes (a single column, or a whole page) to
 * markdown.
 *
 * Table grids are resolved first; every text box whose id is reported as
 * consumed by a grid is withheld from the free-text list. The remaining
 * boxes, the grids, the image blocks and the complete box list are then
 * handed to the page renderer.
 *
 * @param pageNumber Page the boxes belong to.
 * @param textBoxes Text boxes of this column/page.
 * @param segments Vector segments available for table detection.
 * @param imageBlocks Pre-rendered image markdown blocks to include.
 * @returns Whatever `renderPageContent` produces for these inputs.
 */
function processColumn(pageNumber, textBoxes, segments, imageBlocks) {
    const detection = resolveTableGrids(pageNumber, textBoxes, segments);
    const tableGrids = detection.grids;
    const usedByTables = new Set(detection.consumedIds);
    const looseBoxes = [];
    for (const box of textBoxes) {
        if (!usedByTables.has(box.id)) {
            looseBoxes.push(box);
        }
    }
    return renderPageContent(looseBoxes, tableGrids, imageBlocks, textBoxes);
}
33
/** Horizontal slack, in page units, when matching segments to a column. */
const COLUMN_SEGMENT_MARGIN = 10;
/**
 * Build the image markdown blocks for one page.
 *
 * With an image directory, each image region is rendered to
 * `<imageDir>/<id>.png` and referenced by a markdown image link; an image
 * whose rendering or writing throws is skipped. Without a directory, each
 * image becomes an HTML comment placeholder.
 */
function collectImageBlocks(page, imageDir, pdfBytes) {
    const blocks = [];
    if (!(page.images.length > 0)) {
        return blocks;
    }
    for (const image of page.images) {
        if (!imageDir) {
            blocks.push({
                topY: image.topY,
                markdown: `<!-- image: ${image.id} (page ${image.pageNumber}, ${image.bbox.w}x${image.bbox.h}pt) -->`,
            });
            continue;
        }
        const target = join(imageDir, `${image.id}.png`);
        try {
            writeFileSync(target, renderImageRegion(pdfBytes, image));
            blocks.push({
                topY: image.topY,
                markdown: `![${image.id}](${target})`,
            });
        }
        catch {
            // Best effort: an image that cannot be rendered or written is left out.
        }
    }
    return blocks;
}
/**
 * Render a multi-column page: each column is processed on its own with the
 * segments that horizontally overlap it (within a small margin), and the
 * non-empty results are joined with blank lines.
 */
function renderColumns(page, columns, imageBlocks) {
    const parts = [];
    for (const boxes of columns) {
        // Horizontal extent of the column's text.
        let leftEdge = Infinity;
        let rightEdge = -Infinity;
        for (const box of boxes) {
            leftEdge = Math.min(leftEdge, box.bounds.left);
            rightEdge = Math.max(rightEdge, box.bounds.right);
        }
        const nearby = page.segments.filter((seg) => Math.max(seg.x1, seg.x2) >= leftEdge - COLUMN_SEGMENT_MARGIN &&
            Math.min(seg.x1, seg.x2) <= rightEdge + COLUMN_SEGMENT_MARGIN);
        // Images carry no X information here, so they are handed over only
        // while no column has produced output yet.
        const images = parts.length === 0 ? imageBlocks : [];
        const rendered = processColumn(page.pageNumber, boxes, nearby, images);
        if (rendered.length > 0) {
            parts.push(rendered);
        }
    }
    return parts.join("\n\n");
}
/**
 * Converter that turns PDF bytes into markdown.
 */
export class PdfConverter {
    name = "pdf";
    /**
     * Accept a stream whose extension is a known PDF extension or whose MIME
     * type starts with a known PDF MIME type.
     */
    accepts(streamInfo) {
        const ext = streamInfo.extension;
        if (ext && EXTENSIONS.includes(ext)) {
            return true;
        }
        const mime = streamInfo.mimetype;
        return Boolean(mime && MIMETYPES.some((known) => mime.startsWith(known)));
    }
    /**
     * Convert a PDF to markdown.
     *
     * Pages are extracted, running headers/footers are stripped, and each
     * page is rendered either as a single column or column by column. When
     * `streamInfo.imageDir` is set the directory is created (recursively)
     * and image regions are written there as PNG files.
     *
     * @param input Raw PDF bytes.
     * @param streamInfo Stream description; `imageDir` is read from it.
     * @returns An object whose `markdown` joins all non-empty page outputs
     *   with blank lines.
     */
    async convert(input, streamInfo) {
        const pdfBytes = new Uint8Array(input);
        const pages = await extractPages(pdfBytes);
        // Running headers/footers must go before layout analysis.
        stripHeadersFooters(pages);
        const imageDir = streamInfo.imageDir;
        if (imageDir) {
            mkdirSync(imageDir, { recursive: true });
        }
        const rendered = [];
        for (const page of pages) {
            const imageBlocks = collectImageBlocks(page, imageDir, pdfBytes);
            const layout = detectColumns(page.textBoxes);
            const pageMarkdown = layout.columnCount === 1
                ? processColumn(page.pageNumber, page.textBoxes, page.segments, imageBlocks)
                : renderColumns(page, layout.columns, imageBlocks);
            if (pageMarkdown.length > 0) {
                rendered.push(pageMarkdown);
            }
        }
        return { markdown: rendered.join("\n\n") };
    }
}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Markdown rendering for PDF pages.
3
+ *
4
+ * Converts table grids and free text boxes into markdown, handling:
5
+ * - Table grid → markdown table (`| col | col |`)
6
+ * - Free text → paragraphs with heading detection (by font size)
7
+ * - Content ordering (top-to-bottom via Y coordinate)
8
+ * - Paragraph wrap merging (lines broken across PDF line boundaries)
9
+ * - Page number removal
10
+ *
11
+ * Ported from @oharato/pdf2md-ts, stripped of CJK/TDnet-specific logic.
12
+ */
13
+ import type { TableGrid, TextBox } from "./types.js";
14
/**
 * Produce the markdown table for a single table grid.
 *
 * @param table Grid to render.
 * @returns The markdown text of the table.
 */
export declare function renderTableToMarkdown(table: TableGrid): string;
18
/**
 * Render the content of one page: free text and tables interleaved
 * top-to-bottom.
 *
 * @param freeTextBoxes Text boxes that are not part of any table.
 * @param tables Table grids detected on the page.
 * @param imageBlocks Optional image blocks, each with its top Y position and
 *   ready-made markdown.
 * @param allTextBoxes Optional full list of text boxes; the PDF converter
 *   passes the boxes before table-consumed ones are filtered out.
 *   NOTE(review): how the renderer uses it is not visible from this declaration.
 * @returns The markdown for the page.
 */
export declare function renderPageContent(
    freeTextBoxes: TextBox[],
    tables: TableGrid[],
    imageBlocks?: { topY: number; markdown: string }[],
    allTextBoxes?: TextBox[],
): string;