markit-ai 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/markit.js +1 -1
- package/dist/types.d.ts +2 -0
- package/package.json +3 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * Multi-column layout detection and text box reordering.
 *
 * Many PDFs (legal documents, datasheets, academic papers) use two-column
 * layouts. Without column detection, text boxes are ordered by Y position
 * only, interleaving left and right column content.
 *
 * Algorithm:
 * 1. Collect left edges of all text boxes on the page
 * 2. Find the largest horizontal gap between consecutive left edges
 * 3. If gap > MIN_GAP_RATIO of the text width and both sides have
 *    enough boxes → multi-column detected
 * 4. Assign each text box to a column based on its center X
 * 5. Return columns in reading order (left-to-right, top-to-bottom)
 *
 * This only detects the column structure. The caller is responsible for
 * processing each column's text boxes independently (table detection,
 * rendering, etc.).
 */
import type { TextBox } from "./types.js";
export interface ColumnLayout {
    /** Number of columns detected (1 = single column, 2+ = multi-column). */
    columnCount: number;
    /** Text boxes grouped by column, in reading order (left to right). */
    columns: TextBox[][];
    /** X positions of column boundaries (between columns). */
    boundaries: number[];
}
/**
 * Detect column layout and return text boxes grouped by column.
 *
 * For single-column pages, returns all boxes in one group.
 * For multi-column pages, returns boxes split by column in reading order.
 *
 * NOTE(review): the shipped implementation only ever reports 1 or 2
 * columns, although the type permits more — confirm before relying on
 * 3+ column detection.
 */
export declare function detectColumns(textBoxes: TextBox[]): ColumnLayout;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-column layout detection and text box reordering.
|
|
3
|
+
*
|
|
4
|
+
* Many PDFs (legal documents, datasheets, academic papers) use two-column
|
|
5
|
+
* layouts. Without column detection, text boxes are ordered by Y position
|
|
6
|
+
* only, interleaving left and right column content.
|
|
7
|
+
*
|
|
8
|
+
* Algorithm:
|
|
9
|
+
* 1. Collect left edges of all text boxes on the page
|
|
10
|
+
* 2. Find the largest horizontal gap between consecutive left edges
|
|
11
|
+
* 3. If gap > MIN_GAP_RATIO of the text width and both sides have
|
|
12
|
+
* enough boxes → multi-column detected
|
|
13
|
+
* 4. Assign each text box to a column based on its center X
|
|
14
|
+
* 5. Return columns in reading order (left-to-right, top-to-bottom)
|
|
15
|
+
*
|
|
16
|
+
* This only detects the column structure. The caller is responsible for
|
|
17
|
+
* processing each column's text boxes independently (table detection,
|
|
18
|
+
* rendering, etc.).
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Minimum gap as a fraction of the total text width to consider a column
 * boundary. A two-column layout typically has ~50% gap; a lower threshold
 * also catches asymmetric columns.
 */
const MIN_GAP_RATIO = 0.15;
/** Minimum number of text boxes required on each side of the gap. */
const MIN_BOXES_PER_COLUMN = 4;
/** Minimum absolute gap (pts) so small whitespace never splits a page. */
const MIN_GAP_PTS = 40;
/**
 * Detect column layout and return text boxes grouped by column.
 *
 * Single-column pages yield one group holding every box. Two-column pages
 * are detected by the widest gap between distinct left edges; boxes are
 * then partitioned at the gap's midpoint by their center X and returned
 * in reading order (left column first).
 */
export function detectColumns(textBoxes) {
    const singleColumn = () => ({
        columnCount: 1,
        columns: [textBoxes],
        boundaries: [],
    });
    if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) {
        return singleColumn();
    }
    // Distinct left edges, rounded to suppress floating-point noise.
    const edges = [...new Set(textBoxes.map((box) => Math.round(box.bounds.left)))];
    edges.sort((a, b) => a - b);
    if (edges.length < 2) {
        return singleColumn();
    }
    const minX = edges[0];
    let maxX = -Infinity;
    for (const box of textBoxes) {
        maxX = Math.max(maxX, Math.round(box.bounds.right));
    }
    const spanWidth = maxX - minX;
    if (spanWidth <= 0) {
        return singleColumn();
    }
    // Locate the widest gap between consecutive left-edge positions.
    let bestGap = 0;
    let bestLo = 0;
    let bestHi = 0;
    for (let i = 1; i < edges.length; i++) {
        const gap = edges[i] - edges[i - 1];
        if (gap > bestGap) {
            bestGap = gap;
            bestLo = edges[i - 1];
            bestHi = edges[i];
        }
    }
    if (bestGap / spanWidth < MIN_GAP_RATIO || bestGap < MIN_GAP_PTS) {
        return singleColumn();
    }
    // Partition at the midpoint of the gap using each box's center X.
    const splitX = (bestLo + bestHi) / 2;
    const leftBoxes = [];
    const rightBoxes = [];
    for (const box of textBoxes) {
        const centerX = (box.bounds.left + box.bounds.right) / 2;
        (centerX < splitX ? leftBoxes : rightBoxes).push(box);
    }
    // Both sides must carry real content, otherwise fall back to one column.
    if (leftBoxes.length < MIN_BOXES_PER_COLUMN ||
        rightBoxes.length < MIN_BOXES_PER_COLUMN) {
        return singleColumn();
    }
    return {
        columnCount: 2,
        columns: [leftBoxes, rightBoxes],
        boundaries: [splitX],
    };
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * PDF content extraction using mupdf.
 *
 * Extracts text boxes (with position, font size, bold) and vector line
 * segments (table borders) from each page. Uses mupdf's native WASM
 * engine for fast parsing, and reads raw content streams for vector graphics.
 *
 * Coordinate system: PDF native (origin = bottom-left, Y increases upward).
 */
import type { ImageRegion, PageContent } from "./types.js";
/**
 * Render an image region from a PDF page as a PNG buffer.
 * Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
 *
 * @param input  Raw PDF file bytes.
 * @param region Image region (1-based page number + bbox) to render.
 * @returns PNG-encoded bytes of the padded, cropped region.
 */
export declare function renderImageRegion(input: Uint8Array, region: ImageRegion): Uint8Array;
/**
 * Extract text boxes and vector segments from all pages of a PDF buffer.
 *
 * @param input Raw PDF file bytes.
 * @returns One PageContent per page, in document order.
 * @throws When the optional 'mupdf' dependency is not installed.
 */
export declare function extractPages(input: Uint8Array): Promise<PageContent[]>;
|
|
@@ -0,0 +1,513 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF content extraction using mupdf.
|
|
3
|
+
*
|
|
4
|
+
* Extracts text boxes (with position, font size, bold) and vector line
|
|
5
|
+
* segments (table borders) from each page. Uses mupdf's native WASM
|
|
6
|
+
* engine for fast parsing, and reads raw content streams for vector graphics.
|
|
7
|
+
*
|
|
8
|
+
* Coordinate system: PDF native (origin = bottom-left, Y increases upward).
|
|
9
|
+
*/
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Text extraction
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
/** Y tolerance (pts) for treating two fragments as the same visual line. */
const SAME_LINE_Y_TOLERANCE = 2;
/** Max horizontal gap (pts) allowed when merging adjacent fragments. */
const MAX_MERGE_GAP = 14;
/**
 * Merge horizontally adjacent raw text items on the same visual line into
 * word/phrase-level text boxes.
 *
 * Fragments are visited top-first (descending Y, bottom-left coordinates)
 * and left-to-right within a line; a space separator is inserted whenever
 * the gap between merged fragments exceeds 1pt.
 */
function mergeIntoWords(raws) {
    if (raws.length === 0) {
        return [];
    }
    const byReadingOrder = raws.slice().sort((a, b) => {
        const deltaY = b.y - a.y;
        if (Math.abs(deltaY) > SAME_LINE_Y_TOLERANCE) {
            return deltaY;
        }
        return a.x - b.x;
    });
    const out = [];
    let acc = { ...byReadingOrder[0] };
    for (const frag of byReadingOrder.slice(1)) {
        const accRight = acc.x + acc.width;
        const onSameLine = Math.abs(frag.y - acc.y) <= SAME_LINE_Y_TOLERANCE;
        const nearEnough = frag.x <= accRight + MAX_MERGE_GAP;
        if (onSameLine && nearEnough) {
            // Append to the current run, widening its box to cover both
            // fragments; bold/size are the max over merged fragments.
            acc.text += (frag.x - accRight > 1 ? " " : "") + frag.text;
            acc.width = frag.x + frag.width - acc.x;
            acc.height = Math.max(acc.height, frag.height);
            acc.fontSize = Math.max(acc.fontSize, frag.fontSize);
            acc.isBold = acc.isBold || frag.isBold;
        }
        else {
            out.push(acc);
            acc = { ...frag };
        }
    }
    out.push(acc);
    return out;
}
|
|
52
|
+
/**
 * Extract text boxes from a mupdf page using structured text output.
 *
 * mupdf's structured text JSON uses a top-left origin; Y values are
 * converted to bottom-left (standard PDF coordinates) via the page height.
 * When `stext` is not supplied it is parsed from the page on demand.
 */
function extractTextBoxes(page, pageNumber, pageHeight, stext) {
    const structured = stext
        ? stext
        : JSON.parse(page.toStructuredText("preserve-whitespace").asJSON());
    const fragments = [];
    for (const block of structured.blocks) {
        if (block.type !== "text") {
            continue;
        }
        for (const line of block.lines) {
            const trimmed = line.text?.trim();
            if (!trimmed) {
                continue;
            }
            const font = line.font ?? {};
            // Bold is flagged by font weight or by common face-name markers.
            const bold = (font.weight ?? "normal") === "bold" ||
                /bold/i.test(font.name ?? "") ||
                /Black|Heavy/i.test(font.name ?? "");
            // mupdf bbox is {x, y, w, h} in top-left coords;
            // bottom-left Y = pageHeight - (y + h).
            const { x, y, w, h } = line.bbox;
            fragments.push({
                text: trimmed,
                x,
                y: pageHeight - (y + h),
                width: w,
                height: h,
                fontSize: font.size ?? 0,
                isBold: bold,
            });
        }
    }
    // Merge fragments into words, then shape them into TextBox records.
    // IDs are assigned before the empty-text filter, matching prior output.
    return mergeIntoWords(fragments)
        .map((word, index) => ({
            id: `p${pageNumber}-t${index}`,
            text: word.text.trim(),
            pageNumber,
            fontSize: word.fontSize,
            isBold: word.isBold,
            bounds: {
                left: word.x,
                right: word.x + word.width,
                bottom: word.y,
                top: word.y + word.height,
            },
        }))
        .filter((box) => box.text.length > 0);
}
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Vector segment extraction from raw content stream
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
/** Minimum aspect ratio for a filled rect to be considered a line. */
const LINE_ASPECT_THRESHOLD = 6;
/** Minimum length (pts) for a segment to count. */
const MIN_LENGTH = 2;
/** Maximum thickness (pts) for a border line (filters out filled areas). */
const MAX_THICKNESS = 3;
/**
 * Convert a thin filled rectangle into a horizontal or vertical segment
 * running through the rectangle's centerline.
 * Returns null when the rect is not long/thin enough to be a border line.
 */
function thinRectToSegment(id, x, y, w, h) {
    const width = Math.abs(w);
    const height = Math.abs(h);
    const isHorizontal = width > height * LINE_ASPECT_THRESHOLD &&
        width >= MIN_LENGTH &&
        height <= MAX_THICKNESS;
    if (isHorizontal) {
        const midY = y + height / 2;
        return { id, x1: x, y1: midY, x2: x + width, y2: midY };
    }
    const isVertical = height > width * LINE_ASPECT_THRESHOLD &&
        height >= MIN_LENGTH &&
        width <= MAX_THICKNESS;
    if (isVertical) {
        const midX = x + width / 2;
        return { id, x1: midX, y1: y, x2: midX, y2: y + height };
    }
    return null;
}
|
|
141
|
+
/**
 * Emit up to four edge segments (bottom, top, left, right) for a stroked
 * rectangle, appending them to `segments`. Edges shorter than MIN_LENGTH
 * are skipped.
 */
function pushStrokedRectEdges(segments, id, x, y, w, h) {
    const width = Math.abs(w);
    const height = Math.abs(h);
    if (width >= MIN_LENGTH) {
        // Bottom and top horizontal edges.
        segments.push({ id: `${id}-b`, x1: x, y1: y, x2: x + width, y2: y });
        segments.push({
            id: `${id}-t`,
            x1: x,
            y1: y + height,
            x2: x + width,
            y2: y + height,
        });
    }
    if (height >= MIN_LENGTH) {
        // Left and right vertical edges.
        segments.push({ id: `${id}-l`, x1: x, y1: y, x2: x, y2: y + height });
        segments.push({
            id: `${id}-r`,
            x1: x + width,
            y1: y,
            x2: x + width,
            y2: y + height,
        });
    }
}
|
|
169
|
+
/** Identity affine matrix in PDF order [a, b, c, d, e, f]. */
const CTM_IDENTITY = [1, 0, 0, 1, 0, 0];
/** Concatenate two affine matrices: result = parent × child. */
function ctmConcat(p, c) {
    const [pa, pb, pc, pd, pe, pf] = p;
    const [ca, cb, cc, cd, ce, cf] = c;
    return [
        pa * ca + pc * cb,
        pb * ca + pd * cb,
        pa * cc + pc * cd,
        pb * cc + pd * cd,
        pa * ce + pc * cf + pe,
        pb * ce + pd * cf + pf,
    ];
}
/** Apply affine matrix m to the point (x, y); returns [x', y']. */
function ctmApply(m, x, y) {
    const [a, b, c, d, e, f] = m;
    return [a * x + c * y + e, b * x + d * y + f];
}
|
|
184
|
+
// ---------------------------------------------------------------------------
|
|
185
|
+
// Content stream parsing
|
|
186
|
+
// ---------------------------------------------------------------------------
|
|
187
|
+
/**
|
|
188
|
+
* Parse a PDF content stream and extract line segments from thin filled
|
|
189
|
+
* rectangles (re+f), stroked rectangles (re+S), and explicit lines (m/l+S).
|
|
190
|
+
* Tracks the CTM via q/Q/cm operators so coordinates are in page space.
|
|
191
|
+
*/
|
|
192
|
+
function extractSegmentsFromContentStream(raw, pageNumber) {
|
|
193
|
+
const segments = [];
|
|
194
|
+
const tokens = tokenizeContentStream(raw);
|
|
195
|
+
let idx = 0;
|
|
196
|
+
let strokeWidth = 1.0;
|
|
197
|
+
// Graphics state stack (q/Q): saves CTM + strokeWidth
|
|
198
|
+
let ctm = [...CTM_IDENTITY];
|
|
199
|
+
const stateStack = [];
|
|
200
|
+
// State for path building (in user coordinates, pre-CTM)
|
|
201
|
+
let curX = 0;
|
|
202
|
+
let curY = 0;
|
|
203
|
+
let pathStartX = 0;
|
|
204
|
+
let pathStartY = 0;
|
|
205
|
+
const pendingRects = [];
|
|
206
|
+
const pendingLines = [];
|
|
207
|
+
function flushPath(mode) {
|
|
208
|
+
const sid = () => `p${pageNumber}-s${segments.length}`;
|
|
209
|
+
if (mode === "fill") {
|
|
210
|
+
for (const r of pendingRects) {
|
|
211
|
+
// Transform the rect corners through CTM, then check if it's a thin line
|
|
212
|
+
const [x0, y0] = ctmApply(ctm, r.x, r.y);
|
|
213
|
+
const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
|
|
214
|
+
const seg = thinRectToSegment(sid(), Math.min(x0, x1), Math.min(y0, y1), Math.abs(x1 - x0), Math.abs(y1 - y0));
|
|
215
|
+
if (seg)
|
|
216
|
+
segments.push(seg);
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
else if (mode === "stroke" && strokeWidth <= MAX_THICKNESS) {
|
|
220
|
+
for (const r of pendingRects) {
|
|
221
|
+
const [x0, y0] = ctmApply(ctm, r.x, r.y);
|
|
222
|
+
const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
|
|
223
|
+
pushStrokedRectEdges(segments, sid(), Math.min(x0, x1), Math.min(y0, y1), Math.abs(x1 - x0), Math.abs(y1 - y0));
|
|
224
|
+
}
|
|
225
|
+
for (const l of pendingLines) {
|
|
226
|
+
const [lx1, ly1] = ctmApply(ctm, l.x1, l.y1);
|
|
227
|
+
const [lx2, ly2] = ctmApply(ctm, l.x2, l.y2);
|
|
228
|
+
const dx = Math.abs(lx2 - lx1);
|
|
229
|
+
const dy = Math.abs(ly2 - ly1);
|
|
230
|
+
// Only keep H/V lines
|
|
231
|
+
if ((dx >= MIN_LENGTH && dy < 1) || (dy >= MIN_LENGTH && dx < 1)) {
|
|
232
|
+
segments.push({ id: sid(), x1: lx1, y1: ly1, x2: lx2, y2: ly2 });
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
pendingRects.length = 0;
|
|
237
|
+
pendingLines.length = 0;
|
|
238
|
+
}
|
|
239
|
+
while (idx < tokens.length) {
|
|
240
|
+
const t = tokens[idx];
|
|
241
|
+
if (t === "q") {
|
|
242
|
+
stateStack.push({ ctm: [...ctm], strokeWidth });
|
|
243
|
+
}
|
|
244
|
+
else if (t === "Q") {
|
|
245
|
+
const saved = stateStack.pop();
|
|
246
|
+
if (saved) {
|
|
247
|
+
ctm = saved.ctm;
|
|
248
|
+
strokeWidth = saved.strokeWidth;
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
else if (t === "cm" && idx >= 6) {
|
|
252
|
+
const a = Number(tokens[idx - 6]);
|
|
253
|
+
const b = Number(tokens[idx - 5]);
|
|
254
|
+
const c = Number(tokens[idx - 4]);
|
|
255
|
+
const d = Number(tokens[idx - 3]);
|
|
256
|
+
const e = Number(tokens[idx - 2]);
|
|
257
|
+
const f = Number(tokens[idx - 1]);
|
|
258
|
+
ctm = ctmConcat(ctm, [a, b, c, d, e, f]);
|
|
259
|
+
}
|
|
260
|
+
else if (t === "w" && idx >= 1) {
|
|
261
|
+
strokeWidth = Number(tokens[idx - 1]) || strokeWidth;
|
|
262
|
+
}
|
|
263
|
+
else if (t === "re" && idx >= 4) {
|
|
264
|
+
const x = Number(tokens[idx - 4]);
|
|
265
|
+
const y = Number(tokens[idx - 3]);
|
|
266
|
+
const w = Number(tokens[idx - 2]);
|
|
267
|
+
const h = Number(tokens[idx - 1]);
|
|
268
|
+
if (Number.isFinite(x + y + w + h)) {
|
|
269
|
+
pendingRects.push({ x, y, w, h });
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
else if (t === "m" && idx >= 2) {
|
|
273
|
+
curX = Number(tokens[idx - 2]);
|
|
274
|
+
curY = Number(tokens[idx - 1]);
|
|
275
|
+
pathStartX = curX;
|
|
276
|
+
pathStartY = curY;
|
|
277
|
+
}
|
|
278
|
+
else if (t === "l" && idx >= 2) {
|
|
279
|
+
const x2 = Number(tokens[idx - 2]);
|
|
280
|
+
const y2 = Number(tokens[idx - 1]);
|
|
281
|
+
pendingLines.push({ x1: curX, y1: curY, x2, y2 });
|
|
282
|
+
curX = x2;
|
|
283
|
+
curY = y2;
|
|
284
|
+
}
|
|
285
|
+
else if (t === "h") {
|
|
286
|
+
// closePath: line back to start
|
|
287
|
+
if (curX !== pathStartX || curY !== pathStartY) {
|
|
288
|
+
pendingLines.push({
|
|
289
|
+
x1: curX,
|
|
290
|
+
y1: curY,
|
|
291
|
+
x2: pathStartX,
|
|
292
|
+
y2: pathStartY,
|
|
293
|
+
});
|
|
294
|
+
}
|
|
295
|
+
curX = pathStartX;
|
|
296
|
+
curY = pathStartY;
|
|
297
|
+
}
|
|
298
|
+
else if (t === "f" || t === "F" || t === "f*") {
|
|
299
|
+
flushPath("fill");
|
|
300
|
+
}
|
|
301
|
+
else if (t === "S" || t === "s") {
|
|
302
|
+
if (t === "s") {
|
|
303
|
+
// closeStroke: implicit closePath
|
|
304
|
+
if (curX !== pathStartX || curY !== pathStartY) {
|
|
305
|
+
pendingLines.push({
|
|
306
|
+
x1: curX,
|
|
307
|
+
y1: curY,
|
|
308
|
+
x2: pathStartX,
|
|
309
|
+
y2: pathStartY,
|
|
310
|
+
});
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
flushPath("stroke");
|
|
314
|
+
}
|
|
315
|
+
else if (t === "B" || t === "B*" || t === "b" || t === "b*") {
|
|
316
|
+
// fill + stroke combined
|
|
317
|
+
flushPath("fill");
|
|
318
|
+
flushPath("stroke");
|
|
319
|
+
}
|
|
320
|
+
else if (t === "n") {
|
|
321
|
+
// end path without painting — discard
|
|
322
|
+
pendingRects.length = 0;
|
|
323
|
+
pendingLines.length = 0;
|
|
324
|
+
}
|
|
325
|
+
idx++;
|
|
326
|
+
}
|
|
327
|
+
return segments;
|
|
328
|
+
}
|
|
329
|
+
/**
 * Fast tokenizer for PDF content streams.
 * Splits on whitespace; skips comments, string literals, hex strings, and
 * dictionary delimiters so only operands and operators are returned.
 *
 * Fix: a stray delimiter — an unbalanced ')', or a '<'/'>' with nothing
 * after it — was consumed by no branch, so the regular-token reader broke
 * immediately with an empty token and the outer loop never advanced
 * (infinite loop). Such delimiters are now skipped explicitly, guaranteeing
 * forward progress on malformed streams.
 */
function tokenizeContentStream(raw) {
    const tokens = [];
    const len = raw.length;
    let i = 0;
    while (i < len) {
        const ch = raw.charCodeAt(i);
        // Skip whitespace
        if (ch <= 32) {
            i++;
            continue;
        }
        // Skip comments (to end of line)
        if (ch === 37 /* % */) {
            while (i < len && raw.charCodeAt(i) !== 10)
                i++;
            continue;
        }
        // Skip string literals (...), honoring nesting and backslash escapes
        if (ch === 40 /* ( */) {
            let depth = 1;
            i++;
            while (i < len && depth > 0) {
                const c = raw.charCodeAt(i);
                if (c === 92 /* \ */) {
                    i++;
                }
                else if (c === 40) {
                    depth++;
                }
                else if (c === 41) {
                    depth--;
                }
                i++;
            }
            continue;
        }
        // Skip hex strings <...>
        if (ch === 60 /* < */ && i + 1 < len && raw.charCodeAt(i + 1) !== 60) {
            i++;
            while (i < len && raw.charCodeAt(i) !== 62)
                i++;
            i++; // skip >
            continue;
        }
        // Skip dict delimiters << >>
        if (ch === 60 && i + 1 < len && raw.charCodeAt(i + 1) === 60) {
            i += 2;
            continue;
        }
        if (ch === 62 && i + 1 < len && raw.charCodeAt(i + 1) === 62) {
            i += 2;
            continue;
        }
        // Regular token: read until whitespace or delimiter
        const start = i;
        while (i < len) {
            const c = raw.charCodeAt(i);
            if (c <= 32 || c === 40 || c === 41 || c === 60 || c === 62 || c === 37)
                break;
            i++;
        }
        if (i > start) {
            tokens.push(raw.substring(start, i));
        }
        else {
            // The current char is an unconsumed delimiter (e.g. a lone ')',
            // or '<'/'>' at end of input) — skip it to guarantee progress.
            i++;
        }
    }
    return tokens;
}
|
|
400
|
+
// ---------------------------------------------------------------------------
|
|
401
|
+
// Image region detection
|
|
402
|
+
// ---------------------------------------------------------------------------
|
|
403
|
+
/** Minimum area (pts²) for an image to be considered a diagram, not an icon. */
const MIN_IMAGE_AREA = 5000;
/**
 * Collect image blocks large enough to matter from a structured-text parse.
 *
 * `topY` is the image's top edge converted to bottom-left PDF coordinates
 * so callers can interleave images with text boxes in reading order.
 */
function extractImageRegions(stext, pageNumber, pageHeight) {
    const regions = [];
    for (const block of stext.blocks) {
        if (block.type !== "image") {
            continue;
        }
        const bbox = block.bbox;
        // Ignore tiny images (icons, bullets, decorations).
        if (bbox.w * bbox.h < MIN_IMAGE_AREA) {
            continue;
        }
        regions.push({
            id: `p${pageNumber}-img${regions.length}`,
            pageNumber,
            bbox: { x: bbox.x, y: bbox.y, w: bbox.w, h: bbox.h },
            // mupdf Y is top-left based; flip to bottom-left for ordering.
            topY: pageHeight - bbox.y,
        });
    }
    return regions;
}
|
|
424
|
+
// ---------------------------------------------------------------------------
|
|
425
|
+
// Public API
|
|
426
|
+
// ---------------------------------------------------------------------------
|
|
427
|
+
/**
 * Render an image region from a PDF page as a PNG buffer.
 * Uses mupdf's DrawDevice to render just the cropped area at 2x resolution,
 * with a 10pt padding ring around the region's bbox.
 *
 * NOTE(review): this module uses ESM syntax (`export`), but `require` below
 * is only defined under CommonJS — if dist is loaded as native ESM this
 * line throws a ReferenceError. Consider `createRequire(import.meta.url)`
 * or an async dynamic import; verify against the actual build/runtime setup.
 */
export function renderImageRegion(input, region) {
    const mupdf = require("mupdf");
    const doc = mupdf.Document.openDocument(input, "application/pdf");
    // region.pageNumber is 1-based; mupdf pages are 0-based.
    const page = doc.loadPage(region.pageNumber - 1);
    // Pad the crop so content on the region boundary is not clipped.
    const pad = 10;
    const bx = region.bbox.x - pad;
    const by = region.bbox.y - pad;
    const bw = region.bbox.w + 2 * pad;
    const bh = region.bbox.h + 2 * pad;
    // Render at 2x for legible output.
    const scale = 2;
    const pw = Math.round(bw * scale);
    const ph = Math.round(bh * scale);
    const pix = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, [0, 0, pw, ph], false);
    // White background (no alpha channel requested above).
    pix.clear(255);
    // Scale, then translate so the padded bbox lands at the pixmap origin.
    const matrix = [scale, 0, 0, scale, -bx * scale, -by * scale];
    const dl = page.toDisplayList();
    const dev = new mupdf.DrawDevice(matrix, pix);
    dl.run(dev, mupdf.Matrix.identity);
    dev.close();
    // NOTE(review): pix/page/doc are not destroy()ed — confirm whether
    // mupdf's WASM heap requires explicit freeing to avoid leaks.
    return pix.asPNG();
}
|
|
452
|
+
/**
 * Extract text boxes and vector segments from all pages of a PDF buffer.
 *
 * Loads mupdf lazily so the dependency stays optional and throws a helpful
 * error when it is missing. Vector-segment extraction is best-effort: any
 * failure leaves the page with text boxes and images only.
 */
export async function extractPages(input) {
    let mupdf;
    try {
        mupdf = await import("mupdf");
    }
    catch {
        throw new Error("PDF support requires 'mupdf'. Install it: npm install mupdf");
    }
    const doc = mupdf.Document.openDocument(input, "application/pdf");
    const pages = [];
    for (let i = 0; i < doc.countPages(); i++) {
        const pageNumber = i + 1; // 1-based page numbers in output IDs
        const page = doc.loadPage(i);
        // getBounds() → [x0, y0, x1, y1]; height = y1 - y0.
        const bounds = page.getBounds();
        const pageHeight = bounds[3] - bounds[1];
        // Single structured text pass with both flags
        const stext = JSON.parse(page.toStructuredText("preserve-whitespace,preserve-images").asJSON());
        // Extract text boxes and image regions from the same parse
        const textBoxes = extractTextBoxes(page, pageNumber, pageHeight, stext);
        const images = extractImageRegions(stext, pageNumber, pageHeight);
        // Extract vector segments from raw content stream
        let segments = [];
        try {
            const pageObj = page.getObject();
            const contents = pageObj.get("Contents");
            if (contents) {
                let rawBytes;
                if (contents.isArray()) {
                    // Multiple content streams — concatenate
                    const parts = [];
                    const len = contents.length ?? 0;
                    for (let j = 0; j < len; j++) {
                        const stream = contents.get(j);
                        if (stream?.readStream) {
                            parts.push(stream.readStream().asUint8Array());
                        }
                    }
                    const totalLen = parts.reduce((s, p) => s + p.length, 0);
                    rawBytes = new Uint8Array(totalLen);
                    let offset = 0;
                    for (const part of parts) {
                        rawBytes.set(part, offset);
                        offset += part.length;
                    }
                }
                else {
                    rawBytes = contents.readStream().asUint8Array();
                }
                // NOTE(review): the stream is decoded as UTF-8 text; binary
                // payloads (e.g. inline images) could mangle nearby bytes —
                // confirm numeric operands survive for the streams parsed here.
                const raw = new TextDecoder().decode(rawBytes);
                segments = extractSegmentsFromContentStream(raw, pageNumber);
            }
        }
        catch {
            // Content stream extraction failed — proceed with text only
        }
        // NOTE(review): page objects are never destroy()ed — confirm whether
        // mupdf's WASM heap needs explicit freeing on long documents.
        pages.push({ pageNumber, textBoxes, segments, images });
    }
    return pages;
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * Table grid detection from vector segments and text boxes.
 *
 * Ported from @oharato/pdf2md-ts with TypeScript types and without
 * CJK-specific borderless table heuristics. The core algorithm:
 *
 * 1. Classify segments as horizontal or vertical lines
 * 2. Group horizontal Y-lines into table groups (split by vertical gaps)
 * 3. For each group:
 *    a. Full grid (H+V lines): build cells from grid intersections,
 *       place text via raycasting
 *    b. H-line only (no V lines): infer columns from text X positions
 * 4. Prune empty rows/cols
 *
 * Coordinate system: PDF native (bottom-left origin, Y increases upward).
 */
import type { Segment, TableGrid, TextBox } from "./types.js";
export interface GridResult {
    /** All table grids detected on the page. */
    grids: TableGrid[];
    /**
     * IDs of text boxes consumed by the detected grids — presumably those
     * placed into grid cells; confirm against the grid.js implementation.
     */
    consumedIds: string[];
}
/**
 * Detect all table grids on a single page from its text boxes and segments.
 *
 * @param pageNumber 1-based page number.
 * @param textBoxes  Text boxes on the page.
 * @param segments   Horizontal/vertical line segments on the page.
 */
export declare function resolveTableGrids(pageNumber: number, textBoxes: TextBox[], segments: Segment[]): GridResult;
|