markit-ai 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/convert.d.ts +1 -0
- package/dist/commands/convert.js +7 -3
- package/dist/commands/formats.js +5 -0
- package/dist/converters/docx.d.ts +1 -1
- package/dist/converters/docx.js +35 -3
- package/dist/converters/epub.js +1 -0
- package/dist/converters/github.d.ts +18 -0
- package/dist/converters/github.js +148 -0
- package/dist/converters/pdf/columns.d.ts +35 -0
- package/dist/converters/pdf/columns.js +93 -0
- package/dist/converters/pdf/extract.d.ts +19 -0
- package/dist/converters/pdf/extract.js +513 -0
- package/dist/converters/pdf/grid.d.ts +25 -0
- package/dist/converters/pdf/grid.js +654 -0
- package/dist/converters/pdf/headers.d.ts +24 -0
- package/dist/converters/pdf/headers.js +108 -0
- package/dist/converters/pdf/index.d.ts +19 -0
- package/dist/converters/pdf/index.js +116 -0
- package/dist/converters/pdf/render.d.ts +24 -0
- package/dist/converters/pdf/render.js +513 -0
- package/dist/converters/pdf/types.d.ts +75 -0
- package/dist/converters/pdf/types.js +1 -0
- package/dist/converters/pptx.d.ts +1 -1
- package/dist/converters/pptx.js +74 -1
- package/dist/converters/xlsx.js +1 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/main.js +4 -1
- package/dist/markit.d.ts +1 -1
- package/dist/markit.js +19 -4
- package/dist/types.d.ts +8 -0
- package/package.json +3 -3
- package/dist/converters/pdf.d.ts +0 -6
- package/dist/converters/pdf.js +0 -29
|
@@ -0,0 +1,513 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF content extraction using mupdf.
|
|
3
|
+
*
|
|
4
|
+
* Extracts text boxes (with position, font size, bold) and vector line
|
|
5
|
+
* segments (table borders) from each page. Uses mupdf's native WASM
|
|
6
|
+
* engine for fast parsing, and reads raw content streams for vector graphics.
|
|
7
|
+
*
|
|
8
|
+
* Coordinate system: PDF native (origin = bottom-left, Y increases upward).
|
|
9
|
+
*/
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Text extraction
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
/** Y tolerance for merging text fragments on the same visual line. */
const SAME_LINE_Y_TOLERANCE = 2;
/** Max horizontal gap (pts) to merge adjacent fragments into one text box. */
const MAX_MERGE_GAP = 14;
/**
 * Merge horizontally adjacent raw text items on the same visual line into
 * word/phrase-level text boxes.
 */
function mergeIntoWords(raws) {
    if (raws.length === 0) {
        return [];
    }
    // Order top-to-bottom (Y descending in bottom-left coords); fragments on
    // the same visual line are ordered left-to-right.
    const ordered = raws.slice().sort((first, second) => {
        const deltaY = second.y - first.y;
        if (Math.abs(deltaY) > SAME_LINE_Y_TOLERANCE) {
            return deltaY;
        }
        return first.x - second.x;
    });
    const result = [];
    let acc = { ...ordered[0] };
    for (const frag of ordered.slice(1)) {
        const onSameLine = Math.abs(frag.y - acc.y) <= SAME_LINE_Y_TOLERANCE;
        const accRight = acc.x + acc.width;
        if (onSameLine && frag.x <= accRight + MAX_MERGE_GAP) {
            // Absorb the fragment; insert a space only when there is a
            // visible gap (> 1 pt) between the two pieces.
            const separator = frag.x - accRight > 1 ? " " : "";
            acc.text = acc.text + separator + frag.text;
            acc.width = frag.x + frag.width - acc.x;
            acc.height = Math.max(acc.height, frag.height);
            acc.fontSize = Math.max(acc.fontSize, frag.fontSize);
            acc.isBold = acc.isBold || frag.isBold;
        }
        else {
            result.push(acc);
            acc = { ...frag };
        }
    }
    result.push(acc);
    return result;
}
|
|
52
|
+
/**
 * Extract text boxes from a mupdf page using structured text output.
 *
 * mupdf's structured text JSON uses top-left origin; we convert to
 * bottom-left (standard PDF coordinates) using the page height.
 */
function extractTextBoxes(page, pageNumber, pageHeight, stext) {
    // Parse the structured text lazily when the caller did not supply it.
    const parsed = stext
        ? stext
        : JSON.parse(page.toStructuredText("preserve-whitespace").asJSON());
    const fragments = [];
    for (const block of parsed.blocks) {
        if (block.type !== "text") {
            continue;
        }
        for (const line of block.lines) {
            const content = line.text?.trim();
            if (!content) {
                continue;
            }
            const size = line.font?.size ?? 0;
            const weight = line.font?.weight ?? "normal";
            const faceName = line.font?.name ?? "";
            // Bold via explicit weight or common face-name markers.
            const bold = weight === "bold" ||
                /bold/i.test(faceName) ||
                /Black|Heavy/i.test(faceName);
            // mupdf bbox {x, y, w, h} is top-left based; flip Y so the box
            // bottom is expressed in bottom-left (PDF) coordinates.
            const bottomY = pageHeight - (line.bbox.y + line.bbox.h);
            fragments.push({
                text: content,
                x: line.bbox.x,
                y: bottomY,
                width: line.bbox.w,
                height: line.bbox.h,
                fontSize: size,
                isBold: bold,
            });
        }
    }
    return mergeIntoWords(fragments)
        .map((word, index) => ({
        id: `p${pageNumber}-t${index}`,
        text: word.text.trim(),
        pageNumber,
        fontSize: word.fontSize,
        isBold: word.isBold,
        bounds: {
            left: word.x,
            right: word.x + word.width,
            bottom: word.y,
            top: word.y + word.height,
        },
    }))
        .filter((box) => box.text.length > 0);
}
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Vector segment extraction from raw content stream
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
/** Minimum aspect ratio for a filled rect to be considered a line. */
const LINE_ASPECT_THRESHOLD = 6;
/** Minimum length (pts) for a segment to count. */
const MIN_LENGTH = 2;
/** Maximum thickness (pts) for a border line (filters out filled areas). */
const MAX_THICKNESS = 3;
/**
 * Convert a thin filled rectangle to a horizontal or vertical segment.
 * Returns null if the rect doesn't look like a border line.
 */
function thinRectToSegment(id, x, y, w, h) {
    const absW = Math.abs(w);
    const absH = Math.abs(h);
    // Wide and short: treat as a horizontal rule along the rect's midline.
    const horizontal = absW > absH * LINE_ASPECT_THRESHOLD &&
        absW >= MIN_LENGTH &&
        absH <= MAX_THICKNESS;
    if (horizontal) {
        const midY = y + absH / 2;
        return { id, x1: x, y1: midY, x2: x + absW, y2: midY };
    }
    // Tall and narrow: treat as a vertical rule along the rect's midline.
    const vertical = absH > absW * LINE_ASPECT_THRESHOLD &&
        absH >= MIN_LENGTH &&
        absW <= MAX_THICKNESS;
    if (vertical) {
        const midX = x + absW / 2;
        return { id, x1: midX, y1: y, x2: midX, y2: y + absH };
    }
    return null;
}
/**
 * Emit 4 edge segments from a stroked rectangle.
 */
function pushStrokedRectEdges(segments, id, x, y, w, h) {
    const absW = Math.abs(w);
    const absH = Math.abs(h);
    if (absW >= MIN_LENGTH) {
        // Bottom edge, then top edge.
        segments.push({ id: `${id}-b`, x1: x, y1: y, x2: x + absW, y2: y });
        segments.push({
            id: `${id}-t`,
            x1: x,
            y1: y + absH,
            x2: x + absW,
            y2: y + absH,
        });
    }
    if (absH >= MIN_LENGTH) {
        // Left edge, then right edge.
        segments.push({ id: `${id}-l`, x1: x, y1: y, x2: x, y2: y + absH });
        segments.push({
            id: `${id}-r`,
            x1: x + absW,
            y1: y,
            x2: x + absW,
            y2: y + absH,
        });
    }
}
|
|
169
|
+
/** Identity affine matrix in PDF [a, b, c, d, e, f] form. */
const CTM_IDENTITY = [1, 0, 0, 1, 0, 0];
/** Concatenate two affine matrices: result = parent × child. */
function ctmConcat(p, c) {
    const [pa, pb, pcc, pd, pe, pf] = p;
    const [ca, cb, cc, cd, ce, cf] = c;
    return [
        pa * ca + pcc * cb,
        pb * ca + pd * cb,
        pa * cc + pcc * cd,
        pb * cc + pd * cd,
        pa * ce + pcc * cf + pe,
        pb * ce + pd * cf + pf,
    ];
}
|
|
181
|
+
/** Apply affine matrix m to point (x, y); returns [x', y']. */
function ctmApply(m, x, y) {
    const [a, b, c, d, e, f] = m;
    return [a * x + c * y + e, b * x + d * y + f];
}
|
|
184
|
+
// ---------------------------------------------------------------------------
|
|
185
|
+
// Content stream parsing
|
|
186
|
+
// ---------------------------------------------------------------------------
|
|
187
|
+
/**
 * Parse a PDF content stream and extract line segments from thin filled
 * rectangles (re+f), stroked rectangles (re+S), and explicit lines (m/l+S).
 * Tracks the CTM via q/Q/cm operators so coordinates are in page space.
 *
 * PDF operands precede their operator, so each operator branch reads its
 * arguments backwards from tokens[idx - k]. Curves (c/v/y) and clipping are
 * not handled — only straight-line table-border geometry is of interest.
 *
 * @param raw        Decoded content-stream text for one page.
 * @param pageNumber 1-based page number, used only to build segment ids.
 * @returns Array of { id, x1, y1, x2, y2 } segments in page space.
 */
function extractSegmentsFromContentStream(raw, pageNumber) {
    const segments = [];
    const tokens = tokenizeContentStream(raw);
    let idx = 0;
    // Current stroke width; the PDF default is 1.0.
    let strokeWidth = 1.0;
    // Graphics state stack (q/Q): saves CTM + strokeWidth
    let ctm = [...CTM_IDENTITY];
    const stateStack = [];
    // State for path building (in user coordinates, pre-CTM)
    let curX = 0;
    let curY = 0;
    let pathStartX = 0;
    let pathStartY = 0;
    const pendingRects = [];
    const pendingLines = [];
    // Convert the pending path into segments when a painting operator fires,
    // then reset the pending path.
    // NOTE(review): because this clears pendingRects/pendingLines at the end,
    // the B/B*/b/b* branch below ("fill" then "stroke") gives the stroke pass
    // an empty path — stroked edges of fill+stroke rects are dropped. Confirm
    // whether that is intended.
    function flushPath(mode) {
        const sid = () => `p${pageNumber}-s${segments.length}`;
        if (mode === "fill") {
            for (const r of pendingRects) {
                // Transform the rect corners through CTM, then check if it's a thin line
                const [x0, y0] = ctmApply(ctm, r.x, r.y);
                const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
                const seg = thinRectToSegment(sid(), Math.min(x0, x1), Math.min(y0, y1), Math.abs(x1 - x0), Math.abs(y1 - y0));
                if (seg)
                    segments.push(seg);
            }
        }
        else if (mode === "stroke" && strokeWidth <= MAX_THICKNESS) {
            for (const r of pendingRects) {
                const [x0, y0] = ctmApply(ctm, r.x, r.y);
                const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
                pushStrokedRectEdges(segments, sid(), Math.min(x0, x1), Math.min(y0, y1), Math.abs(x1 - x0), Math.abs(y1 - y0));
            }
            for (const l of pendingLines) {
                const [lx1, ly1] = ctmApply(ctm, l.x1, l.y1);
                const [lx2, ly2] = ctmApply(ctm, l.x2, l.y2);
                const dx = Math.abs(lx2 - lx1);
                const dy = Math.abs(ly2 - ly1);
                // Only keep H/V lines
                if ((dx >= MIN_LENGTH && dy < 1) || (dy >= MIN_LENGTH && dx < 1)) {
                    segments.push({ id: sid(), x1: lx1, y1: ly1, x2: lx2, y2: ly2 });
                }
            }
        }
        pendingRects.length = 0;
        pendingLines.length = 0;
    }
    while (idx < tokens.length) {
        const t = tokens[idx];
        if (t === "q") {
            // Save graphics state.
            stateStack.push({ ctm: [...ctm], strokeWidth });
        }
        else if (t === "Q") {
            // Restore graphics state; tolerate unbalanced Q by ignoring it.
            const saved = stateStack.pop();
            if (saved) {
                ctm = saved.ctm;
                strokeWidth = saved.strokeWidth;
            }
        }
        else if (t === "cm" && idx >= 6) {
            // Concatenate matrix: six numeric operands precede "cm".
            const a = Number(tokens[idx - 6]);
            const b = Number(tokens[idx - 5]);
            const c = Number(tokens[idx - 4]);
            const d = Number(tokens[idx - 3]);
            const e = Number(tokens[idx - 2]);
            const f = Number(tokens[idx - 1]);
            ctm = ctmConcat(ctm, [a, b, c, d, e, f]);
        }
        else if (t === "w" && idx >= 1) {
            // Set line width; `|| strokeWidth` keeps the old value on NaN
            // (and on an explicit 0-width, which PDF treats as thinnest line).
            strokeWidth = Number(tokens[idx - 1]) || strokeWidth;
        }
        else if (t === "re" && idx >= 4) {
            // Rectangle subpath: "x y w h re".
            const x = Number(tokens[idx - 4]);
            const y = Number(tokens[idx - 3]);
            const w = Number(tokens[idx - 2]);
            const h = Number(tokens[idx - 1]);
            if (Number.isFinite(x + y + w + h)) {
                pendingRects.push({ x, y, w, h });
            }
        }
        else if (t === "m" && idx >= 2) {
            // moveTo: begins a new subpath; remember its start for closePath.
            curX = Number(tokens[idx - 2]);
            curY = Number(tokens[idx - 1]);
            pathStartX = curX;
            pathStartY = curY;
        }
        else if (t === "l" && idx >= 2) {
            // lineTo: record the segment and advance the current point.
            const x2 = Number(tokens[idx - 2]);
            const y2 = Number(tokens[idx - 1]);
            pendingLines.push({ x1: curX, y1: curY, x2, y2 });
            curX = x2;
            curY = y2;
        }
        else if (t === "h") {
            // closePath: line back to start
            if (curX !== pathStartX || curY !== pathStartY) {
                pendingLines.push({
                    x1: curX,
                    y1: curY,
                    x2: pathStartX,
                    y2: pathStartY,
                });
            }
            curX = pathStartX;
            curY = pathStartY;
        }
        else if (t === "f" || t === "F" || t === "f*") {
            // Fill (nonzero or even-odd): keep only thin "line-like" rects.
            flushPath("fill");
        }
        else if (t === "S" || t === "s") {
            if (t === "s") {
                // closeStroke: implicit closePath
                if (curX !== pathStartX || curY !== pathStartY) {
                    pendingLines.push({
                        x1: curX,
                        y1: curY,
                        x2: pathStartX,
                        y2: pathStartY,
                    });
                }
            }
            flushPath("stroke");
        }
        else if (t === "B" || t === "B*" || t === "b" || t === "b*") {
            // fill + stroke combined
            flushPath("fill");
            flushPath("stroke");
        }
        else if (t === "n") {
            // end path without painting — discard
            pendingRects.length = 0;
            pendingLines.length = 0;
        }
        idx++;
    }
    return segments;
}
|
|
329
|
+
/**
 * Fast tokenizer for PDF content streams.
 * Splits on whitespace, skipping comments and string literals.
 *
 * Skipped entirely: comments (% … \n), string literals (…) with nesting and
 * backslash escapes, hex strings <…>, and dict delimiters << >>. Everything
 * else (numbers, names, operators) is returned verbatim.
 *
 * @param raw Decoded content-stream text.
 * @returns Array of token strings in stream order.
 */
function tokenizeContentStream(raw) {
    const tokens = [];
    const len = raw.length;
    let i = 0;
    while (i < len) {
        const ch = raw.charCodeAt(i);
        // Skip whitespace
        if (ch <= 32) {
            i++;
            continue;
        }
        // Skip comments
        if (ch === 37 /* % */) {
            while (i < len && raw.charCodeAt(i) !== 10)
                i++;
            continue;
        }
        // Skip string literals (...), honoring nesting and \ escapes
        if (ch === 40 /* ( */) {
            let depth = 1;
            i++;
            while (i < len && depth > 0) {
                const c = raw.charCodeAt(i);
                if (c === 92 /* \ */) {
                    i++; // skip the escaped character
                }
                else if (c === 40) {
                    depth++;
                }
                else if (c === 41) {
                    depth--;
                }
                i++;
            }
            continue;
        }
        // Skip hex strings <...>
        if (ch === 60 /* < */ && i + 1 < len && raw.charCodeAt(i + 1) !== 60) {
            i++;
            while (i < len && raw.charCodeAt(i) !== 62)
                i++;
            i++; // skip >
            continue;
        }
        // Skip dict delimiters << >>
        if (ch === 60 && i + 1 < len && raw.charCodeAt(i + 1) === 60) {
            i += 2;
            continue;
        }
        if (ch === 62 && i + 1 < len && raw.charCodeAt(i + 1) === 62) {
            i += 2;
            continue;
        }
        // Regular token: read until whitespace or delimiter
        const start = i;
        while (i < len) {
            const c = raw.charCodeAt(i);
            if (c <= 32 || c === 40 || c === 41 || c === 60 || c === 62 || c === 37)
                break;
            i++;
        }
        if (i > start) {
            tokens.push(raw.substring(start, i));
        }
        else {
            // Lone delimiter no branch consumed (stray ')', a '>' that is not
            // part of '>>', or a trailing '<'): advance one char to guarantee
            // forward progress. Without this, malformed streams caused an
            // infinite loop because the token scan broke at `start` and the
            // outer loop re-read the same character forever.
            i++;
        }
    }
    return tokens;
}
|
|
400
|
+
// ---------------------------------------------------------------------------
|
|
401
|
+
// Image region detection
|
|
402
|
+
// ---------------------------------------------------------------------------
|
|
403
|
+
/** Minimum area (pts²) for an image to be considered a diagram, not an icon. */
const MIN_IMAGE_AREA = 5000;
/**
 * Collect sufficiently large image blocks from a structured-text page.
 *
 * Keeps each image's mupdf (top-left origin) bbox untouched, and records
 * `topY` — the image's top edge converted to bottom-left (PDF) coordinates —
 * for ordering against text.
 */
function extractImageRegions(stext, pageNumber, pageHeight) {
    const found = [];
    for (const block of stext.blocks) {
        if (block.type !== "image") {
            continue;
        }
        const { x, y, w, h } = block.bbox;
        // Tiny images are icons/bullets, not diagrams — skip them.
        if (w * h < MIN_IMAGE_AREA) {
            continue;
        }
        found.push({
            id: `p${pageNumber}-img${found.length}`,
            pageNumber,
            bbox: { x, y, w, h },
            // Top edge flipped from mupdf top-left to PDF bottom-left coords.
            topY: pageHeight - y,
        });
    }
    return found;
}
|
|
424
|
+
// ---------------------------------------------------------------------------
|
|
425
|
+
// Public API
|
|
426
|
+
// ---------------------------------------------------------------------------
|
|
427
|
+
/**
 * Render an image region from a PDF page as a PNG buffer.
 * Uses mupdf's DrawDevice to render just the cropped area at 2x resolution.
 *
 * NOTE(review): this function calls CommonJS `require("mupdf")`, while
 * extractPages in this same file loads mupdf via `await import("mupdf")` and
 * the file uses `export` (ESM syntax). In a plain Node ES module, `require`
 * is not defined and this would throw at runtime — confirm the dist build
 * emits CJS, or switch to `module.createRequire(import.meta.url)`.
 *
 * @param input  Document source accepted by mupdf.Document.openDocument.
 * @param region Region as produced by extractImageRegions:
 *               { pageNumber (1-based), bbox: { x, y, w, h } } with bbox in
 *               mupdf top-left page coordinates.
 * @returns PNG bytes of the region rendered at 2x with 10pt padding.
 */
export function renderImageRegion(input, region) {
    const mupdf = require("mupdf");
    const doc = mupdf.Document.openDocument(input, "application/pdf");
    // region.pageNumber is 1-based; loadPage takes a 0-based index.
    const page = doc.loadPage(region.pageNumber - 1);
    // Expand the crop box by 10pt on every side.
    const pad = 10;
    const bx = region.bbox.x - pad;
    const by = region.bbox.y - pad;
    const bw = region.bbox.w + 2 * pad;
    const bh = region.bbox.h + 2 * pad;
    const scale = 2;
    const pw = Math.round(bw * scale);
    const ph = Math.round(bh * scale);
    // RGB pixmap, no alpha; cleared to white below.
    const pix = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, [0, 0, pw, ph], false);
    pix.clear(255);
    // Scale by 2 and translate so the padded box lands at the pixmap origin.
    const matrix = [scale, 0, 0, scale, -bx * scale, -by * scale];
    const dl = page.toDisplayList();
    const dev = new mupdf.DrawDevice(matrix, pix);
    dl.run(dev, mupdf.Matrix.identity);
    dev.close();
    return pix.asPNG();
}
|
|
452
|
+
/**
 * Extract text boxes and vector segments from all pages of a PDF buffer.
 *
 * For each page, runs one structured-text pass (for text boxes and image
 * regions) and one raw content-stream pass (for table-border segments).
 *
 * @param input Document source accepted by mupdf.Document.openDocument.
 * @returns Promise of one entry per page:
 *          { pageNumber (1-based), textBoxes, segments, images }.
 * @throws Error when the optional 'mupdf' dependency cannot be imported.
 */
export async function extractPages(input) {
    let mupdf;
    try {
        mupdf = await import("mupdf");
    }
    catch {
        throw new Error("PDF support requires 'mupdf'. Install it: npm install mupdf");
    }
    const doc = mupdf.Document.openDocument(input, "application/pdf");
    const pages = [];
    // NOTE(review): countPages() is re-evaluated on every iteration — hoist
    // into a local if profiling shows it matters.
    for (let i = 0; i < doc.countPages(); i++) {
        const pageNumber = i + 1;
        const page = doc.loadPage(i);
        // Page height derived from the bounds array (indices 1 and 3).
        const bounds = page.getBounds();
        const pageHeight = bounds[3] - bounds[1];
        // Single structured text pass with both flags
        const stext = JSON.parse(page.toStructuredText("preserve-whitespace,preserve-images").asJSON());
        // Extract text boxes and image regions from the same parse
        const textBoxes = extractTextBoxes(page, pageNumber, pageHeight, stext);
        const images = extractImageRegions(stext, pageNumber, pageHeight);
        // Extract vector segments from raw content stream
        let segments = [];
        try {
            const pageObj = page.getObject();
            const contents = pageObj.get("Contents");
            if (contents) {
                let rawBytes;
                if (contents.isArray()) {
                    // Multiple content streams — concatenate
                    const parts = [];
                    const len = contents.length ?? 0;
                    for (let j = 0; j < len; j++) {
                        const stream = contents.get(j);
                        if (stream?.readStream) {
                            parts.push(stream.readStream().asUint8Array());
                        }
                    }
                    // Join all stream bodies into one contiguous byte array.
                    const totalLen = parts.reduce((s, p) => s + p.length, 0);
                    rawBytes = new Uint8Array(totalLen);
                    let offset = 0;
                    for (const part of parts) {
                        rawBytes.set(part, offset);
                        offset += part.length;
                    }
                }
                else {
                    rawBytes = contents.readStream().asUint8Array();
                }
                // Decode as text: segment extraction only reads operators and
                // numeric operands, and skips string/hex data.
                const raw = new TextDecoder().decode(rawBytes);
                segments = extractSegmentsFromContentStream(raw, pageNumber);
            }
        }
        catch {
            // Content stream extraction failed — proceed with text only
        }
        pages.push({ pageNumber, textBoxes, segments, images });
    }
    return pages;
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * Table grid detection from vector segments and text boxes.
 *
 * Ported from @oharato/pdf2md-ts with TypeScript types and without
 * CJK-specific borderless table heuristics. The core algorithm:
 *
 * 1. Classify segments as horizontal or vertical lines
 * 2. Group horizontal Y-lines into table groups (split by vertical gaps)
 * 3. For each group:
 *    a. Full grid (H+V lines): build cells from grid intersections,
 *       place text via raycasting
 *    b. H-line only (no V lines): infer columns from text X positions
 * 4. Prune empty rows/cols
 *
 * Coordinate system: PDF native (bottom-left origin, Y increases upward).
 */
import type { Segment, TableGrid, TextBox } from "./types.js";
export interface GridResult {
    /** Table grids detected on the page. */
    grids: TableGrid[];
    /**
     * Ids of inputs claimed by the detected grids — presumably so callers can
     * exclude them from normal text flow; confirm against grid.js.
     */
    consumedIds: string[];
}
/**
 * Detect all table grids on a single page from its text boxes and segments.
 */
export declare function resolveTableGrids(pageNumber: number, textBoxes: TextBox[], segments: Segment[]): GridResult;
|