markit-ai 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ /**
2
+ * PDF content extraction using mupdf.
3
+ *
4
+ * Extracts text boxes (with position, font size, bold) and vector line
5
+ * segments (table borders) from each page. Uses mupdf's native WASM
6
+ * engine for fast parsing, and reads raw content streams for vector graphics.
7
+ *
8
+ * Coordinate system: PDF native (origin = bottom-left, Y increases upward).
9
+ */
10
+ // ---------------------------------------------------------------------------
11
+ // Text extraction
12
+ // ---------------------------------------------------------------------------
/** Y tolerance (pts) within which two text fragments count as the same visual line. */
const SAME_LINE_Y_TOLERANCE = 2;
/** Max horizontal gap (pts) to merge adjacent fragments into one text box. */
const MAX_MERGE_GAP = 14;
/**
 * Merge horizontally adjacent raw text items on the same visual line into
 * word/phrase-level text boxes.
 *
 * @param raws Raw fragments shaped `{ text, x, y, width, height, fontSize, isBold }`.
 *             The array and its items are not mutated.
 * @returns A new array of merged fragments, ordered top-first then left-to-right.
 */
function mergeIntoWords(raws) {
    if (raws.length === 0)
        return [];
    // Sort by Y descending (top-first in bottom-left coords), then X ascending.
    // Fragments whose Y differs by no more than the tolerance are ordered by X only.
    const sorted = [...raws].sort((a, b) => {
        const dy = b.y - a.y;
        return Math.abs(dy) > SAME_LINE_Y_TOLERANCE ? dy : a.x - b.x;
    });
    const merged = [];
    let cur = { ...sorted[0] };
    for (let i = 1; i < sorted.length; i++) {
        const next = sorted[i];
        const curRight = cur.x + cur.width;
        const sameY = Math.abs(next.y - cur.y) <= SAME_LINE_Y_TOLERANCE;
        const close = next.x <= curRight + MAX_MERGE_GAP;
        if (sameY && close) {
            // Only insert a space when there is a visible gap between the fragments.
            const gap = next.x - curRight;
            cur.text += (gap > 1 ? " " : "") + next.text;
            // Grow to the union of both extents. Using only next's right edge would
            // shrink the box when next overlaps or lies inside the current one.
            const left = Math.min(cur.x, next.x);
            const right = Math.max(curRight, next.x + next.width);
            cur.x = left;
            cur.width = right - left;
            cur.height = Math.max(cur.height, next.height);
            cur.fontSize = Math.max(cur.fontSize, next.fontSize);
            cur.isBold = cur.isBold || next.isBold;
        }
        else {
            merged.push(cur);
            cur = { ...next };
        }
    }
    merged.push(cur);
    return merged;
}
/**
 * Build the text boxes of one page from mupdf structured text.
 *
 * The structured-text JSON positions lines from the top-left corner; each
 * line is flipped into bottom-left PDF coordinates using `pageHeight`
 * before the fragments are handed to `mergeIntoWords`.
 *
 * @param page       Page object; only used to produce structured text when `stext` is not supplied.
 * @param pageNumber Page number embedded in each box id and stored on each box.
 * @param pageHeight Page height used for the Y flip.
 * @param stext      Optional, already parsed structured-text JSON.
 * @returns Text boxes with non-empty text; ids carry the index each box had before empty boxes were dropped.
 */
function extractTextBoxes(page, pageNumber, pageHeight, stext) {
    const structured = stext
        ? stext
        : JSON.parse(page.toStructuredText("preserve-whitespace").asJSON());
    const fragments = [];
    for (const block of structured.blocks) {
        if (block.type === "text") {
            for (const line of block.lines) {
                const content = line.text?.trim();
                if (content) {
                    const font = line.font;
                    const name = font?.name ?? "";
                    // bbox is {x, y, w, h} measured from the top-left corner.
                    const box = line.bbox;
                    const top = box.y;
                    const tall = box.h;
                    fragments.push({
                        text: content,
                        x: box.x,
                        // Bottom edge in bottom-left coordinates.
                        y: pageHeight - (top + tall),
                        width: box.w,
                        height: tall,
                        fontSize: font?.size ?? 0,
                        // Bold by declared weight or by a telltale font name.
                        isBold: font?.weight === "bold" || /bold|black|heavy/i.test(name),
                    });
                }
            }
        }
    }
    const boxes = [];
    mergeIntoWords(fragments).forEach((word, index) => {
        const text = word.text.trim();
        if (text.length > 0) {
            boxes.push({
                id: `p${pageNumber}-t${index}`,
                text,
                pageNumber,
                fontSize: word.fontSize,
                isBold: word.isBold,
                bounds: {
                    left: word.x,
                    right: word.x + word.width,
                    bottom: word.y,
                    top: word.y + word.height,
                },
            });
        }
    });
    return boxes;
}
109
+ // ---------------------------------------------------------------------------
110
+ // Vector segment extraction from raw content stream
111
+ // ---------------------------------------------------------------------------
/** Minimum aspect ratio for a filled rect to be considered a line. */
const LINE_ASPECT_THRESHOLD = 6;
/** Minimum length (pts) for a segment to count. */
const MIN_LENGTH = 2;
/** Maximum thickness (pts) for a border line (filters out filled areas). */
const MAX_THICKNESS = 3;
/**
 * Convert a thin filled rectangle to a horizontal or vertical segment.
 *
 * `w` and `h` may be negative (PDF `re` allows this); the rectangle is first
 * normalised so that the segment covers the area the rectangle really
 * occupies instead of being mirrored to the other side of (x, y).
 *
 * @param id Identifier stored on the returned segment.
 * @param x  X of the rectangle's anchor corner.
 * @param y  Y of the rectangle's anchor corner.
 * @param w  Signed width.
 * @param h  Signed height.
 * @returns `{ id, x1, y1, x2, y2 }` along the rectangle's centre line, or
 *          `null` if the rect doesn't look like a border line.
 */
function thinRectToSegment(id, x, y, w, h) {
    const aw = Math.abs(w);
    const ah = Math.abs(h);
    // Minimum corner of the rectangle regardless of the sign of w/h.
    const left = w < 0 ? x + w : x;
    const bottom = h < 0 ? y + h : y;
    if (aw > ah * LINE_ASPECT_THRESHOLD &&
        aw >= MIN_LENGTH &&
        ah <= MAX_THICKNESS) {
        // Horizontal line through the vertical middle of the rect.
        const cy = bottom + ah / 2;
        return { id, x1: left, y1: cy, x2: left + aw, y2: cy };
    }
    if (ah > aw * LINE_ASPECT_THRESHOLD &&
        ah >= MIN_LENGTH &&
        aw <= MAX_THICKNESS) {
        // Vertical line through the horizontal middle of the rect.
        const cx = left + aw / 2;
        return { id, x1: cx, y1: bottom, x2: cx, y2: bottom + ah };
    }
    return null;
}
/**
 * Emit the edge segments of a stroked rectangle.
 *
 * Bottom/top edges are emitted only when the width reaches `MIN_LENGTH`,
 * left/right edges only when the height does. `w` and `h` may be negative;
 * the rectangle is normalised first so the edges land on the area the
 * rectangle really occupies.
 *
 * @param segments Output array; edges are appended in the order b, t, l, r.
 * @param id       Base id; each edge id is `${id}-b|-t|-l|-r`.
 * @param x        X of the rectangle's anchor corner.
 * @param y        Y of the rectangle's anchor corner.
 * @param w        Signed width.
 * @param h        Signed height.
 */
function pushStrokedRectEdges(segments, id, x, y, w, h) {
    const aw = Math.abs(w);
    const ah = Math.abs(h);
    // Minimum corner of the rectangle regardless of the sign of w/h.
    const left = w < 0 ? x + w : x;
    const bottom = h < 0 ? y + h : y;
    const right = left + aw;
    const top = bottom + ah;
    if (aw >= MIN_LENGTH) {
        segments.push({ id: `${id}-b`, x1: left, y1: bottom, x2: right, y2: bottom });
        segments.push({ id: `${id}-t`, x1: left, y1: top, x2: right, y2: top });
    }
    if (ah >= MIN_LENGTH) {
        segments.push({ id: `${id}-l`, x1: left, y1: bottom, x2: left, y2: top });
        segments.push({ id: `${id}-r`, x1: right, y1: bottom, x2: right, y2: top });
    }
}
/** Identity transform in PDF `[a, b, c, d, e, f]` order. */
const CTM_IDENTITY = [1, 0, 0, 1, 0, 0];
/**
 * Compose two affine matrices given as `[a, b, c, d, e, f]`.
 *
 * The result maps a point through `c` first and then through `p`
 * (result = parent × child).
 *
 * @param p Parent matrix.
 * @param c Child matrix.
 * @returns A new six-element matrix.
 */
function ctmConcat(p, c) {
    const pa = p[0], pb = p[1], pc = p[2], pd = p[3], pe = p[4], pf = p[5];
    const ca = c[0], cb = c[1], cc = c[2], cd = c[3], ce = c[4], cf = c[5];
    // Linear part.
    const a = pa * ca + pc * cb;
    const b = pb * ca + pd * cb;
    const cOut = pa * cc + pc * cd;
    const d = pb * cc + pd * cd;
    // Translation part: child's offset pushed through the parent, plus the parent's own offset.
    const e = pa * ce + pc * cf + pe;
    const f = pb * ce + pd * cf + pf;
    return [a, b, cOut, d, e, f];
}
/**
 * Transform the point (x, y) by the affine matrix `m = [a, b, c, d, e, f]`.
 *
 * @returns The transformed point as `[x', y']`.
 */
function ctmApply(m, x, y) {
    const tx = m[0] * x + m[2] * y + m[4];
    const ty = m[1] * x + m[3] * y + m[5];
    return [tx, ty];
}
184
+ // ---------------------------------------------------------------------------
185
+ // Content stream parsing
186
+ // ---------------------------------------------------------------------------
/**
 * Parse a PDF content stream and extract line segments from thin filled
 * rectangles (re+f), stroked rectangles (re+S), explicit lines (m/l+S) and
 * the combined fill+stroke operators (B, B*, b, b*).
 * Tracks the CTM via q/Q/cm operators so coordinates are in page space.
 *
 * Operands are read by looking back from each operator token, so an
 * operator that appears too early in the stream to have its operands is
 * ignored.
 *
 * @param raw        Decoded content stream text.
 * @param pageNumber Page number embedded in segment ids (`p<page>-s<n>`).
 * @returns Segments `{ id, x1, y1, x2, y2 }` in page space.
 */
function extractSegmentsFromContentStream(raw, pageNumber) {
    const segments = [];
    const tokens = tokenizeContentStream(raw);
    let idx = 0;
    let strokeWidth = 1.0;
    // Graphics state stack (q/Q): saves CTM + strokeWidth
    let ctm = [...CTM_IDENTITY];
    const stateStack = [];
    // State for path building (in user coordinates, pre-CTM)
    let curX = 0;
    let curY = 0;
    let pathStartX = 0;
    let pathStartY = 0;
    const pendingRects = [];
    const pendingLines = [];
    /** Numeric operand `back` tokens before the current operator. */
    const operand = (back) => Number(tokens[idx - back]);
    /** Id for the next segment, derived from how many have been emitted so far. */
    const nextId = () => `p${pageNumber}-s${segments.length}`;
    /** closePath: add a line back to the subpath start if the path is still open. */
    function closeSubpath() {
        if (curX !== pathStartX || curY !== pathStartY) {
            pendingLines.push({
                x1: curX,
                y1: curY,
                x2: pathStartX,
                y2: pathStartY,
            });
        }
        curX = pathStartX;
        curY = pathStartY;
    }
    /** Drop the path under construction. */
    function discardPath() {
        pendingRects.length = 0;
        pendingLines.length = 0;
    }
    /**
     * Paint the pending path once and then clear it. Fill and stroke are
     * handled in a single pass: clearing between the two (as separate fill
     * and stroke flushes would) loses the stroked borders of B/b paths.
     */
    function paintPath(fill, stroke) {
        // Thick strokes are treated as filled areas, not borders.
        const strokeVisible = stroke && strokeWidth <= MAX_THICKNESS;
        for (const r of pendingRects) {
            // Transform two opposite corners through the CTM and normalise.
            const [x0, y0] = ctmApply(ctm, r.x, r.y);
            const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
            const left = Math.min(x0, x1);
            const bottom = Math.min(y0, y1);
            const width = Math.abs(x1 - x0);
            const height = Math.abs(y1 - y0);
            const thin = fill
                ? thinRectToSegment(nextId(), left, bottom, width, height)
                : null;
            if (thin) {
                // A filled hairline is already one segment; don't also add its outline.
                segments.push(thin);
            }
            else if (strokeVisible) {
                pushStrokedRectEdges(segments, nextId(), left, bottom, width, height);
            }
        }
        if (strokeVisible) {
            for (const l of pendingLines) {
                const [lx1, ly1] = ctmApply(ctm, l.x1, l.y1);
                const [lx2, ly2] = ctmApply(ctm, l.x2, l.y2);
                const dx = Math.abs(lx2 - lx1);
                const dy = Math.abs(ly2 - ly1);
                // Only keep H/V lines
                if ((dx >= MIN_LENGTH && dy < 1) || (dy >= MIN_LENGTH && dx < 1)) {
                    segments.push({ id: nextId(), x1: lx1, y1: ly1, x2: lx2, y2: ly2 });
                }
            }
        }
        discardPath();
    }
    for (idx = 0; idx < tokens.length; idx++) {
        switch (tokens[idx]) {
            case "q":
                stateStack.push({ ctm: [...ctm], strokeWidth });
                break;
            case "Q": {
                // Unbalanced Q (empty stack) is ignored.
                const saved = stateStack.pop();
                if (saved) {
                    ctm = saved.ctm;
                    strokeWidth = saved.strokeWidth;
                }
                break;
            }
            case "cm":
                if (idx >= 6) {
                    ctm = ctmConcat(ctm, [
                        operand(6),
                        operand(5),
                        operand(4),
                        operand(3),
                        operand(2),
                        operand(1),
                    ]);
                }
                break;
            case "w":
                if (idx >= 1) {
                    // A zero or unparsable width keeps the previous width.
                    strokeWidth = operand(1) || strokeWidth;
                }
                break;
            case "re":
                if (idx >= 4) {
                    const x = operand(4);
                    const y = operand(3);
                    const w = operand(2);
                    const h = operand(1);
                    if (Number.isFinite(x + y + w + h)) {
                        pendingRects.push({ x, y, w, h });
                    }
                }
                break;
            case "m":
                if (idx >= 2) {
                    curX = operand(2);
                    curY = operand(1);
                    pathStartX = curX;
                    pathStartY = curY;
                }
                break;
            case "l":
                if (idx >= 2) {
                    const x2 = operand(2);
                    const y2 = operand(1);
                    pendingLines.push({ x1: curX, y1: curY, x2, y2 });
                    curX = x2;
                    curY = y2;
                }
                break;
            case "h":
                closeSubpath();
                break;
            case "f":
            case "F":
            case "f*":
                paintPath(true, false);
                break;
            case "S":
                paintPath(false, true);
                break;
            case "s":
                // closeStroke: implicit closePath
                closeSubpath();
                paintPath(false, true);
                break;
            case "B":
            case "B*":
                // fill + stroke combined
                paintPath(true, true);
                break;
            case "b":
            case "b*":
                // close, then fill + stroke combined
                closeSubpath();
                paintPath(true, true);
                break;
            case "n":
                // end path without painting — discard
                discardPath();
                break;
            default:
                break;
        }
    }
    return segments;
}
/**
 * Fast tokenizer for PDF content streams.
 *
 * Splits on whitespace and PDF delimiter characters, skipping comments,
 * literal strings `(...)`, hex strings `<...>` and the dictionary markers
 * `<<` / `>>`. Names keep their leading `/`; `[ ] { }` are emitted as
 * single-character tokens. Stray closing delimiters (`)` or a lone `>`) are
 * skipped, so malformed or binary input cannot stall the scan.
 *
 * @param raw Content stream text.
 * @returns Tokens in stream order.
 */
function tokenizeContentStream(raw) {
    const tokens = [];
    const len = raw.length;
    // ( ) < > [ ] { } / %
    const isDelimiter = (c) => c === 40 || c === 41 || c === 60 || c === 62 ||
        c === 91 || c === 93 || c === 123 || c === 125 ||
        c === 47 || c === 37;
    let i = 0;
    while (i < len) {
        const ch = raw.charCodeAt(i);
        // Skip whitespace
        if (ch <= 32) {
            i++;
            continue;
        }
        // Skip comments (a comment ends at CR or LF)
        if (ch === 37 /* % */) {
            while (i < len) {
                const c = raw.charCodeAt(i);
                if (c === 10 || c === 13)
                    break;
                i++;
            }
            continue;
        }
        // Skip string literals (...), honouring nesting and backslash escapes
        if (ch === 40 /* ( */) {
            let depth = 1;
            i++;
            while (i < len && depth > 0) {
                const c = raw.charCodeAt(i);
                if (c === 92 /* \ */) {
                    i++; // skip the escaped character as well
                }
                else if (c === 40) {
                    depth++;
                }
                else if (c === 41) {
                    depth--;
                }
                i++;
            }
            continue;
        }
        if (ch === 60 /* < */) {
            if (raw.charCodeAt(i + 1) === 60) {
                // Dict opener <<
                i += 2;
            }
            else {
                // Hex string <...>; also covers a '<' at the very end of input
                i++;
                while (i < len && raw.charCodeAt(i) !== 62)
                    i++;
                i++; // skip >
            }
            continue;
        }
        if (ch === 62 /* > */) {
            // Dict closer >>, or a stray '>' which is skipped so the scan always advances
            i += raw.charCodeAt(i + 1) === 62 ? 2 : 1;
            continue;
        }
        // Stray ')' outside any string: skip it so the scan always advances
        if (ch === 41 /* ) */) {
            i++;
            continue;
        }
        // Array / procedure delimiters are tokens of their own
        if (ch === 91 || ch === 93 || ch === 123 || ch === 125) {
            tokens.push(raw.charAt(i));
            i++;
            continue;
        }
        // Regular token (number, operator or /Name): the first character is
        // always consumed, then read until whitespace or a delimiter.
        const start = i;
        i++;
        while (i < len) {
            const c = raw.charCodeAt(i);
            if (c <= 32 || isDelimiter(c))
                break;
            i++;
        }
        tokens.push(raw.substring(start, i));
    }
    return tokens;
}
400
+ // ---------------------------------------------------------------------------
401
+ // Image region detection
402
+ // ---------------------------------------------------------------------------
/** Minimum area (pts²) for an image to be considered a diagram, not an icon. */
const MIN_IMAGE_AREA = 5000;
/**
 * Collect the image blocks of a page that are large enough to matter.
 *
 * @param stext      Parsed structured-text JSON; its `blocks` are scanned in order.
 * @param pageNumber Page number stored on each region and embedded in its id.
 * @param pageHeight Page height used to express the image's top edge in bottom-left coordinates.
 * @returns Regions `{ id, pageNumber, bbox, topY }`; `bbox` keeps the original
 *          top-left based values, `topY` is `pageHeight - bbox.y`.
 */
function extractImageRegions(stext, pageNumber, pageHeight) {
    return stext.blocks
        // Keep image blocks only, dropping those smaller than the icon threshold.
        .filter((block) => block.type === "image" &&
        !(block.bbox.w * block.bbox.h < MIN_IMAGE_AREA))
        .map((block, index) => {
        const { x, y, w, h } = block.bbox;
        return {
            id: `p${pageNumber}-img${index}`,
            pageNumber,
            bbox: { x, y, w, h },
            // Top edge flipped from top-left to bottom-left coordinates, for ordering.
            topY: pageHeight - y,
        };
    });
}
424
+ // ---------------------------------------------------------------------------
425
+ // Public API
426
+ // ---------------------------------------------------------------------------
/**
 * Rasterise one image region of a PDF into PNG data.
 *
 * The region's bbox is grown by a fixed padding on every side and drawn at
 * twice its size onto a white RGB pixmap via mupdf's DrawDevice.
 *
 * NOTE(review): mupdf is loaded here with a synchronous `require`, while
 * `extractPages` in this file uses `await import("mupdf")` — confirm that
 * `require` is available (and can load mupdf) in the module format this
 * file is shipped as.
 *
 * @param input  PDF data passed to `Document.openDocument`.
 * @param region Region with a 1-based `pageNumber` and a `bbox` of `{ x, y, w, h }`.
 * @returns Whatever `Pixmap.asPNG()` yields for the rendered crop.
 */
export function renderImageRegion(input, region) {
    const mupdf = require("mupdf");
    const PADDING = 10;
    const SCALE = 2;
    const page = mupdf.Document
        .openDocument(input, "application/pdf")
        .loadPage(region.pageNumber - 1);
    // Padded crop rectangle in page coordinates.
    const cropLeft = region.bbox.x - PADDING;
    const cropTop = region.bbox.y - PADDING;
    const cropWidth = region.bbox.w + 2 * PADDING;
    const cropHeight = region.bbox.h + 2 * PADDING;
    // Target bitmap, cleared to white.
    const pixelWidth = Math.round(cropWidth * SCALE);
    const pixelHeight = Math.round(cropHeight * SCALE);
    const pixmap = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, [0, 0, pixelWidth, pixelHeight], false);
    pixmap.clear(255);
    // Scale up and shift so the crop's corner lands on pixel (0, 0).
    const transform = [SCALE, 0, 0, SCALE, -cropLeft * SCALE, -cropTop * SCALE];
    const displayList = page.toDisplayList();
    const device = new mupdf.DrawDevice(transform, pixmap);
    displayList.run(device, mupdf.Matrix.identity);
    device.close();
    return pixmap.asPNG();
}
/**
 * Read the raw content stream text of a page.
 *
 * When `Contents` is an array, the streams are joined with a newline: the
 * split between streams may fall between two tokens, so joining them
 * back-to-back could fuse the last operator of one stream with the first
 * token of the next.
 *
 * @returns The decoded stream text, or `null` when the page has no `Contents`.
 */
function readContentStreamText(page) {
    const contents = page.getObject().get("Contents");
    if (!contents)
        return null;
    const decoder = new TextDecoder();
    if (!contents.isArray()) {
        return decoder.decode(contents.readStream().asUint8Array());
    }
    // Multiple content streams — decode each and join with whitespace
    const parts = [];
    const count = contents.length ?? 0;
    for (let j = 0; j < count; j++) {
        const stream = contents.get(j);
        if (stream?.readStream) {
            parts.push(decoder.decode(stream.readStream().asUint8Array()));
        }
    }
    return parts.join("\n");
}
/**
 * Extract text boxes, vector segments and image regions from all pages of a
 * PDF buffer.
 *
 * @param input PDF data passed to `Document.openDocument`.
 * @returns One `{ pageNumber, textBoxes, segments, images }` entry per page,
 *          with 1-based page numbers. `segments` is empty for a page whose
 *          content stream could not be read or parsed.
 * @throws Error if the `mupdf` package cannot be imported.
 */
export async function extractPages(input) {
    let mupdf;
    try {
        mupdf = await import("mupdf");
    }
    catch {
        throw new Error("PDF support requires 'mupdf'. Install it: npm install mupdf");
    }
    const doc = mupdf.Document.openDocument(input, "application/pdf");
    const pageCount = doc.countPages();
    const pages = [];
    for (let i = 0; i < pageCount; i++) {
        const pageNumber = i + 1;
        const page = doc.loadPage(i);
        const bounds = page.getBounds();
        const pageHeight = bounds[3] - bounds[1];
        // Single structured text pass with both flags
        const stext = JSON.parse(page.toStructuredText("preserve-whitespace,preserve-images").asJSON());
        // Extract text boxes and image regions from the same parse
        const textBoxes = extractTextBoxes(page, pageNumber, pageHeight, stext);
        const images = extractImageRegions(stext, pageNumber, pageHeight);
        // Extract vector segments from raw content stream
        let segments = [];
        try {
            const raw = readContentStreamText(page);
            if (raw !== null) {
                segments = extractSegmentsFromContentStream(raw, pageNumber);
            }
        }
        catch {
            // Deliberate best effort: content stream extraction failed —
            // proceed with text and images only
        }
        pages.push({ pageNumber, textBoxes, segments, images });
    }
    return pages;
}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Table grid detection from vector segments and text boxes.
3
+ *
4
+ * Ported from @oharato/pdf2md-ts with TypeScript types and without
5
+ * CJK-specific borderless table heuristics. The core algorithm:
6
+ *
7
+ * 1. Classify segments as horizontal or vertical lines
8
+ * 2. Group horizontal Y-lines into table groups (split by vertical gaps)
9
+ * 3. For each group:
10
+ * a. Full grid (H+V lines): build cells from grid intersections,
11
+ * place text via raycasting
12
+ * b. H-line only (no V lines): infer columns from text X positions
13
+ * 4. Prune empty rows/cols
14
+ *
15
+ * Coordinate system: PDF native (bottom-left origin, Y increases upward).
16
+ */
17
+ import type { Segment, TableGrid, TextBox } from "./types.js";
/**
 * Result of table grid detection for a single page.
 */
export interface GridResult {
    /** Table grids detected on the page. */
    grids: TableGrid[];
    /**
     * Ids reported as consumed by the detected grids.
     * NOTE(review): presumably the ids of text boxes placed into table
     * cells — confirm against the implementation.
     */
    consumedIds: string[];
}
22
/**
 * Detect all table grids on a single page from its text boxes and segments.
 *
 * @param pageNumber Number of the page the inputs belong to.
 * @param textBoxes  Text boxes found on that page.
 * @param segments   Vector line segments found on that page.
 * @returns The detected grids together with the list of consumed ids.
 */
export declare function resolveTableGrids(
    pageNumber: number,
    textBoxes: TextBox[],
    segments: Segment[],
): GridResult;