@oh-my-pi/pi-coding-agent 16.0.7 → 16.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/dist/cli.js +4752 -12462
- package/dist/types/cli/update-cli.d.ts +11 -0
- package/dist/types/debug/remote-debugger.d.ts +45 -0
- package/dist/types/internal-urls/docs-index.d.ts +19 -0
- package/dist/types/markit/converters/docx.d.ts +6 -0
- package/dist/types/markit/converters/epub.d.ts +15 -0
- package/dist/types/markit/converters/pdf/columns.d.ts +35 -0
- package/dist/types/markit/converters/pdf/extract.d.ts +10 -0
- package/dist/types/markit/converters/pdf/grid.d.ts +25 -0
- package/dist/types/markit/converters/pdf/headers.d.ts +24 -0
- package/dist/types/markit/converters/pdf/index.d.ts +6 -0
- package/dist/types/markit/converters/pdf/render.d.ts +24 -0
- package/dist/types/markit/converters/pdf/types.d.ts +75 -0
- package/dist/types/markit/converters/pptx.d.ts +57 -0
- package/dist/types/markit/converters/xlsx.d.ts +25 -0
- package/dist/types/markit/index.d.ts +2 -0
- package/dist/types/markit/registry.d.ts +16 -0
- package/dist/types/markit/types.d.ts +30 -0
- package/dist/types/session/agent-session.d.ts +7 -8
- package/dist/types/session/auth-storage.d.ts +3 -2
- package/dist/types/session/yield-queue.d.ts +3 -1
- package/dist/types/tools/browser/attach.d.ts +1 -1
- package/dist/types/utils/markit.d.ts +0 -8
- package/dist/types/utils/mupdf-wasm-embed.d.ts +1 -0
- package/dist/types/utils/turndown.d.ts +15 -0
- package/dist/types/utils/zip.d.ts +119 -0
- package/package.json +20 -18
- package/scripts/build-binary.ts +7 -3
- package/scripts/bundle-dist.ts +28 -12
- package/scripts/embed-mupdf-wasm.ts +67 -0
- package/scripts/generate-docs-index.ts +48 -32
- package/scripts/omp +1 -1
- package/src/advisor/__tests__/advisor.test.ts +83 -0
- package/src/advisor/runtime.ts +16 -1
- package/src/cli/auth-broker-cli.ts +1 -3
- package/src/cli/auth-gateway-cli.ts +2 -5
- package/src/cli/update-cli.ts +63 -3
- package/src/config/model-discovery.ts +20 -8
- package/src/config/models-config-schema.ts +8 -1
- package/src/debug/index.ts +44 -0
- package/src/debug/remote-debugger.ts +151 -0
- package/src/debug/report-bundle.ts +2 -1
- package/src/internal-urls/docs-index.generated.txt +2 -0
- package/src/internal-urls/docs-index.ts +102 -0
- package/src/internal-urls/omp-protocol.ts +10 -9
- package/src/markit/NOTICE +32 -0
- package/src/markit/converters/docx.ts +56 -0
- package/src/markit/converters/epub.ts +136 -0
- package/src/markit/converters/mammoth.d.ts +24 -0
- package/src/markit/converters/pdf/columns.ts +103 -0
- package/src/markit/converters/pdf/extract.ts +574 -0
- package/src/markit/converters/pdf/grid.ts +780 -0
- package/src/markit/converters/pdf/headers.ts +106 -0
- package/src/markit/converters/pdf/index.ts +146 -0
- package/src/markit/converters/pdf/render.ts +501 -0
- package/src/markit/converters/pdf/types.ts +84 -0
- package/src/markit/converters/pptx.ts +325 -0
- package/src/markit/converters/xlsx.ts +173 -0
- package/src/markit/index.ts +2 -0
- package/src/markit/registry.ts +59 -0
- package/src/markit/types.ts +35 -0
- package/src/modes/components/snapcompact-shape-preview-doc.md +14 -7
- package/src/modes/components/snapcompact-shape-preview.ts +2 -2
- package/src/modes/controllers/input-controller.ts +29 -8
- package/src/modes/interactive-mode.ts +26 -9
- package/src/prompts/advisor/system.md +1 -0
- package/src/sdk.ts +5 -9
- package/src/session/agent-session.ts +62 -40
- package/src/session/auth-storage.ts +2 -11
- package/src/session/yield-queue.ts +7 -1
- package/src/tools/browser/attach.ts +2 -2
- package/src/tools/fetch.ts +25 -60
- package/src/tools/read.ts +1 -1
- package/src/tools/search.ts +1 -6
- package/src/tools/write.ts +25 -65
- package/src/utils/markit.ts +25 -9
- package/src/utils/mupdf-wasm-embed.ts +12 -0
- package/src/utils/tools-manager.ts +2 -11
- package/src/utils/turndown.ts +83 -0
- package/src/{tools/archive-reader.ts → utils/zip.ts} +453 -83
- package/src/web/scrapers/types.ts +3 -46
- package/dist/types/internal-urls/docs-index.generated.d.ts +0 -2
- package/dist/types/tools/archive-reader.d.ts +0 -49
- package/src/internal-urls/docs-index.generated.ts +0 -120
|
@@ -0,0 +1,574 @@
|
|
|
1
|
+
// Adapted from markit-ai (MIT). See ../../NOTICE.
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* PDF content extraction using mupdf.
|
|
5
|
+
*
|
|
6
|
+
* Extracts text boxes (with position, font size, bold) and vector line
|
|
7
|
+
* segments (table borders) from each page. Uses mupdf's native WASM
|
|
8
|
+
* engine for fast parsing, and reads raw content streams for vector graphics.
|
|
9
|
+
*
|
|
10
|
+
* Coordinate system: PDF native (origin = bottom-left, Y increases upward).
|
|
11
|
+
*/
|
|
12
|
+
import type * as mupdf from "mupdf";
|
|
13
|
+
import type { ImageRegion, PageContent, Segment, TextBox } from "./types";
|
|
14
|
+
|
|
15
|
+
// mupdf instantiates its WASM module via a top-level await. A static
|
|
16
|
+
// `import * as mupdf` would pull that await into this module's init, which makes
|
|
17
|
+
// the whole bundled markit chunk's `__esm` init async — and bun's compiled
|
|
18
|
+
// bundler fails to await that init transitively through the `../markit` barrel,
|
|
19
|
+
// exposing the converter classes before their module-level consts initialize
|
|
20
|
+
// (e.g. `EXTENSIONS` reads as undefined). Importing mupdf lazily keeps the chunk
|
|
21
|
+
// init synchronous and also keeps the ~10MB wasm off non-PDF conversions.
|
|
22
|
+
/** Cached mupdf module namespace; undefined until `loadMupdf` has imported it. */
let mupdfModule: typeof mupdf | undefined;

/**
 * Resolve the mupdf module, importing it dynamically on first use and
 * handing back the cached namespace on later calls.
 *
 * @returns The mupdf module namespace.
 */
async function loadMupdf(): Promise<typeof mupdf> {
	if (mupdfModule) return mupdfModule;
	const loaded = await import("mupdf");
	mupdfModule = loaded;
	return loaded;
}
|
|
29
|
+
|
|
30
|
+
/** Bounding box as reported by mupdf structured-text JSON (top-left origin). */
interface StextBBox {
	/** Horizontal position of the box. */
	x: number;
	/** Vertical position of the box, measured from the top of the page. */
	y: number;
	/** Box width. */
	w: number;
	/** Box height. */
	h: number;
}

/** Font details that may accompany a structured-text line. */
interface StextFont {
	/** Font size, when reported. */
	size?: number;
	/** Font weight label, when reported (compared against "bold" by the extractor). */
	weight?: string;
	/** Font name, when reported. */
	name?: string;
}

/** One line inside a structured-text block. */
interface StextLine {
	/** Line text, when present. */
	text?: string;
	/** Font details for the line, when present. */
	font?: StextFont;
	/** Bounding box of the line. */
	bbox: StextBBox;
}

/** One block of a structured-text page; the extractor handles "text" and "image" blocks. */
interface StextBlock {
	/** Block kind discriminator. */
	type: string;
	/** Bounding box of the block. */
	bbox: StextBBox;
	/** Lines of the block (only read for "text" blocks). */
	lines: StextLine[];
}

/** Shape of the parsed structured-text JSON for one page. */
interface StructuredTextJSON {
	/** Blocks in the order they appear in the JSON. */
	blocks: StextBlock[];
}
|
|
63
|
+
|
|
64
|
+
/** A raw text fragment before merging into word/phrase boxes. */
interface RawTextItem {
	/** Fragment text. */
	text: string;
	/** Left edge of the fragment. */
	x: number;
	/** Vertical position used for same-line grouping and top-first ordering. */
	y: number;
	/** Horizontal extent; `x + width` is the right edge. */
	width: number;
	/** Vertical extent. */
	height: number;
	/** Font size of the fragment. */
	fontSize: number;
	/** Whether the fragment was detected as bold. */
	isBold: boolean;
}

// ---------------------------------------------------------------------------
// Text extraction
// ---------------------------------------------------------------------------
/** Y tolerance for merging text fragments on the same visual line. */
const SAME_LINE_Y_TOLERANCE = 2;
/** Max horizontal gap (pts) to merge adjacent fragments into one text box. */
const MAX_MERGE_GAP = 14;

/**
 * Merge horizontally adjacent raw text items on the same visual line into
 * word/phrase-level text boxes.
 *
 * Items are ordered top-first (larger `y` first), then left-to-right. An item
 * is folded into the current box when its `y` is within
 * `SAME_LINE_Y_TOLERANCE` of the box and its left edge is no further than
 * `MAX_MERGE_GAP` past the box's right edge. A gap wider than 1 pt inserts a
 * single space between the joined texts.
 *
 * The merged box takes the larger height and font size of its parts and is
 * bold if any part is bold. Its right edge never moves backwards: a fragment
 * that overlaps or sits inside the current box must not shrink it, otherwise
 * later fragments would be measured against a too-small box and split off.
 *
 * @param raws - Fragments to merge; the array itself is not mutated.
 * @returns New merged items (an empty array for empty input).
 */
function mergeIntoWords(raws: RawTextItem[]): RawTextItem[] {
	// Sort by Y descending (top-first in bottom-left coords), then X ascending.
	const sorted = [...raws].sort((a, b) => {
		const dy = b.y - a.y;
		return Math.abs(dy) > SAME_LINE_Y_TOLERANCE ? dy : a.x - b.x;
	});
	const [first, ...rest] = sorted;
	if (first === undefined) return [];

	const merged: RawTextItem[] = [];
	let cur: RawTextItem = { ...first };
	for (const next of rest) {
		const curRight = cur.x + cur.width;
		const sameY = Math.abs(next.y - cur.y) <= SAME_LINE_Y_TOLERANCE;
		const close = next.x <= curRight + MAX_MERGE_GAP;
		if (sameY && close) {
			const gap = next.x - curRight;
			cur.text += (gap > 1 ? " " : "") + next.text;
			// Extend to the furthest right edge seen so far; never shrink.
			cur.width = Math.max(curRight, next.x + next.width) - cur.x;
			cur.height = Math.max(cur.height, next.height);
			cur.fontSize = Math.max(cur.fontSize, next.fontSize);
			cur.isBold = cur.isBold || next.isBold;
		} else {
			merged.push(cur);
			cur = { ...next };
		}
	}
	merged.push(cur);
	return merged;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Build a page's text boxes from mupdf structured text.
 *
 * Structured-text boxes use a top-left origin; every line is flipped into
 * bottom-left (standard PDF) coordinates with `pageHeight` before the
 * fragments are merged into word/phrase boxes.
 *
 * @param page - Page to read structured text from when `stext` is not supplied.
 * @param pageNumber - Page number stamped on every box and embedded in box ids.
 * @param pageHeight - Page height used for the Y flip.
 * @param stext - Already-parsed structured text; parsed from `page` when omitted.
 * @returns Text boxes with non-empty text and bounds in bottom-left coordinates.
 */
function extractTextBoxes(
	page: mupdf.Page,
	pageNumber: number,
	pageHeight: number,
	stext?: StructuredTextJSON,
): TextBox[] {
	const structured = stext
		? stext
		: (JSON.parse(page.toStructuredText("preserve-whitespace").asJSON()) as StructuredTextJSON);

	const fragments: RawTextItem[] = structured.blocks
		.filter(block => block.type === "text")
		.flatMap(block => block.lines)
		.flatMap((line): RawTextItem[] => {
			const text = line.text?.trim();
			if (!text) return [];
			const font = line.font;
			const fontName = font?.name ?? "";
			const isBold =
				(font?.weight ?? "normal") === "bold" || /bold/i.test(fontName) || /Black|Heavy/i.test(fontName);
			// bbox is {x, y, w, h} in top-left coords; the bottom edge in
			// bottom-left coords is pageHeight - (y + h).
			const { x, y, w, h } = line.bbox;
			return [
				{
					text,
					x,
					y: pageHeight - (y + h),
					width: w,
					height: h,
					fontSize: font?.size ?? 0,
					isBold,
				},
			];
		});

	const boxes: TextBox[] = [];
	mergeIntoWords(fragments).forEach((word, index) => {
		const text = word.text.trim();
		if (text.length === 0) return;
		// The id keeps the index from the merged list, even when empty boxes are dropped.
		boxes.push({
			id: `p${pageNumber}-t${index}`,
			text,
			pageNumber,
			fontSize: word.fontSize,
			isBold: word.isBold,
			bounds: {
				left: word.x,
				right: word.x + word.width,
				bottom: word.y,
				top: word.y + word.height,
			},
		});
	});
	return boxes;
}
|
|
175
|
+
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
// Vector segment extraction from raw content stream
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
/** Minimum aspect ratio for a filled rect to be considered a line. */
const LINE_ASPECT_THRESHOLD = 6;
/** Minimum length (pts) for a segment to count. */
const MIN_LENGTH = 2;
/** Maximum thickness (pts) for a border line (filters out filled areas). */
const MAX_THICKNESS = 3;

/**
 * Convert a thin filled rectangle to a horizontal or vertical segment.
 *
 * The rectangle qualifies as a line when its long side is more than
 * `LINE_ASPECT_THRESHOLD` times its short side, at least `MIN_LENGTH` long,
 * and its short side is at most `MAX_THICKNESS`. The segment runs along the
 * centre of the short side.
 *
 * `w` and `h` may be negative, meaning `(x, y)` is the far corner; the
 * rectangle is anchored on its minimum corner so the segment always covers
 * the rectangle's real extent.
 *
 * @param id - Identifier assigned to the returned segment.
 * @param x - X of the rectangle's origin corner.
 * @param y - Y of the rectangle's origin corner.
 * @param w - Signed width.
 * @param h - Signed height.
 * @returns The segment, or null if the rect doesn't look like a border line.
 */
function thinRectToSegment(id: string, x: number, y: number, w: number, h: number): Segment | null {
	const aw = Math.abs(w);
	const ah = Math.abs(h);
	const left = Math.min(x, x + w);
	const bottom = Math.min(y, y + h);
	if (aw > ah * LINE_ASPECT_THRESHOLD && aw >= MIN_LENGTH && ah <= MAX_THICKNESS) {
		// Horizontal line
		const cy = bottom + ah / 2;
		return { id, x1: left, y1: cy, x2: left + aw, y2: cy };
	}
	if (ah > aw * LINE_ASPECT_THRESHOLD && ah >= MIN_LENGTH && aw <= MAX_THICKNESS) {
		// Vertical line
		const cx = left + aw / 2;
		return { id, x1: cx, y1: bottom, x2: cx, y2: bottom + ah };
	}
	return null;
}

/**
 * Emit the edge segments of a stroked rectangle.
 *
 * Bottom and top edges (`<id>-b`, `<id>-t`) are emitted when the width is at
 * least `MIN_LENGTH`; left and right edges (`<id>-l`, `<id>-r`) when the
 * height is at least `MIN_LENGTH`. As in `thinRectToSegment`, negative `w` or
 * `h` is normalized onto the minimum corner.
 *
 * @param segments - Output array; edges are appended in b, t, l, r order.
 * @param id - Base identifier for the emitted edges.
 * @param x - X of the rectangle's origin corner.
 * @param y - Y of the rectangle's origin corner.
 * @param w - Signed width.
 * @param h - Signed height.
 */
function pushStrokedRectEdges(segments: Segment[], id: string, x: number, y: number, w: number, h: number): void {
	const aw = Math.abs(w);
	const ah = Math.abs(h);
	const left = Math.min(x, x + w);
	const bottom = Math.min(y, y + h);
	const right = left + aw;
	const top = bottom + ah;
	if (aw >= MIN_LENGTH) {
		segments.push({ id: `${id}-b`, x1: left, y1: bottom, x2: right, y2: bottom });
		segments.push({ id: `${id}-t`, x1: left, y1: top, x2: right, y2: top });
	}
	if (ah >= MIN_LENGTH) {
		segments.push({ id: `${id}-l`, x1: left, y1: bottom, x2: left, y2: top });
		segments.push({ id: `${id}-r`, x1: right, y1: bottom, x2: right, y2: top });
	}
}
|
|
234
|
+
|
|
235
|
+
/** Identity transform, stored as the six coefficients `[a, b, c, d, e, f]`. */
const CTM_IDENTITY = [1, 0, 0, 1, 0, 0];

/**
 * Concatenate two affine matrices: result = parent × child.
 *
 * @param p - Parent matrix coefficients.
 * @param c - Child matrix coefficients.
 * @returns A new six-element coefficient array.
 */
function ctmConcat(p: number[], c: number[]): number[] {
	const [pa, pb, pc, pd, pe, pf] = p;
	const [ca, cb, cc, cd, ce, cf] = c;
	return [
		pa * ca + pc * cb,
		pb * ca + pd * cb,
		pa * cc + pc * cd,
		pb * cc + pd * cd,
		pa * ce + pc * cf + pe,
		pb * ce + pd * cf + pf,
	];
}

/**
 * Transform the point `(x, y)` by matrix `m`.
 *
 * @returns The transformed `[x, y]` pair.
 */
function ctmApply(m: number[], x: number, y: number): [number, number] {
	const [a, b, c, d, e, f] = m;
	return [a * x + c * y + e, b * x + d * y + f];
}
|
|
252
|
+
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
// Content stream parsing
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
/**
 * Parse a PDF content stream and extract line segments from thin filled
 * rectangles (re+f), stroked rectangles (re+S), and explicit lines (m/l+S).
 * Tracks the CTM via q/Q/cm operators so coordinates are in page space.
 *
 * Operands are read positionally from the tokens that precede each operator.
 * Stroked geometry is only kept while the current line width is at most
 * `MAX_THICKNESS`, and explicit lines are only kept when they are horizontal
 * or vertical after the CTM is applied.
 *
 * Fill-and-stroke operators (B, B*, b, b*) evaluate both paints against the
 * same path: a rectangle thin enough to be a rule yields that single rule,
 * any other rectangle contributes its stroked border.
 *
 * @param raw - Decoded content stream text.
 * @param pageNumber - Page number embedded in segment ids (`p<page>-s<n>`).
 * @returns Segments in the order their painting operators were encountered.
 */
function extractSegmentsFromContentStream(raw: string, pageNumber: number): Segment[] {
	const segments: Segment[] = [];
	const tokens = tokenizeContentStream(raw);
	/** Numeric value of the token at `at`; NaN when missing or not a number. */
	const num = (at: number): number => Number(tokens[at]);

	// Graphics state saved and restored by q/Q: CTM + stroke width.
	let strokeWidth = 1.0;
	let ctm = [...CTM_IDENTITY];
	const stateStack: Array<{ ctm: number[]; strokeWidth: number }> = [];

	// Path under construction, in user coordinates (pre-CTM).
	let curX = 0;
	let curY = 0;
	let pathStartX = 0;
	let pathStartY = 0;
	const pendingRects: Array<{ x: number; y: number; w: number; h: number }> = [];
	const pendingLines: Array<{ x1: number; y1: number; x2: number; y2: number }> = [];

	const nextId = (): string => `p${pageNumber}-s${segments.length}`;

	/** Drop the path under construction. */
	function discardPath(): void {
		pendingRects.length = 0;
		pendingLines.length = 0;
	}

	/** Add the closing line back to the subpath start, if the subpath is open. */
	function closeSubpath(): void {
		if (curX !== pathStartX || curY !== pathStartY) {
			pendingLines.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
		}
		curX = pathStartX;
		curY = pathStartY;
	}

	/** Emit segments for the pending path under the requested paints, then clear it. */
	function paintPath(fill: boolean, stroke: boolean): void {
		const thinStroke = stroke && strokeWidth <= MAX_THICKNESS;
		for (const r of pendingRects) {
			// Transform two opposite corners through the CTM and normalize.
			const [x0, y0] = ctmApply(ctm, r.x, r.y);
			const [x1, y1] = ctmApply(ctm, r.x + r.w, r.y + r.h);
			const x = Math.min(x0, x1);
			const y = Math.min(y0, y1);
			const w = Math.abs(x1 - x0);
			const h = Math.abs(y1 - y0);
			const rule = fill ? thinRectToSegment(nextId(), x, y, w, h) : null;
			if (rule) {
				segments.push(rule);
			} else if (thinStroke) {
				pushStrokedRectEdges(segments, nextId(), x, y, w, h);
			}
		}
		if (thinStroke) {
			for (const l of pendingLines) {
				const [lx1, ly1] = ctmApply(ctm, l.x1, l.y1);
				const [lx2, ly2] = ctmApply(ctm, l.x2, l.y2);
				const dx = Math.abs(lx2 - lx1);
				const dy = Math.abs(ly2 - ly1);
				// Only keep H/V lines
				if ((dx >= MIN_LENGTH && dy < 1) || (dy >= MIN_LENGTH && dx < 1)) {
					segments.push({ id: nextId(), x1: lx1, y1: ly1, x2: lx2, y2: ly2 });
				}
			}
		}
		discardPath();
	}

	for (let idx = 0; idx < tokens.length; idx++) {
		switch (tokens[idx]) {
			case "q":
				stateStack.push({ ctm: [...ctm], strokeWidth });
				break;
			case "Q": {
				// An unbalanced Q (empty stack) leaves the state untouched.
				const saved = stateStack.pop();
				if (saved) {
					ctm = saved.ctm;
					strokeWidth = saved.strokeWidth;
				}
				break;
			}
			case "cm":
				if (idx >= 6) {
					const m = [num(idx - 6), num(idx - 5), num(idx - 4), num(idx - 3), num(idx - 2), num(idx - 1)];
					// Skip a matrix with unreadable operands rather than turning
					// every later coordinate into NaN.
					if (m.every(Number.isFinite)) ctm = ctmConcat(ctm, m);
				}
				break;
			case "w":
				if (idx >= 1) {
					const width = num(idx - 1);
					// 0 is a legal width (thinnest line), so test for a readable
					// number instead of truthiness.
					if (Number.isFinite(width)) strokeWidth = width;
				}
				break;
			case "re":
				if (idx >= 4) {
					const x = num(idx - 4);
					const y = num(idx - 3);
					const w = num(idx - 2);
					const h = num(idx - 1);
					if (Number.isFinite(x + y + w + h)) pendingRects.push({ x, y, w, h });
				}
				break;
			case "m":
				if (idx >= 2) {
					curX = num(idx - 2);
					curY = num(idx - 1);
					pathStartX = curX;
					pathStartY = curY;
				}
				break;
			case "l":
				if (idx >= 2) {
					const x2 = num(idx - 2);
					const y2 = num(idx - 1);
					pendingLines.push({ x1: curX, y1: curY, x2, y2 });
					curX = x2;
					curY = y2;
				}
				break;
			case "h":
				closeSubpath();
				break;
			case "f":
			case "F":
			case "f*":
				paintPath(true, false);
				break;
			case "S":
				paintPath(false, true);
				break;
			case "s":
				// close + stroke
				closeSubpath();
				paintPath(false, true);
				break;
			case "B":
			case "B*":
				// fill + stroke on the same path
				paintPath(true, true);
				break;
			case "b":
			case "b*":
				// close, then fill + stroke
				closeSubpath();
				paintPath(true, true);
				break;
			case "n":
				// end path without painting — discard
				discardPath();
				break;
			default:
				break;
		}
	}
	return segments;
}
|
|
398
|
+
|
|
399
|
+
/**
 * Fast tokenizer for PDF content streams.
 *
 * Splits on whitespace and PDF delimiter characters, skipping comments,
 * literal strings `(...)`, hex strings `<...>` and dictionary delimiters
 * `<<` / `>>`. Names keep their leading `/`; array and procedure brackets
 * (`[`, `]`, `{`, `}`) are emitted as single-character tokens. Breaking on
 * every delimiter keeps operators recognisable in compactly written streams
 * such as `cm/Im0 Do`.
 *
 * Every iteration consumes at least one character, so unbalanced input (a
 * stray `)` or `>`, or a trailing `<`) is skipped instead of stalling the scan.
 *
 * @param raw - Decoded content stream text.
 * @returns Tokens in stream order.
 */
function tokenizeContentStream(raw: string): string[] {
	const tokens: string[] = [];
	const len = raw.length;
	let i = 0;
	while (i < len) {
		const ch = raw.charCodeAt(i);
		// Skip whitespace
		if (ch <= 32) {
			i++;
			continue;
		}
		// Skip comments (up to the end of the line, LF or CR)
		if (ch === 37 /* % */) {
			while (i < len) {
				const c = raw.charCodeAt(i);
				if (c === 10 || c === 13) break;
				i++;
			}
			continue;
		}
		// Skip string literals (...), honouring nesting and backslash escapes
		if (ch === 40 /* ( */) {
			let depth = 1;
			i++;
			while (i < len && depth > 0) {
				const c = raw.charCodeAt(i);
				if (c === 92 /* \ */) {
					i++; // skip the escaped character as well
				} else if (c === 40) {
					depth++;
				} else if (c === 41) {
					depth--;
				}
				i++;
			}
			continue;
		}
		if (ch === 60 /* < */) {
			if (raw.charCodeAt(i + 1) === 60) {
				// Dict delimiter <<
				i += 2;
			} else {
				// Hex string <...>; an unterminated one runs to the end of input
				i++;
				while (i < len && raw.charCodeAt(i) !== 62) i++;
				i++; // skip >
			}
			continue;
		}
		if (ch === 62 /* > */) {
			// Dict delimiter >>, or a stray > with no opening <
			i += raw.charCodeAt(i + 1) === 62 ? 2 : 1;
			continue;
		}
		if (ch === 41 /* ) */) {
			// Stray close paren outside any string
			i++;
			continue;
		}
		if (ch === 91 /* [ */ || ch === 93 /* ] */ || ch === 123 /* { */ || ch === 125 /* } */) {
			tokens.push(raw.charAt(i));
			i++;
			continue;
		}
		// Regular token (number, operator, or /Name): the first character always
		// belongs to it, then read until whitespace or any delimiter.
		const start = i;
		i++;
		while (i < len) {
			const c = raw.charCodeAt(i);
			if (
				c <= 32 ||
				c === 40 ||
				c === 41 ||
				c === 60 ||
				c === 62 ||
				c === 37 ||
				c === 47 /* / */ ||
				c === 91 ||
				c === 93 ||
				c === 123 ||
				c === 125
			) {
				break;
			}
			i++;
		}
		tokens.push(raw.substring(start, i));
	}
	return tokens;
}
|
|
465
|
+
|
|
466
|
+
// ---------------------------------------------------------------------------
|
|
467
|
+
// Image region detection
|
|
468
|
+
// ---------------------------------------------------------------------------
|
|
469
|
+
/** Minimum area (pts²) for an image to be considered a diagram, not an icon. */
const MIN_IMAGE_AREA = 5000;

/**
 * Collect the image blocks of a structured-text page that are large enough
 * to keep.
 *
 * @param stext - Parsed structured text for the page.
 * @param pageNumber - Page number stamped on every region and embedded in region ids.
 * @param pageHeight - Page height, used to express each region's top edge in
 *   bottom-left PDF coordinates (`topY`) for ordering.
 * @returns One region per kept image, numbered in block order; `bbox` stays in
 *   the structured text's top-left coordinates.
 */
function extractImageRegions(stext: StructuredTextJSON, pageNumber: number, pageHeight: number): ImageRegion[] {
	return stext.blocks
		.filter(({ type, bbox }) => type === "image" && !(bbox.w * bbox.h < MIN_IMAGE_AREA)) // drop tiny icons
		.map(
			({ bbox: { x, y, w, h } }, index): ImageRegion => ({
				id: `p${pageNumber}-img${index}`,
				pageNumber,
				bbox: { x, y, w, h },
				topY: pageHeight - y,
			}),
		);
}
|
|
489
|
+
|
|
490
|
+
// ---------------------------------------------------------------------------
|
|
491
|
+
// Public API
|
|
492
|
+
// ---------------------------------------------------------------------------
|
|
493
|
+
/**
 * Render an image region from a PDF page as a PNG buffer.
 *
 * The region's bounding box is grown by 10 pt on every side and drawn through
 * mupdf's DrawDevice at 2x resolution onto a white RGB pixmap.
 *
 * @param input - PDF file bytes.
 * @param region - Region to render; `pageNumber` is 1-based.
 * @returns PNG-encoded bytes of the cropped area.
 */
export async function renderImageRegion(input: Uint8Array, region: ImageRegion): Promise<Uint8Array> {
	const PAD = 10;
	const SCALE = 2;

	const m = await loadMupdf();
	const page = m.Document.openDocument(input, "application/pdf").loadPage(region.pageNumber - 1);

	// Padded crop box, in the same coordinates as region.bbox.
	const { x, y, w, h } = region.bbox;
	const cropX = x - PAD;
	const cropY = y - PAD;
	const cropW = w + 2 * PAD;
	const cropH = h + 2 * PAD;

	const pixmap = new m.Pixmap(
		m.ColorSpace.DeviceRGB,
		[0, 0, Math.round(cropW * SCALE), Math.round(cropH * SCALE)],
		false,
	);
	pixmap.clear(255);

	// Scale up and shift so the crop box's corner lands on the pixmap origin.
	const transform: mupdf.Matrix = [SCALE, 0, 0, SCALE, -cropX * SCALE, -cropY * SCALE];
	const displayList = page.toDisplayList();
	const device = new m.DrawDevice(transform, pixmap);
	displayList.run(device, m.Matrix.identity);
	device.close();
	return pixmap.asPNG();
}
|
|
518
|
+
|
|
519
|
+
/**
 * Read the raw content stream text of a PDF page.
 *
 * When the page's `Contents` entry is an array, each element's stream is
 * decoded on its own and the pieces are joined with a newline, so the last
 * token of one stream cannot fuse with the first token of the next (for
 * example `...S` followed by `q...` must not become `Sq`).
 *
 * Errors raised by the underlying mupdf calls propagate to the caller.
 *
 * @param page - Page to read; treated as a PDF page.
 * @returns The decoded stream text, or `undefined` when the page has no `Contents` entry.
 */
function readPageContentStream(page: mupdf.Page): string | undefined {
	const contents = (page as mupdf.PDFPage).getObject().get("Contents");
	if (!contents) return undefined;

	const decoder = new TextDecoder();
	if (!contents.isArray()) {
		return decoder.decode(contents.readStream().asUint8Array());
	}

	const parts: string[] = [];
	const count = contents.length ?? 0;
	for (let j = 0; j < count; j++) {
		const stream = contents.get(j);
		if (stream?.readStream) {
			parts.push(decoder.decode(stream.readStream().asUint8Array()));
		}
	}
	return parts.join("\n");
}

/**
 * Extract text boxes, vector segments and image regions from all pages of a
 * PDF buffer.
 *
 * Text boxes and image regions come from a single structured-text pass per
 * page. Vector segments come from the page's raw content stream; that step is
 * best-effort, and a page whose stream cannot be read or parsed is returned
 * with an empty `segments` list.
 *
 * @param input - PDF file bytes.
 * @returns One entry per page, in page order, with 1-based `pageNumber`.
 */
export async function extractPages(input: Uint8Array): Promise<PageContent[]> {
	const m = await loadMupdf();
	const doc = m.Document.openDocument(input, "application/pdf");
	const pageCount = doc.countPages();
	const pages: PageContent[] = [];
	for (let i = 0; i < pageCount; i++) {
		const pageNumber = i + 1;
		const page = doc.loadPage(i);
		const bounds = page.getBounds();
		const pageHeight = bounds[3] - bounds[1];
		// Single structured text pass with both flags
		const stext = JSON.parse(
			page.toStructuredText("preserve-whitespace,preserve-images").asJSON(),
		) as StructuredTextJSON;
		// Extract text boxes and image regions from the same parse
		const textBoxes = extractTextBoxes(page, pageNumber, pageHeight, stext);
		const images = extractImageRegions(stext, pageNumber, pageHeight);
		// Extract vector segments from raw content stream
		let segments: Segment[] = [];
		try {
			const raw = readPageContentStream(page);
			if (raw !== undefined) {
				segments = extractSegmentsFromContentStream(raw, pageNumber);
			}
		} catch {
			// Content stream extraction failed — proceed with text only
		}
		pages.push({ pageNumber, textBoxes, segments, images });
	}
	return pages;
}
|