@polotno/pdf-import 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ // Extract raw image streams (e.g. JPEG) directly from PDF binary,
2
+ // avoiding pdfjs's decode-to-pixels path which loses compression.
3
/**
 * Build a lookup from PDF object number to its raw JPEG stream data.
 *
 * Scans the file for generation-0 objects ("N 0 obj <<") whose dictionary is
 * followed by a stream, and keeps those that are image XObjects using the
 * DCTDecode (JPEG) filter and whose stream starts with the JPEG magic bytes.
 *
 * @param pdfBytes Raw bytes of the PDF file (a Uint8Array; it is sliced to
 *   produce each entry's data).
 * @returns Map from object number to `{ data, mimeType: 'image/jpeg' }`,
 *   where `data` is a copy of the stream bytes.
 */
export function buildJpegIndex(pdfBytes) {
    const result = new Map();
    // Latin-1 maps each byte to exactly one UTF-16 code unit, so every string
    // offset computed below is also a valid byte offset into pdfBytes.
    const latin1 = uint8ToLatin1(pdfBytes);
    // Find each "N 0 obj <<" and parse the dictionary with proper nesting
    const objStartRegex = /(\d+) 0 obj\s*<</g;
    let match;
    while ((match = objStartRegex.exec(latin1)) !== null) {
        const objNum = parseInt(match[1], 10);
        const dictStart = match.index + match[0].length;
        // Walk forward handling nested << >> to find the closing >>
        let depth = 1;
        let pos = dictStart;
        while (depth > 0 && pos < latin1.length - 1) {
            if (latin1[pos] === '<' && latin1[pos + 1] === '<') {
                depth++;
                pos += 2;
            }
            else if (latin1[pos] === '>' && latin1[pos + 1] === '>') {
                depth--;
                pos += 2;
            }
            else {
                pos++;
            }
        }
        // Ran off the end of the file without closing the dictionary: there is
        // no well-formed dictionary (or stream) to inspect.
        if (depth > 0)
            continue;
        const dict = latin1.substring(dictStart, pos - 2);
        // Check if "stream" follows the closing >>
        const afterDict = latin1.substring(pos, pos + 20);
        const streamMatch = afterDict.match(/^\s*stream\r?\n/);
        if (!streamMatch)
            continue;
        // Only care about image XObjects with DCTDecode filter. White space
        // between a key and a name value is optional in PDF, so accept both
        // "/Subtype /Image" and "/Subtype/Image".
        if (!/\/Subtype\s*\/Image\b/.test(dict))
            continue;
        if (!dict.includes('/DCTDecode'))
            continue;
        // Extract /Length value (may be a direct number or indirect ref "N 0 R")
        const lengthMatch = dict.match(/\/Length\s+(\d+)(\s+0\s+R)?/);
        if (!lengthMatch)
            continue;
        let length;
        if (lengthMatch[2]) {
            // Indirect reference — resolve by finding the referenced object.
            // The object number must not be preceded by another digit,
            // otherwise a reference to object 5 could resolve against
            // "15 0 obj ...".
            const refObjNum = lengthMatch[1];
            const refMatch = latin1.match(new RegExp('(?:^|[^0-9])' + refObjNum + '\\s+0\\s+obj\\s+(\\d+)'));
            if (!refMatch)
                continue;
            length = parseInt(refMatch[1], 10);
        }
        else {
            length = parseInt(lengthMatch[1], 10);
        }
        // Stream data starts right after "stream\r\n" or "stream\n"
        const streamStart = pos + streamMatch[0].length;
        const streamData = pdfBytes.slice(streamStart, streamStart + length);
        // Verify JPEG magic bytes (FFD8); this also rejects streams where
        // DCTDecode is not the outermost encoding of the raw bytes.
        if (streamData[0] !== 0xff || streamData[1] !== 0xd8)
            continue;
        result.set(objNum, {
            data: streamData,
            mimeType: 'image/jpeg',
        });
    }
    return result;
}
72
/**
 * Parse a pdfjs ref string like "44R" to extract the object number.
 *
 * @param ref Candidate ref string. Only the exact form "<digits>R" is
 *   accepted; anything else (including trailing characters after the "R",
 *   or a non-string value such as undefined/null) yields null.
 * @returns The object number, or null when `ref` is not in that form.
 */
export function parseRef(ref) {
    // Callers may hand over a missing ref; report "no ref" instead of throwing.
    if (typeof ref !== 'string')
        return null;
    const m = /^(\d+)R$/.exec(ref);
    return m ? parseInt(m[1], 10) : null;
}
79
+ function uint8ToLatin1(bytes) {
80
+ if (typeof Buffer !== 'undefined') {
81
+ return Buffer.from(bytes).toString('latin1');
82
+ }
83
+ const chunks = [];
84
+ const chunkSize = 65536;
85
+ for (let i = 0; i < bytes.length; i += chunkSize) {
86
+ const slice = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
87
+ chunks.push(String.fromCharCode(...slice));
88
+ }
89
+ return chunks.join('');
90
+ }
91
+ //# sourceMappingURL=pdf-image-extractor.js.map
@@ -0,0 +1,23 @@
1
+ import type { ExtractedDrawing, DrawingItem } from './operator-list.js';
2
/**
 * Render a single drawing as a standalone SVG string.
 * Returns the SVG together with the top-left corner (x, y) and size of the
 * drawing's stroke-expanded bounds, which also define the SVG viewBox.
 * Returns null when the bounds are degenerate or when a filled drawing covers
 * at least 90% of both page dimensions (treated as a page background).
 */
+ export declare function drawingToSvg(drawing: ExtractedDrawing, pageWidth: number, pageHeight: number): {
3
+ svg: string;
4
+ x: number;
5
+ y: number;
6
+ width: number;
7
+ height: number;
8
+ } | null;
9
/**
 * Render several drawings into one SVG, clipped by the clip path of the
 * first drawing. Position and size come from the first drawing's clipRect.
 * Returns null when the list is empty, when the first drawing has no
 * clipRect/clipPath, or when the clip rectangle is narrower or shorter
 * than 0.1.
 */
+ export declare function clippedDrawingsToSvg(drawings: ExtractedDrawing[]): {
10
+ svg: string;
11
+ x: number;
12
+ y: number;
13
+ width: number;
14
+ height: number;
15
+ } | null;
16
+ /**
17
+ * Convert clip path items to an SVG string for use as clipSrc.
18
+ * Coordinates are made relative to the image's position (imgX, imgY).
19
+ * The viewBox matches the image dimensions so the clip aligns properly.
+ * The clip shape is emitted as a single white-filled, closed path.
+ * Returns an empty string when `items` is empty.
20
+ */
21
+ export declare function clipPathToSvg(items: DrawingItem[], imgX: number, imgY: number, imageWidth: number, imageHeight: number): string;
22
/** Encode an SVG string (as UTF-8) into a `data:image/svg+xml;base64,...` URI. */
+ export declare function svgToDataUri(svg: string): string;
23
+ //# sourceMappingURL=svg-builder.d.ts.map
@@ -0,0 +1,213 @@
1
+ import { rgbTupleToHex } from './color-utils.js';
2
/**
 * Work out the box a drawing occupies, including room for its stroke.
 *
 * @param drawing Object with `rect` ([x0, y0, x1, y1]), `stroke`,
 *   `strokeWidth` and `fill`.
 * @param pageWidth Page width; used only for the background check.
 * @param pageHeight Page height; used only for the background check.
 * @returns `{ x0, y0, width, height }`, or null when the box is smaller than
 *   0.1 in both dimensions or is a filled shape spanning at least 90% of the
 *   page in both dimensions.
 */
function computeDrawingBounds(drawing, pageWidth, pageHeight) {
    // A stroked drawing without a usable width is treated as 1 unit wide.
    const strokeWidth = drawing.stroke ? drawing.strokeWidth || 1 : 0;
    const hasStroke = strokeWidth > 0;
    // Stroked shapes are kept at least 2 units across so thin lines stay visible.
    const minExtent = Math.max(strokeWidth, 2);
    const fitAxis = (lo, hi) => {
        if (hasStroke) {
            // Grow by half the stroke on each side so the stroke isn't clipped.
            const halfStroke = strokeWidth / 2;
            lo -= halfStroke;
            hi += halfStroke;
            if (hi - lo < minExtent) {
                // Still too thin: recentre a minimum-sized span on the midpoint.
                const centre = (lo + hi) / 2;
                lo = centre - minExtent / 2;
                hi = centre + minExtent / 2;
            }
        }
        return [lo, hi];
    };
    const [left, top, right, bottom] = drawing.rect;
    const [x0, x1] = fitAxis(left, right);
    const [y0, y1] = fitAxis(top, bottom);
    const width = x1 - x0;
    const height = y1 - y0;
    if (width < 0.1 && height < 0.1) {
        return null;
    }
    // Filled shapes covering (nearly) the whole page are backgrounds: skip them.
    const coversPage = drawing.fill !== null &&
        pageWidth > 0 &&
        pageHeight > 0 &&
        width >= pageWidth * 0.9 &&
        height >= pageHeight * 0.9;
    return coversPage ? null : { x0, y0, width, height };
}
39
/**
 * Turn drawing items into the `d` attribute of an SVG path.
 *
 * Item kinds handled: 're' (rectangle: x, y, w, h), 'm' (move: x, y),
 * 'l' (line: x1, y1 -> x2, y2) and 'c' (cubic curve: x1, y1, two control
 * points, x2, y2). Other kinds are ignored.
 *
 * @param items Drawing items in paint order.
 * @param originX Subtracted from every absolute x coordinate.
 * @param originY Subtracted from every absolute y coordinate.
 * @param shouldCloseFill When true (and something was drawn), the last
 *   subpath is joined back to its start and closed.
 * @param closePath When `shouldCloseFill` did not apply, appends a bare "Z".
 * @returns Path commands joined by single spaces.
 */
function buildPathData(items, originX, originY, shouldCloseFill, closePath) {
    const TOLERANCE = 0.5;
    const commands = [];
    let pen = null; // current point, absolute coordinates
    let subpathStart = null; // where the current subpath began
    const rel = (px, py) => `${px - originX} ${py - originY}`;
    // True when (px, py) is not where the pen currently is (within tolerance).
    const isDetached = (px, py) => pen === null ||
        Math.abs(px - pen[0]) > TOLERANCE ||
        Math.abs(py - pen[1]) > TOLERANCE;
    const moveTo = (px, py) => {
        commands.push(`M ${rel(px, py)}`);
        pen = [px, py];
        subpathStart = [px, py];
    };
    // Segments carry their own start point; begin a new subpath if it doesn't
    // continue from the pen position.
    const startSegmentAt = (px, py) => {
        if (isDetached(px, py)) {
            moveTo(px, py);
        }
    };
    for (const item of items) {
        switch (item.kind) {
            case 're':
                commands.push(`M ${rel(item.x, item.y)} h ${item.w} v ${item.h} h ${-item.w} Z`);
                pen = [item.x, item.y];
                subpathStart = pen;
                break;
            case 'm':
                moveTo(item.x, item.y);
                break;
            case 'l':
                startSegmentAt(item.x1, item.y1);
                commands.push(`L ${rel(item.x2, item.y2)}`);
                pen = [item.x2, item.y2];
                break;
            case 'c':
                startSegmentAt(item.x1, item.y1);
                commands.push(`C ${rel(item.cpx1, item.cpy1)} ${rel(item.cpx2, item.cpy2)} ${rel(item.x2, item.y2)}`);
                pen = [item.x2, item.y2];
                break;
            default:
                break;
        }
    }
    if (shouldCloseFill && subpathStart && pen) {
        // Draw the closing edge explicitly when the pen is away from the start.
        if (isDetached(subpathStart[0], subpathStart[1])) {
            commands.push(`L ${rel(subpathStart[0], subpathStart[1])}`);
        }
        commands.push('Z');
    }
    else if (closePath) {
        commands.push('Z');
    }
    return commands.join(' ');
}
89
/**
 * Render one extracted drawing as a standalone SVG document.
 *
 * @param drawing Drawing with `rect`, `items`, `fill`, `stroke`,
 *   `strokeWidth`, `closePath`, `evenOdd` and an optional `gradient`
 *   (`type`, `stops` of `{ offset, color }`, and x1/y1/x2/y2 for linear).
 * @param pageWidth Page width, forwarded to the bounds computation.
 * @param pageHeight Page height, forwarded to the bounds computation.
 * @returns `{ svg, x, y, width, height }` where x/y is the top-left corner of
 *   the drawing's bounds and width/height match the SVG viewBox, or null when
 *   the bounds computation rejects the drawing.
 */
export function drawingToSvg(drawing, pageWidth, pageHeight) {
    const bounds = computeDrawingBounds(drawing, pageWidth, pageHeight);
    if (!bounds)
        return null;
    const { x0, y0, width: w, height: h } = bounds;
    const fillHex = drawing.fill ? rgbTupleToHex(...drawing.fill) : 'none';
    const dAttr = buildPathData(drawing.items, x0, y0, fillHex !== 'none', drawing.closePath);
    // The bounds above were expanded assuming a missing/zero stroke width means
    // 1; emit the same effective width so the rendered stroke matches the
    // bounds (and never becomes stroke-width="undefined" or an invisible "0").
    const strokeAttr = drawing.stroke
        ? `stroke="${rgbTupleToHex(...drawing.stroke)}" stroke-width="${drawing.strokeWidth || 1}"`
        : 'stroke="none"';
    const fillRule = drawing.evenOdd ? ' fill-rule="evenodd"' : '';
    // Build SVG with optional gradient definition
    let defs = '';
    let fillValue = fillHex;
    if (drawing.gradient && drawing.gradient.stops.length > 0) {
        const g = drawing.gradient;
        // Detect grayscale gradients (luminosity masks used as overlays in PDF).
        // Convert them to alpha-based gradients: luminosity → stop-opacity, color → black.
        const isGrayscale = g.stops.every((s) => {
            const m = s.color.match(/^#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/i);
            if (!m)
                return false;
            const r = parseInt(m[1], 16), gv = parseInt(m[2], 16), b = parseInt(m[3], 16);
            // Allow small channel differences from rounding.
            return Math.abs(r - gv) <= 2 && Math.abs(gv - b) <= 2;
        });
        const stopsStr = g.stops
            .map((s) => {
            if (isGrayscale) {
                // Channels are (nearly) equal, so the red channel is the luminosity.
                const m = s.color.match(/^#([0-9a-f]{2})/i);
                const luminosity = m ? parseInt(m[1], 16) / 255 : 0;
                return `<stop offset="${s.offset}" stop-color="#000000" stop-opacity="${luminosity}"/>`;
            }
            return `<stop offset="${s.offset}" stop-color="${s.color}"/>`;
        })
            .join('');
        if (g.type === 'linear') {
            // Use objectBoundingBox units — only the sign of the gradient
            // direction on each axis is kept and mapped onto the 0-1 box.
            const dx = (g.x2 ?? 0) - (g.x1 ?? 0);
            const dy = (g.y2 ?? 0) - (g.y1 ?? 0);
            const len = Math.sqrt(dx * dx + dy * dy);
            let nx1 = 0, ny1 = 0.5, nx2 = 1, ny2 = 0.5; // default horizontal
            if (len > 0) {
                const ndx = dx / len;
                const ndy = dy / len;
                // Map direction to bounding box: 0,0 is top-left, 1,1 is bottom-right
                nx1 = ndx < 0 ? 1 : 0;
                ny1 = ndy < 0 ? 1 : 0;
                nx2 = ndx < 0 ? 0 : (ndx > 0 ? 1 : nx1);
                ny2 = ndy < 0 ? 0 : (ndy > 0 ? 1 : ny1);
            }
            defs = `<defs><linearGradient id="g" x1="${nx1}" y1="${ny1}" x2="${nx2}" y2="${ny2}">${stopsStr}</linearGradient></defs>`;
            fillValue = 'url(#g)';
        }
        else {
            // Radial gradient — use objectBoundingBox with center at 0.5,0.5
            defs = `<defs><radialGradient id="g" cx="0.5" cy="0.5" r="0.5">${stopsStr}</radialGradient></defs>`;
            fillValue = 'url(#g)';
        }
    }
    const svgStr = `<svg viewBox="0 0 ${w} ${h}" xmlns="http://www.w3.org/2000/svg">${defs}<path d="${dAttr}" fill="${fillValue}" ${strokeAttr}${fillRule}/></svg>`;
    return { svg: svgStr, x: x0, y: y0, width: w, height: h };
}
156
/**
 * Render a group of drawings into a single SVG clipped by the clip path of
 * the first drawing.
 *
 * @param drawings Drawings to render; the first one supplies `clipRect`
 *   ([x0, y0, x1, y1]), `clipPath` (path items) and the clip rule
 *   (`evenOdd`). All paths are made relative to the clipRect origin.
 * @returns `{ svg, x, y, width, height }` positioned at the clip rectangle,
 *   or null when there are no drawings, the first drawing has no
 *   clipRect/clipPath, or the clip rectangle is under 0.1 wide or tall.
 */
export function clippedDrawingsToSvg(drawings) {
    const first = drawings[0];
    if (drawings.length === 0 || !first.clipRect || !first.clipPath) {
        return null;
    }
    const [x0, y0, x1, y1] = first.clipRect;
    const width = x1 - x0;
    const height = y1 - y0;
    if (width < 0.1 || height < 0.1)
        return null;
    const clipD = buildPathData(first.clipPath, x0, y0, true, true);
    const clipRule = first.evenOdd ? ' clip-rule="evenodd"' : '';
    const body = drawings
        .map((drawing) => {
        const fillHex = drawing.fill ? rgbTupleToHex(...drawing.fill) : 'none';
        const dAttr = buildPathData(drawing.items, x0, y0, fillHex !== 'none', drawing.closePath);
        // A stroked drawing with a missing or zero width still needs a visible
        // line: fall back to 1 rather than emitting stroke-width="undefined"
        // or "0" (an invisible stroke in SVG).
        const strokeAttr = drawing.stroke
            ? `stroke="${rgbTupleToHex(...drawing.stroke)}" stroke-width="${drawing.strokeWidth || 1}"`
            : 'stroke="none"';
        const fillRule = drawing.evenOdd ? ' fill-rule="evenodd"' : '';
        return `<path d="${dAttr}" fill="${fillHex}" ${strokeAttr}${fillRule}/>`;
    })
        .join('');
    const svgStr = `<svg viewBox="0 0 ${width} ${height}" xmlns="http://www.w3.org/2000/svg"><defs><clipPath id="clip"><path d="${clipD}"${clipRule}/></clipPath></defs><g clip-path="url(#clip)">${body}</g></svg>`;
    return { svg: svgStr, x: x0, y: y0, width, height };
}
186
/**
 * Build the SVG used as an image's clipSrc from clip path items.
 * Path coordinates are expressed relative to (imgX, imgY) and the viewBox
 * spans imageWidth x imageHeight, so the clip lines up with the image.
 * An empty item list produces an empty string.
 */
export function clipPathToSvg(items, imgX, imgY, imageWidth, imageHeight) {
    if (items.length === 0) {
        return '';
    }
    // The clip shape is always closed and filled.
    const pathData = buildPathData(items, imgX, imgY, true, true);
    const viewBox = `0 0 ${imageWidth} ${imageHeight}`;
    return `<svg viewBox="${viewBox}" xmlns="http://www.w3.org/2000/svg"><path d="${pathData}" fill="white"/></svg>`;
}
197
/**
 * Wrap an SVG string in a base64 `data:image/svg+xml` URI.
 * The text is encoded as UTF-8 before base64 encoding, using Buffer when it
 * exists and TextEncoder + btoa otherwise.
 */
export function svgToDataUri(svg) {
    const PREFIX = 'data:image/svg+xml;base64,';
    if (typeof Buffer !== 'undefined') {
        return PREFIX + Buffer.from(svg, 'utf-8').toString('base64');
    }
    // Browser: btoa only accepts code points 0-255, so feed it one character
    // per UTF-8 byte.
    const utf8 = new TextEncoder().encode(svg);
    const binary = Array.from(utf8, (byte) => String.fromCharCode(byte)).join('');
    return PREFIX + btoa(binary);
}
213
+ //# sourceMappingURL=svg-builder.js.map
@@ -0,0 +1,6 @@
1
+ import type { TextBlock, TextSpan } from './text-types.js';
2
/**
 * Group spans by `blockNo` (ascending), split each group into columns where
 * it looks like a table row, and return one TextBlock per column with its
 * spans (sorted by line, then x) and bounding box.
 */
+ export declare function groupSpansByBlock(spans: TextSpan[]): TextBlock[];
3
/**
 * Classify a block's horizontal alignment from the left/right edges of its
 * lines. Returns one of 'left', 'center', 'right' or 'justify'
 * ('left' for an empty block).
 */
+ export declare function detectAlignment(blockSpans: TextSpan[], pageWidth: number, leftMargin: number, rightMargin: number): string;
4
/**
 * Estimate the text area as [left, right]: the 10th percentile of per-block
 * left edges and the 90th percentile of per-block right edges.
 * Returns [0, 0] when there are no spans.
 */
+ export declare function estimatePageMargins(spans: TextSpan[]): [number, number];
5
/**
 * Line-height multiplier for a block: median vertical gap between
 * consecutive lines divided by the dominant font size, clamped to
 * [0.8, 3.0]. Falls back to the default line height when the block has
 * fewer than two lines or no positive gaps.
 */
+ export declare function computeLineHeight(blockSpans: TextSpan[]): number;
6
+ //# sourceMappingURL=text-blocks.d.ts.map
@@ -0,0 +1,294 @@
1
+ import { DEFAULT_LINE_HEIGHT } from './constants.js';
2
/**
 * Group spans into blocks.
 *
 * Spans are bucketed by `blockNo`, buckets are visited in ascending block
 * number, and each bucket is split into columns. Every column becomes one
 * block: its spans sorted by line number then x, plus the bounding box
 * (x, y, width, height) of those spans.
 */
export function groupSpansByBlock(spans) {
    const byBlock = new Map();
    for (const span of spans) {
        let bucket = byBlock.get(span.blockNo);
        if (!bucket) {
            bucket = [];
            byBlock.set(span.blockNo, bucket);
        }
        bucket.push(span);
    }
    const orderedBlockNos = [...byBlock.keys()].sort((a, b) => a - b);
    const blocks = [];
    for (const blockNo of orderedBlockNos) {
        for (const colSpans of splitBlockByColumn(byBlock.get(blockNo))) {
            // Reading order inside a column: top line first, left to right.
            colSpans.sort((a, b) => a.lineNo - b.lineNo || a.x - b.x);
            let left = Infinity;
            let top = Infinity;
            let right = -Infinity;
            let bottom = -Infinity;
            for (const s of colSpans) {
                left = Math.min(left, s.x);
                top = Math.min(top, s.y);
                right = Math.max(right, s.x + s.width);
                bottom = Math.max(bottom, s.y + s.height);
            }
            blocks.push({
                spans: colSpans,
                x: left,
                y: top,
                width: right - left,
                height: bottom - top,
            });
        }
    }
    return blocks;
}
30
/**
 * Split a block into columns if it looks like a table row.
 * Two strategies, tried in order:
 *  1. Gap-based: when at least two lines hold several spans, look for gaps
 *     that begin at the same X (2px clustering tolerance) on every such line
 *     and split at those positions.
 *  2. X-range: cluster spans into non-overlapping X-range groups separated by
 *     clear gaps, and split when one of the acceptance rules below holds.
 * Returns an array of span arrays; `[spans]` when no split applies.
 */
function splitBlockByColumn(spans) {
    if (spans.length <= 1)
        return [spans];
    const byLine = new Map();
    for (const span of spans) {
        const bucket = byLine.get(span.lineNo);
        if (bucket) {
            bucket.push(span);
        }
        else {
            byLine.set(span.lineNo, [span]);
        }
    }
    // Strategy 1: gap-based detection (for rows where columns share lines)
    const sharedLines = [...byLine.values()].filter((lineSpans) => lineSpans.length > 1);
    if (sharedLines.length >= 2) {
        // X positions at which a span starts after a noticeable gap.
        const gapStarts = [];
        for (const lineSpans of sharedLines) {
            const ordered = [...lineSpans].sort((a, b) => a.x - b.x);
            ordered.forEach((span, i) => {
                if (i === 0)
                    return;
                const prev = ordered[i - 1];
                const gap = span.x - (prev.x + prev.width);
                if (gap > span.fontSize * 0.3) {
                    gapStarts.push(span.x);
                }
            });
        }
        if (gapStarts.length > 0) {
            gapStarts.sort((a, b) => a - b);
            // Merge positions within 2px of the running cluster mean.
            const clusters = [];
            for (const gx of gapStarts) {
                const tail = clusters[clusters.length - 1];
                if (tail && gx - tail.x < 2) {
                    tail.x = (tail.x * tail.count + gx) / (tail.count + 1);
                    tail.count++;
                }
                else {
                    clusters.push({ x: gx, count: 1 });
                }
            }
            // Keep only gaps seen at least once per multi-span line.
            const splitAt = clusters
                .filter((c) => c.count >= sharedLines.length)
                .map((c) => c.x);
            if (splitAt.length > 0) {
                return splitSpansAtPositions(spans, splitAt);
            }
        }
    }
    // Strategy 2: X-range clustering (for rows where columns DON'T share lines)
    const avgFontSize = spans.reduce((sum, s) => sum + s.fontSize, 0) / spans.length;
    const xGroups = [];
    for (const s of [...spans].sort((a, b) => a.x - b.x)) {
        const end = s.x + s.width;
        const tail = xGroups[xGroups.length - 1];
        if (tail && s.x < tail.maxEnd + avgFontSize * 0.5) {
            // Overlaps or nearly touches the current group
            tail.maxEnd = Math.max(tail.maxEnd, end);
            tail.spans.push(s);
        }
        else {
            xGroups.push({ minX: s.x, maxEnd: end, spans: [s] });
        }
    }
    if (xGroups.length < 2)
        return [spans];
    // Horizontal gap between each group and the one before it.
    const gaps = xGroups.slice(1).map((group, i) => group.minX - xGroups[i].maxEnd);
    // Require a clear gap (at least fontSize * 0.5) between all groups.
    if (gaps.some((gap) => gap < avgFontSize * 0.5))
        return [spans];
    const columns = () => xGroups.map((g) => g.spans);
    // Single-line labels can be widely separated on the same row (diagram
    // labels, navigation items): split when every gap is much larger than
    // normal word spacing.
    if (byLine.size === 1 && gaps.every((gap) => gap >= avgFontSize * 3)) {
        return columns();
    }
    // Strong signal: every X-range has repeated content across the block.
    if (xGroups.every((g) => g.spans.length >= 2)) {
        return columns();
    }
    // Diagram labels often mix stacked multi-line labels with nearby
    // single-span labels on the same visual row. Narrow fallback for small-font,
    // multi-line blocks so unrelated labels don't get merged into one text box.
    if (byLine.size >= 2 && avgFontSize <= 8 && xGroups.length >= 3) {
        return columns();
    }
    return [spans];
}
141
/**
 * Distribute spans into columns delimited by the given X split positions.
 *
 * A span goes to the column after the last split position it reaches, where
 * "reaches" means its x is at or beyond the position minus a 5-unit
 * tolerance; spans reaching none go to the first column. Empty columns are
 * dropped from the result.
 */
function splitSpansAtPositions(spans, splitPositions) {
    const columns = [];
    for (let c = 0; c <= splitPositions.length; c++) {
        columns.push([]);
    }
    for (const span of spans) {
        // Scan from the right for the last boundary this span has reached.
        let boundary = splitPositions.length - 1;
        while (boundary >= 0 && !(span.x >= splitPositions[boundary] - 5)) {
            boundary--;
        }
        columns[boundary + 1].push(span);
    }
    return columns.filter((column) => column.length > 0);
}
154
/**
 * Alignment detection.
 *
 * Classifies a block as 'left', 'center', 'right' or 'justify' from the
 * horizontal extent of each of its lines.
 *
 * @param blockSpans Spans of one block; `lineNo`, `x` and `width` are read.
 * @param pageWidth Page width; used as the text area when the margins do not
 *   describe a positive-width area.
 * @param leftMargin Left edge of the text area.
 * @param rightMargin Right edge of the text area.
 * @returns The detected alignment; 'left' for an empty block or when nothing
 *   more specific matches.
 */
export function detectAlignment(blockSpans, pageWidth, leftMargin, rightMargin) {
    const lines = new Map();
    for (const span of blockSpans) {
        const arr = lines.get(span.lineNo) || [];
        arr.push(span);
        lines.set(span.lineNo, arr);
    }
    if (lines.size === 0)
        return 'left';
    const lineBounds = [...lines.values()].map((ls) => ({
        x0: Math.min(...ls.map((s) => s.x)),
        x1: Math.max(...ls.map((s) => s.x + s.width)),
    }));
    // When the margins are unusable (right <= left, or not numbers), treat the
    // whole page as the text area — for the width AND for the edges/centre the
    // single-line checks compare against, so they stay consistent.
    let textLeft = leftMargin;
    let textRight = rightMargin;
    let textWidth = textRight - textLeft;
    if (!(textWidth > 0)) {
        textLeft = 0;
        textRight = pageWidth;
        textWidth = pageWidth;
    }
    // Single-line block
    if (lineBounds.length === 1) {
        const { x0, x1 } = lineBounds[0];
        const blockCenter = (x0 + x1) / 2;
        const pageCenter = (textLeft + textRight) / 2;
        const centerTol = textWidth * 0.05;
        const rightTol = textWidth * 0.05;
        if (Math.abs(blockCenter - pageCenter) < centerTol)
            return 'center';
        // Only classify as right-aligned if the line is short relative to the text
        // area — a near-full-width line that happens to align with the right margin
        // is more likely a paragraph line than a right-aligned label.
        const lineWidth = x1 - x0;
        if (Math.abs(x1 - textRight) < rightTol &&
            x0 > textLeft + centerTol &&
            lineWidth < textWidth * 0.6)
            return 'right';
        return 'left';
    }
    // Multi-line block: compare how much the left edges, right edges and
    // midpoints of the lines vary.
    const stddev = (values) => {
        const mean = values.reduce((a, b) => a + b, 0) / values.length;
        return Math.sqrt(values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length);
    };
    const leftStd = stddev(lineBounds.map((b) => b.x0));
    const rightStd = stddev(lineBounds.map((b) => b.x1));
    const midStd = stddev(lineBounds.map((b) => (b.x0 + b.x1) / 2));
    const THRESHOLD = 3.0;
    if (leftStd < THRESHOLD && rightStd < THRESHOLD)
        return 'justify';
    // Justify detection with tolerance for short lines: last lines of paragraphs,
    // headers, or dates mixed into the block won't reach the right margin.
    // Filter to "full-width" lines (≥80% of the widest) and re-check.
    if (lineBounds.length >= 3) {
        const maxWidth = Math.max(...lineBounds.map((b) => b.x1 - b.x0));
        const fullLines = lineBounds.filter((b) => b.x1 - b.x0 >= maxWidth * 0.8);
        if (fullLines.length >= 3) {
            const fullLeftStd = stddev(fullLines.map((b) => b.x0));
            const fullRightStd = stddev(fullLines.map((b) => b.x1));
            if (fullLeftStd < THRESHOLD && fullRightStd < THRESHOLD)
                return 'justify';
        }
    }
    if (midStd < THRESHOLD && midStd <= leftStd && midStd <= rightStd)
        return 'center';
    if (rightStd < THRESHOLD && rightStd < leftStd)
        return 'right';
    return 'left';
}
223
/**
 * Estimate page margins.
 *
 * Computes each block's leftmost x and rightmost x + width, then returns
 * [left, right] where left is the 10th-percentile block left edge and right
 * is the 90th-percentile block right edge. Returns [0, 0] for no spans.
 */
export function estimatePageMargins(spans) {
    if (spans.length === 0) {
        return [0, 0];
    }
    // blockNo -> [min left edge, max right edge]
    const extentByBlock = new Map();
    for (const span of spans) {
        const rightEdge = span.x + span.width;
        const extent = extentByBlock.get(span.blockNo);
        if (extent === undefined) {
            extentByBlock.set(span.blockNo, [Math.min(span.x, span.x), Math.max(rightEdge, rightEdge)]);
        }
        else {
            extent[0] = Math.min(extent[0], span.x);
            extent[1] = Math.max(extent[1], rightEdge);
        }
    }
    const ascending = (a, b) => a - b;
    const lefts = Array.from(extentByBlock.values(), (extent) => extent[0]).sort(ascending);
    const rights = Array.from(extentByBlock.values(), (extent) => extent[1]).sort(ascending);
    // Value at fraction p of a sorted list, clamped to the last element.
    const pick = (sorted, p) => sorted[Math.min(Math.floor(sorted.length * p), sorted.length - 1)];
    return [pick(lefts, 0.1), pick(rights, 0.9)];
}
242
/**
 * Compute line height from baselines.
 *
 * Takes the median `y` of each line (lines ordered by line number), the
 * median of the positive gaps between consecutive lines, and divides it by
 * the dominant font size. The ratio is clamped to [0.8, 3.0].
 * Returns DEFAULT_LINE_HEIGHT when there are fewer than two lines, no
 * positive gaps, or a non-positive dominant font size.
 */
export function computeLineHeight(blockSpans) {
    const median = (values) => {
        const sorted = [...values].sort((a, b) => a - b);
        const mid = Math.floor(sorted.length / 2);
        return sorted.length % 2 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
    };
    const ysByLine = new Map();
    for (const span of blockSpans) {
        const ys = ysByLine.get(span.lineNo);
        if (ys) {
            ys.push(span.y);
        }
        else {
            ysByLine.set(span.lineNo, [span.y]);
        }
    }
    if (ysByLine.size < 2) {
        return DEFAULT_LINE_HEIGHT;
    }
    const lineYs = [...ysByLine.keys()]
        .sort((a, b) => a - b)
        .map((lineNo) => median(ysByLine.get(lineNo)));
    const gaps = [];
    lineYs.forEach((y, i) => {
        if (i === 0)
            return;
        const gap = y - lineYs[i - 1];
        if (gap > 0) {
            gaps.push(gap);
        }
    });
    if (gaps.length === 0) {
        return DEFAULT_LINE_HEIGHT;
    }
    // Median rather than mean: robust against outlier gaps from mixed font
    // sizes (e.g. a header line with a larger font in the same block).
    const typicalGap = median(gaps);
    // Dominant font size = the size carrying the most text (by character
    // count), not the largest; sizes are bucketed to ~0.1px.
    const textLengthBySize = new Map();
    for (const span of blockSpans) {
        const bucket = Math.round(span.fontSize * 10);
        textLengthBySize.set(bucket, (textLengthBySize.get(bucket) || 0) + span.text.length);
    }
    let dominantFontSize = blockSpans[0].fontSize;
    let heaviest = 0;
    textLengthBySize.forEach((textLength, bucket) => {
        if (textLength > heaviest) {
            heaviest = textLength;
            dominantFontSize = bucket / 10;
        }
    });
    if (dominantFontSize <= 0) {
        return DEFAULT_LINE_HEIGHT;
    }
    return Math.max(0.8, Math.min(3.0, typicalGap / dominantFontSize));
}
294
+ //# sourceMappingURL=text-blocks.js.map
@@ -0,0 +1,11 @@
1
+ export type { TextSpan, TextBlock } from './text-types.js';
2
+ export { groupSpansByBlock, detectAlignment, estimatePageMargins, computeLineHeight, } from './text-blocks.js';
3
/**
 * Convert pdfjs text items into text spans and run them through layout
 * assignment. `styles` is accepted but ignored by the implementation.
 */
+ export declare function groupTextItems(items: any[], // pdfjs TextItem[]
4
+ styles: Record<string, any>, yFlipOffset: number, positionColors: {
5
+ x: number;
6
+ y: number;
7
+ color: string;
8
+ fontName: string;
9
+ orderIndex: number;
10
+ }[], fontNameMap?: Map<string, string>, fontAscentMap?: Map<string, number>): import("./text-types.js").TextSpan[];
11
+ //# sourceMappingURL=text-grouper.d.ts.map
@@ -0,0 +1,11 @@
1
+ import { extractTextSpans } from './text-span-extractor.js';
2
+ import { assignTextLayout } from './text-layout.js';
3
+ export { groupSpansByBlock, detectAlignment, estimatePageMargins, computeLineHeight, } from './text-blocks.js';
4
/**
 * Group TextItems from pdfjs into logical blocks.
 *
 * Extracts text spans from `items` (a pdfjs TextItem[]) and returns them
 * after layout assignment. `styles` is part of the signature but unused.
 */
export function groupTextItems(items, styles, yFlipOffset, positionColors, fontNameMap, fontAscentMap) {
    void styles;
    return assignTextLayout(extractTextSpans(items, yFlipOffset, positionColors, fontNameMap, fontAscentMap));
}
11
+ //# sourceMappingURL=text-grouper.js.map
@@ -0,0 +1,3 @@
1
+ import type { TextSpan } from './text-types.js';
2
/**
 * Takes extracted text spans and returns text spans.
 * NOTE(review): the implementation is not visible here; presumably it fills
 * in layout fields such as block/line numbers — confirm against text-layout.js.
 */
+ export declare function assignTextLayout(spans: TextSpan[]): TextSpan[];
3
+ //# sourceMappingURL=text-layout.d.ts.map