@polotno/pdf-import 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -0
- package/lib/color-utils.d.ts +2 -0
- package/lib/color-utils.js +10 -0
- package/lib/constants.d.ts +13 -0
- package/lib/constants.js +111 -0
- package/lib/font-mapper.d.ts +7 -0
- package/lib/font-mapper.js +111 -0
- package/lib/font-matcher.d.ts +10 -0
- package/lib/font-matcher.js +89 -0
- package/lib/font-merger.d.ts +7 -0
- package/lib/font-merger.js +114 -0
- package/lib/font-registry.d.ts +15 -0
- package/lib/font-registry.js +110 -0
- package/lib/image-encoder.d.ts +3 -0
- package/lib/image-encoder.js +181 -0
- package/lib/index.d.ts +97 -0
- package/lib/index.js +1 -0
- package/lib/operator-list-helpers.d.ts +6 -0
- package/lib/operator-list-helpers.js +26 -0
- package/lib/operator-list.d.ts +99 -0
- package/lib/operator-list.js +528 -0
- package/lib/page-parser.d.ts +18 -0
- package/lib/page-parser.js +674 -0
- package/lib/pdf-image-extractor.d.ts +14 -0
- package/lib/pdf-image-extractor.js +91 -0
- package/lib/svg-builder.d.ts +23 -0
- package/lib/svg-builder.js +213 -0
- package/lib/text-blocks.d.ts +6 -0
- package/lib/text-blocks.js +294 -0
- package/lib/text-grouper.d.ts +11 -0
- package/lib/text-grouper.js +11 -0
- package/lib/text-layout.d.ts +3 -0
- package/lib/text-layout.js +318 -0
- package/lib/text-span-extractor.d.ts +5 -0
- package/lib/text-span-extractor.js +271 -0
- package/lib/text-types.d.ts +25 -0
- package/lib/text-types.js +2 -0
- package/package.json +46 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
/**
 * Assigns `lineNo` and `blockNo` to every span, one page column at a time.
 *
 * The input array is sorted in place (by Y, then X) and handed back. Columns
 * are detected first so that text sharing a Y position in different columns
 * is numbered independently; line and block counters keep increasing across
 * columns so numbers never collide.
 *
 * @param {Array<object>} spans Spans carrying `x`, `y` and `fontSize`; mutated.
 * @returns {Array<object>} The same `spans` array.
 */
export function assignTextLayout(spans) {
    if (spans.length === 0) {
        return spans;
    }
    const readingOrder = (p, q) => p.y - q.y || p.x - q.x;
    spans.sort(readingOrder);
    let nextLineNo = 0;
    let nextBlockNo = 0;
    for (const column of detectPageColumns(spans)) {
        column.sort(readingOrder);
        // Y position and largest font size of the line currently being built.
        let lineY = column[0].y;
        let lineFontSize = column[0].fontSize;
        for (const span of column) {
            const larger = Math.max(span.fontSize, lineFontSize);
            const smaller = Math.min(span.fontSize, lineFontSize);
            // Tolerance follows the larger font so superscripts/small caps
            // do not break a line of large text.
            const tooFarInY = Math.abs(span.y - lineY) > larger * 0.3;
            // A >3x size ratio (e.g. 20px label next to a 232px heading)
            // always forces a separate line, however close the Y values are.
            const tooDifferentInSize = larger / smaller > 3;
            const startsNewLine = tooFarInY || tooDifferentInSize;
            if (startsNewLine) {
                nextLineNo += 1;
            }
            // The reference Y tracks the dominant (largest) text on the line.
            if (startsNewLine || span.fontSize >= lineFontSize) {
                lineY = span.y;
                lineFontSize = span.fontSize;
            }
            span.lineNo = nextLineNo;
        }
        nextLineNo += 1;
        assignBlockNumbers(column, nextBlockNo);
        // Continue numbering after the highest block used by this column.
        const highestBlock = column.reduce((max, s) => (s.blockNo > max ? s.blockNo : max), nextBlockNo);
        nextBlockNo = highestBlock + 1;
    }
    return spans;
}
|
|
53
|
+
// Splits spans into page columns by locating vertical gutters: X ranges that
// almost no text line covers. Expects `spans` to be sorted by Y (then X).
// Returns an array of span arrays ordered left to right; when no gutter is
// found the original array is returned as the only column.
function detectPageColumns(spans) {
    if (spans.length <= 1) {
        return [spans];
    }
    // Horizontal extent of all text.
    const pageMinX = spans.reduce((lo, s) => (s.x < lo ? s.x : lo), Infinity);
    const pageMaxX = spans.reduce((hi, s) => {
        const right = s.x + s.width;
        return right > hi ? right : hi;
    }, -Infinity);
    const pageW = pageMaxX - pageMinX;
    if (pageW < 100) {
        return [spans];
    }
    // Cluster spans into rows: a span joins the current row while its Y stays
    // within 30% of its own font size of the row's first span.
    const rows = [[spans[0]]];
    let rowY = spans[0].y;
    for (const s of spans.slice(1)) {
        if (Math.abs(s.y - rowY) <= s.fontSize * 0.3) {
            rows[rows.length - 1].push(s);
        }
        else {
            rows.push([s]);
            rowY = s.y;
        }
    }
    // Histogram over 5-unit X buckets: number of rows with text in each bucket.
    const BUCKET = 5;
    const nBuckets = Math.ceil(pageW / BUCKET);
    const hist = new Float32Array(nBuckets);
    for (const row of rows) {
        const seen = new Uint8Array(nBuckets);
        for (const s of row) {
            const first = Math.max(0, Math.floor((s.x - pageMinX) / BUCKET));
            const last = Math.min(nBuckets - 1, Math.floor((s.x + s.width - pageMinX) / BUCKET));
            for (let b = first; b <= last; b++) {
                // Count each row at most once per bucket.
                if (seen[b] === 0) {
                    seen[b] = 1;
                    hist[b] += 1;
                }
            }
        }
    }
    // Gutters are runs of buckets covered by at most 15% of the rows, at
    // least 10 units wide and more than 20 units away from either page edge.
    const threshold = rows.length * 0.15;
    const gutterXs = [];
    let b = 0;
    while (b < nBuckets) {
        if (!(hist[b] <= threshold)) {
            b++;
            continue;
        }
        const runStart = b;
        while (b < nBuckets && hist[b] <= threshold) {
            b++;
        }
        const startX = pageMinX + runStart * BUCKET;
        const endX = pageMinX + b * BUCKET;
        if (endX - startX >= 10 && startX > pageMinX + 20 && endX < pageMaxX - 20) {
            gutterXs.push((startX + endX) / 2);
        }
    }
    if (gutterXs.length === 0) {
        return [spans];
    }
    // Distribute spans row by row. Consecutive spans stay in one group unless
    // a gutter lies between them, which keeps justified text together.
    const columns = Array.from({ length: gutterXs.length + 1 }, () => []);
    for (const row of rows) {
        const ordered = row.slice().sort((p, q) => p.x - q.x);
        const groups = [];
        for (const span of ordered) {
            const open = groups[groups.length - 1];
            if (open === undefined) {
                groups.push([span]);
                continue;
            }
            const tail = open[open.length - 1];
            const prevEnd = tail.x + tail.width;
            const crossesGutter = gutterXs.some((gx) => prevEnd < gx && span.x >= gx);
            if (crossesGutter) {
                groups.push([span]);
            }
            else {
                open.push(span);
            }
        }
        for (const group of groups) {
            // The column is chosen from the group's leftmost span.
            let col = 0;
            gutterXs.forEach((gx, g) => {
                if (group[0].x >= gx) {
                    col = g + 1;
                }
            });
            columns[col].push(...group);
        }
    }
    return columns.filter((column) => column.length > 0);
}
|
|
168
|
+
/**
 * Writes `blockNo` onto every span of one column.
 *
 * Works in three steps: (1) per line, collect horizontal gaps that may split
 * the line into segments; (2) keep a split when it is forced by a style change
 * or repeated at a similar X on a line at most two line numbers away, and cut
 * the line into segments with bounding boxes; (3) walk the segments in order
 * and merge neighbours into the same block when they are close in Y, aligned
 * in X and not separated by a font-size, colour or display-text shift.
 *
 * @param {Array<object>} spans Spans that already carry `lineNo`; mutated.
 * @param {number} [startBlockNo=0] First block number to hand out.
 * @returns {void}
 */
function assignBlockNumbers(spans, startBlockNo = 0) {
    if (spans.length === 0) {
        return;
    }
    // Bucket spans by line number.
    const byLine = new Map();
    for (const span of spans) {
        if (!byLine.has(span.lineNo)) {
            byLine.set(span.lineNo, []);
        }
        byLine.get(span.lineNo).push(span);
    }
    const lineNos = Array.from(byLine.keys()).sort((p, q) => p - q);

    // Step 1: per line, X-ordered spans, mean font size and split candidates.
    const lineInfo = new Map();
    for (const ln of lineNos) {
        const ordered = byLine.get(ln).slice().sort((p, q) => p.x - q.x);
        const avgFontSize = ordered.reduce((sum, s) => sum + s.fontSize, 0) / ordered.length;
        const candidates = [];
        // Lines averaging below 12 never produce split candidates.
        if (!(avgFontSize < 12)) {
            const wideGap = Math.max(avgFontSize * 3, 40);
            // When neighbouring spans clearly differ in font size they are
            // separate visual elements, so a much smaller gap is enough.
            const tightGap = Math.max(avgFontSize * 0.5, 8);
            for (let i = 1; i < ordered.length; i++) {
                const left = ordered[i - 1];
                const right = ordered[i];
                const gap = right.x - (left.x + left.width);
                const fontRatio = Math.max(left.fontSize, right.fontSize) /
                    Math.min(left.fontSize, right.fontSize);
                const fontSizeDiffers = fontRatio > 1.1;
                if (gap > (fontSizeDiffers ? tightGap : wideGap)) {
                    candidates.push({
                        x: right.x,
                        force: left.color !== right.color ||
                            left.fontName !== right.fontName ||
                            fontSizeDiffers,
                    });
                }
            }
        }
        lineInfo.set(ln, { ordered, avgFontSize, candidates });
    }

    // Step 2: confirm splits and cut each line into segments.
    const lineBounds = [];
    for (const ln of lineNos) {
        const { ordered, avgFontSize, candidates } = lineInfo.get(ln);
        const tolerance = Math.max(avgFontSize, 20);
        const repeatedNearby = (candidate) => lineNos.some((otherLn) => {
            if (otherLn === ln || Math.abs(otherLn - ln) > 2) {
                return false;
            }
            return lineInfo.get(otherLn).candidates.some((other) => Math.abs(other.x - candidate.x) <= tolerance);
        });
        const splitXs = candidates
            .filter((candidate) => candidate.force || repeatedNearby(candidate))
            .map((candidate) => candidate.x);
        const segments = [[]];
        for (const span of ordered) {
            const open = segments[segments.length - 1];
            if (open.length > 0) {
                const tail = open[open.length - 1];
                const prevEnd = tail.x + tail.width;
                // Start a new segment when a split position lies between the
                // previous span's end and this span's start (5-unit slack).
                if (splitXs.some((pos) => prevEnd < pos && span.x >= pos - 5)) {
                    segments.push([span]);
                    continue;
                }
            }
            open.push(span);
        }
        for (const segment of segments) {
            let x0 = Infinity;
            let x1 = -Infinity;
            let y0 = Infinity;
            let y1 = -Infinity;
            for (const s of segment) {
                x0 = Math.min(x0, s.x);
                x1 = Math.max(x1, s.x + s.width);
                y0 = Math.min(y0, s.y);
                y1 = Math.max(y1, s.y + s.height);
            }
            lineBounds.push({ lineNo: ln, spans: segment, x0, x1, y0, y1 });
        }
    }

    // Step 3: merge consecutive segments into blocks.
    const stamp = (bounds, bn) => {
        for (const lb of bounds) {
            for (const span of lb.spans) {
                span.blockNo = bn;
            }
        }
    };
    // Colour of the span with the longest text stands in for the segment.
    const dominantColor = (list) => list.reduce((p, q) => (p.text.length > q.text.length ? p : q)).color;
    let blockNo = startBlockNo;
    let currentBlock = [lineBounds[0]];
    for (const curr of lineBounds.slice(1)) {
        const prev = currentBlock[currentBlock.length - 1];
        const prevFontSize = prev.spans[0].fontSize;
        const currFontSize = curr.spans[0].fontSize;
        const avgFontSize = (prevFontSize + currFontSize) / 2;
        const onSameLine = prev.spans[0].lineNo === curr.spans[0].lineNo;
        // A clear font-size change between lines signals a new section
        // (heading -> body); within one line it is ignored (small caps etc.).
        const fontSizeRatio = Math.max(prevFontSize, currFontSize) /
            Math.min(prevFontSize, currFontSize);
        const fontSizeChanged = !onSameLine && fontSizeRatio > 1.15;
        const prevColor = dominantColor(prev.spans);
        const currColor = dominantColor(curr.spans);
        const xOverlap = Math.min(prev.x1, curr.x1) - Math.max(prev.x0, curr.x0);
        const narrowerWidth = Math.min(prev.x1 - prev.x0, curr.x1 - curr.x0);
        // Colour only splits when X overlap is weak, because paragraph lines
        // overlap strongly even if colour extraction is noisy.
        const colorChanged = !onSameLine &&
            prevColor !== currColor &&
            xOverlap < narrowerWidth * 0.5;
        // Very large single-span lines whose left edge, right edge and centre
        // all shift are treated as independent display text.
        const minShift = avgFontSize * 0.2;
        const centerShift = Math.abs((prev.x0 + prev.x1) / 2 - (curr.x0 + curr.x1) / 2);
        const largeSingleSpanDisplayShift = !onSameLine &&
            avgFontSize >= 80 &&
            prev.spans.length === 1 &&
            curr.spans.length === 1 &&
            Math.abs(prev.x0 - curr.x0) > minShift &&
            Math.abs(prev.x1 - curr.x1) > minShift &&
            centerShift > minShift;
        const closeInY = curr.y0 - prev.y1 < avgFontSize * 0.5;
        const alignedInX = xOverlap > 0 || Math.abs(prev.x0 - curr.x0) < avgFontSize * 2;
        const startsNewBlock = fontSizeChanged ||
            colorChanged ||
            largeSingleSpanDisplayShift ||
            !closeInY ||
            !alignedInX;
        if (startsNewBlock) {
            stamp(currentBlock, blockNo);
            blockNo += 1;
            currentBlock = [curr];
        }
        else {
            currentBlock.push(curr);
        }
    }
    stamp(currentBlock, blockNo);
}
|
|
318
|
+
//# sourceMappingURL=text-layout.js.map
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { PositionColor } from './operator-list.js';
|
|
2
|
+
import type { TextSpan } from './text-types.js';
|
|
3
|
+
/**
 * Converts pdfjs text items into `TextSpan` records.
 *
 * Items with empty/whitespace-only text, no transform, or a font size below 4
 * are skipped; duplicate spans painted at the same position are removed.
 *
 * @param items pdfjs `TextItem` objects (reads `str`, `transform`, `width`, `height`, `fontName`).
 * @param yFlipOffset Value the PDF baseline Y is subtracted from to get top-down Y.
 * @param positionColors Colour/paint-order records from the operator list, matched to items by font and position.
 * @param fontNameMap Optional map from the pdfjs loaded font name to the real font name.
 * @param fontAscentMap Optional map from the pdfjs loaded font name to the font ascent (defaults to 1.0, floored at 0.5).
 * @returns Spans with `blockNo` and `lineNo` still set to -1.
 */
export declare function extractTextSpans(items: any[], // pdfjs TextItem[]
|
|
4
|
+
yFlipOffset: number, positionColors: PositionColor[], fontNameMap?: Map<string, string>, fontAscentMap?: Map<string, number>): TextSpan[];
|
|
5
|
+
//# sourceMappingURL=text-span-extractor.d.ts.map
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
/**
 * Reports whether a text matrix shears its axes (synthetic italic).
 *
 * `(a, b)` and `(c, d)` are the transformed X and Y axes. A pure rotation or
 * scale keeps them perpendicular; a skew does not. The normalised dot product
 * of the two axes measures the deviation from perpendicular.
 *
 * @param {number} a
 * @param {number} b
 * @param {number} c
 * @param {number} d
 * @returns {boolean} True when the normalised dot product exceeds 0.15.
 */
function hasSyntheticItalic(a, b, c, d) {
    const axisLengths = [Math.hypot(a, b), Math.hypot(c, d)];
    // A degenerate axis has no meaningful direction.
    if (axisLengths.some((len) => len < 0.001)) {
        return false;
    }
    const dot = a * c + b * d;
    return Math.abs(dot) / (axisLengths[0] * axisLengths[1]) > 0.15;
}
|
|
11
|
+
/**
 * Converts pdfjs text items into positioned, styled text spans.
 *
 * Items with empty/whitespace-only text, no transform, or a font size below 4
 * are skipped. Colour and paint order come from the closest `positionColors`
 * record; weight and style are derived from the font name and the text matrix.
 * Duplicate spans and clashing order indices are cleaned up before returning.
 *
 * @param {Array<object>} items pdfjs TextItem[].
 * @param {number} yFlipOffset Value the PDF baseline Y is subtracted from.
 * @param {Array<object>} positionColors Records with `fontName`, `fontSize`, `x`, `y`, `color`, `orderIndex`.
 * @param {Map<string, string>} [fontNameMap] pdfjs loaded name -> real font name.
 * @param {Map<string, number>} [fontAscentMap] pdfjs loaded name -> ascent.
 * @returns {Array<object>} Spans with `blockNo` and `lineNo` set to -1.
 */
export function extractTextSpans(items, // pdfjs TextItem[]
yFlipOffset, positionColors, fontNameMap, fontAscentMap) {
    // Bucket colour records by font once, so each item looks them up directly.
    const colorsByFont = new Map();
    for (const pc of positionColors) {
        const bucket = colorsByFont.get(pc.fontName);
        if (bucket) {
            bucket.push(pc);
        }
        else {
            colorsByFont.set(pc.fontName, [pc]);
        }
    }
    const spans = [];
    for (const item of items) {
        if (!item.str || !item.str.trim() || !item.transform) {
            continue;
        }
        const [a, b, c, d, tx, ty] = item.transform;
        const fontSize = Math.sqrt(a * a + b * b);
        if (fontSize < 4) {
            continue;
        }
        // pdfjs internal loaded name; the real font name is resolved from it.
        const loadedName = item.fontName || '';
        const fontName = fontNameMap?.get(loadedName) || loadedName;
        // ty is the baseline in bottom-up PDF coordinates. The glyph top sits
        // ascent * fontSize above it; ascent is floored at 0.5.
        const ascent = fontAscentMap?.get(loadedName) ?? 1.0;
        const ascentPx = fontSize * Math.max(ascent, 0.5);
        const y = yFlipOffset - ty - ascentPx;
        const baselineY = y + ascentPx;
        // Rotations under 2 degrees are treated as unrotated.
        const degrees = (-Math.atan2(b, a) * 180) / Math.PI;
        const rotation = Math.abs(degrees) < 2 ? 0 : degrees;

        let color = '#000000';
        let orderIndex = 0;
        if (positionColors.length > 0) {
            const sameFont = colorsByFont.get(loadedName) || [];
            const sameFontAndSize = sameFont.filter((pc) => Math.abs(pc.fontSize - fontSize) < fontSize * 0.1);
            // Try the most specific pool first: font + size, font only, any.
            for (const pool of [sameFontAndSize, sameFont, positionColors]) {
                const match = fontSize < 18
                    ? selectClosestXPositionColor(pool, tx)
                    : selectBestPositionColor(pool, tx, baselineY, fontSize);
                if (match != null) {
                    color = match.color;
                    break;
                }
            }
            const orderMatch = selectClosestXPositionColor(sameFont, tx) ??
                selectClosestXPositionColor(positionColors, tx);
            if (orderMatch) {
                orderIndex = orderMatch.orderIndex;
            }
        }

        const lower = fontName.toLowerCase();
        // Google Fonts variable font naming: Arimo_700wght
        const variableWeight = /_(\d+)wght/.exec(lower);
        let isBold;
        if (variableWeight) {
            isBold = parseInt(variableWeight[1], 10) >= 600;
        }
        else {
            isBold = ['bold', 'heavy', 'black'].some((keyword) => lower.includes(keyword));
        }
        // Italic comes from the font name or from sheared (non-orthogonal)
        // text axes; pure rotation does not count.
        const isItalic = lower.includes('italic') ||
            lower.includes('oblique') ||
            hasSyntheticItalic(a, b, c, d);

        spans.push({
            text: item.str,
            fontName,
            fontSize,
            x: tx,
            y,
            baselineY,
            rotation,
            width: item.width,
            height: item.height || fontSize,
            color,
            fontWeight: isBold ? 'bold' : 'normal',
            fontStyle: isItalic ? 'italic' : 'normal',
            blockNo: -1, // assigned during grouping
            lineNo: -1,
            orderIndex,
        });
    }
    // Some PDFs paint the same text twice at the same position
    // (e.g. fill + stroke, or accessibility overlay).
    deduplicateSpans(spans);
    dedupeOrderIndices(spans, positionColors);
    return spans;
}
|
|
111
|
+
/**
 * Drops spans that were painted more than once at the same place, in place.
 *
 * Spans are compared only within groups sharing the same rounded Y and rounded
 * font size. Within a group, when one span's X range contains the other's
 * (1-unit slack) and the texts are equal or one contains the other, the span
 * with the contained text is removed (e.g. "LA · L" loses to "LA · LV").
 *
 * @param {Array<object>} spans Spans with `x`, `y`, `width`, `fontSize`, `text`; mutated.
 * @returns {void}
 */
function deduplicateSpans(spans) {
    // Group span indices by rounded Y + rounded font size.
    const groups = new Map();
    spans.forEach((span, idx) => {
        const key = `${Math.round(span.y)}|${Math.round(span.fontSize)}`;
        if (!groups.has(key)) {
            groups.set(key, []);
        }
        groups.get(key).push(idx);
    });
    const dropped = new Set();
    for (const members of groups.values()) {
        for (let i = 0; i < members.length; i++) {
            const firstIdx = members[i];
            if (dropped.has(firstIdx)) {
                continue;
            }
            const first = spans[firstIdx];
            const firstEnd = first.x + first.width;
            for (let j = i + 1; j < members.length; j++) {
                const secondIdx = members[j];
                if (dropped.has(secondIdx)) {
                    continue;
                }
                const second = spans[secondIdx];
                const secondEnd = second.x + second.width;
                const firstCovers = second.x >= first.x - 1 && secondEnd <= firstEnd + 1;
                const secondCovers = first.x >= second.x - 1 && firstEnd <= secondEnd + 1;
                if (!(firstCovers || secondCovers)) {
                    continue;
                }
                if (first.text === second.text ||
                    (firstCovers && first.text.includes(second.text))) {
                    dropped.add(secondIdx);
                    continue;
                }
                if (secondCovers && second.text.includes(first.text)) {
                    dropped.add(firstIdx);
                    break; // `first` is gone, nothing left to compare it with
                }
            }
        }
    }
    if (dropped.size === 0) {
        return;
    }
    // Compact the survivors to the front, then truncate.
    let next = 0;
    spans.forEach((span, idx) => {
        if (!dropped.has(idx)) {
            spans[next++] = span;
        }
    });
    spans.length = next;
}
|
|
162
|
+
/**
 * Picks the position-colour record that best matches a text position.
 *
 * First trusts X: among the candidates whose X distance is within 1 unit of
 * the best, and provided that best distance is inside the X trust limit, the
 * one closest in Y (within 1.5 font sizes) wins. For fonts under 18 with
 * several X matches the first X match is returned outright. Otherwise every
 * candidate is scored by `xDist + 3 * yDist`, and among near-best scores the
 * last painted one (highest `orderIndex`) is returned.
 *
 * @param {Array<object>} candidates Records with `x`, `y`, `orderIndex`.
 * @param {number} x Target X.
 * @param {number} y Target (baseline) Y.
 * @param {number} fontSize Font size of the text being matched.
 * @returns {object|null} The chosen record, or null when none is usable.
 */
function selectBestPositionColor(candidates, x, y, fontSize) {
    if (candidates.length === 0) {
        return null;
    }
    const xGap = (pc) => Math.abs(pc.x - x);
    const yGap = (pc) => Math.abs(pc.y - y);

    let bestXDist = Infinity;
    for (const pc of candidates) {
        bestXDist = Math.min(bestXDist, xGap(pc));
    }
    const xMatches = candidates.filter((pc) => xGap(pc) <= bestXDist + 1);
    // Large display text gets a tighter X trust window than body text.
    const xTrustLimit = fontSize >= 40 ? fontSize * 0.5 : fontSize * 1.5;
    const yTrustLimit = fontSize * 1.5;
    if (xMatches.length > 0 && bestXDist <= xTrustLimit) {
        if (xMatches.length > 1 && fontSize < 18) {
            return xMatches[0];
        }
        const saneXMatches = xMatches.filter((pc) => yGap(pc) <= yTrustLimit);
        if (saneXMatches.length > 0) {
            // Closest in Y wins; on a tie the earliest candidate is kept.
            // NOTE(review): unlike the scored path below, ties here do not
            // prefer the last painted record — confirm that is intended.
            return saneXMatches.reduce((closest, pc) => (yGap(pc) < yGap(closest) ? pc : closest));
        }
    }

    // Fallback: combined distance score, with Y weighted three times as heavily as X.
    const scored = candidates.map((pc) => ({ pc, score: xGap(pc) + yGap(pc) * 3 }));
    let bestScore = Infinity;
    let found = false;
    for (const { score } of scored) {
        if (score < bestScore) {
            bestScore = score;
            found = true;
        }
    }
    if (!found) {
        return null;
    }
    // PDFs sometimes paint the same text several times at nearly the same
    // position (e.g. fill + shadow/overlay). Among near-equivalent matches,
    // prefer the last painted one since that is what ends up on top.
    const tolerance = Math.max(fontSize * 0.3, 4);
    return scored
        .filter(({ score }) => score <= bestScore + tolerance)
        .map(({ pc }) => pc)
        .reduce((latest, pc) => (pc.orderIndex > latest.orderIndex ? pc : latest));
}
|
|
219
|
+
/**
 * Returns the candidate whose `x` is nearest to the given X.
 *
 * On equal distance the earliest candidate is kept.
 *
 * @param {Array<object>} candidates Records with an `x` property.
 * @param {number} x Target X.
 * @returns {object|null} Nearest record, or null for an empty list.
 */
function selectClosestXPositionColor(candidates, x) {
    if (candidates.length === 0) {
        return null;
    }
    let closest = null;
    let smallestGap = Infinity;
    for (const candidate of candidates) {
        const gap = Math.abs(candidate.x - x);
        if (!(gap < smallestGap)) {
            continue;
        }
        smallestGap = gap;
        closest = candidate;
    }
    return closest;
}
|
|
233
|
+
/**
 * Gives stacked spans that all matched the same position-colour record their
 * own `orderIndex` values.
 *
 * Applies only to groups of spans that share an `orderIndex` and lie within
 * 1 unit of each other in X. The records with the same font and an X within
 * 1 unit of the originally matched record are sorted by Y and handed out to
 * the spans, also sorted by Y. Groups with fewer such records than spans are
 * left unchanged.
 *
 * @param {Array<object>} spans Spans with `x`, `y`, `orderIndex`; mutated.
 * @param {Array<object>} positionColors Records with `fontName`, `x`, `y`, `orderIndex`.
 * @returns {void}
 */
function dedupeOrderIndices(spans, positionColors) {
    if (positionColors.length === 0) {
        return;
    }
    // orderIndex -> indices of the spans using it
    const groups = new Map();
    spans.forEach((span, idx) => {
        const members = groups.get(span.orderIndex);
        if (members) {
            members.push(idx);
        }
        else {
            groups.set(span.orderIndex, [idx]);
        }
    });
    groups.forEach((members, sharedOrderIndex) => {
        if (members.length <= 1) {
            return;
        }
        // Only a vertical stack qualifies: all X positions within 1 unit.
        const xs = members.map((idx) => spans[idx].x);
        if (Math.max(...xs) - Math.min(...xs) > 1) {
            return;
        }
        const anchor = positionColors.find((pc) => pc.orderIndex === sharedOrderIndex);
        if (!anchor) {
            return;
        }
        const stackedRecords = positionColors
            .filter((pc) => pc.fontName === anchor.fontName && Math.abs(pc.x - anchor.x) < 1)
            .sort((p, q) => p.y - q.y);
        if (stackedRecords.length < members.length) {
            return;
        }
        // Pair spans and records top to bottom.
        members.sort((p, q) => spans[p].y - spans[q].y);
        members.forEach((spanIdx, k) => {
            spans[spanIdx].orderIndex = stackedRecords[k].orderIndex;
        });
    });
}
|
|
271
|
+
//# sourceMappingURL=text-span-extractor.js.map
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * One run of text extracted from a PDF page, as built by `extractTextSpans`.
 *
 * - `text`: the item's string, unmodified.
 * - `fontName`: real font name when a font-name map resolves it, otherwise the pdfjs loaded name.
 * - `fontSize`: length of the text matrix X axis, `sqrt(a*a + b*b)`.
 * - `x`: horizontal translation of the text matrix.
 * - `y`: top of the glyphs in top-down coordinates (baseline minus ascent).
 * - `baselineY`: baseline in the same top-down coordinates.
 * - `rotation`: rotation in degrees; values under 2 degrees are stored as 0.
 * - `width`: the pdfjs item width.
 * - `height`: the pdfjs item height, falling back to `fontSize`.
 * - `color`: colour string taken from the matched position-colour record, `'#000000'` when none matches.
 * - `fontWeight`: `'bold'` or `'normal'`, derived from the font name.
 * - `fontStyle`: `'italic'` or `'normal'`, derived from the font name or a sheared text matrix.
 * - `blockNo` / `lineNo`: -1 after extraction; assigned later by the layout step.
 * - `orderIndex`: paint order taken from the matched position-colour record, 0 when none matches.
 */
export interface TextSpan {
|
|
2
|
+
text: string;
|
|
3
|
+
fontName: string;
|
|
4
|
+
fontSize: number;
|
|
5
|
+
x: number;
|
|
6
|
+
y: number;
|
|
7
|
+
baselineY: number;
|
|
8
|
+
rotation: number;
|
|
9
|
+
width: number;
|
|
10
|
+
height: number;
|
|
11
|
+
color: string;
|
|
12
|
+
fontWeight: string;
|
|
13
|
+
fontStyle: string;
|
|
14
|
+
blockNo: number;
|
|
15
|
+
lineNo: number;
|
|
16
|
+
orderIndex: number;
|
|
17
|
+
}
|
|
18
|
+
/**
 * A group of text spans together with a rectangle (`x`, `y`, `width`, `height`).
 *
 * NOTE(review): presumably the spans sharing one `blockNo` and their bounding
 * box — the code that builds these is not visible here; confirm against
 * text-blocks.js.
 */
export interface TextBlock {
|
|
19
|
+
spans: TextSpan[];
|
|
20
|
+
x: number;
|
|
21
|
+
y: number;
|
|
22
|
+
width: number;
|
|
23
|
+
height: number;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=text-types.d.ts.map
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@polotno/pdf-import",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "Convert PDF files into Polotno JSON format",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./lib/index.js",
|
|
7
|
+
"types": "./lib/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"import": "./lib/index.js",
|
|
11
|
+
"types": "./lib/index.d.ts"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsc && node build.js",
|
|
16
|
+
"test": "vitest run tests/index.test.ts",
|
|
17
|
+
"test:update": "vitest run tests/index.test.ts --update",
|
|
18
|
+
"test:visual": "node --max-old-space-size=4096 ./node_modules/.bin/vitest run --config vitest.visual.config.ts tests/visual-regression.test.ts",
|
|
19
|
+
"dev": "vite client"
|
|
20
|
+
},
|
|
21
|
+
"author": "Anton Lavrenov",
|
|
22
|
+
"files": [
|
|
23
|
+
"lib",
|
|
24
|
+
"README.md"
|
|
25
|
+
],
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"opentype.js": "^1.3.4",
|
|
28
|
+
"pdfjs-dist": "^4.10.38"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"@types/node": "^25.3.3",
|
|
32
|
+
"@types/react": "^19.2.14",
|
|
33
|
+
"@types/react-dom": "^19.2.3",
|
|
34
|
+
"@vitejs/plugin-react": "^5.1.4",
|
|
35
|
+
"esbuild": "^0.27.3",
|
|
36
|
+
"polotno": "^2.37.1",
|
|
37
|
+
"polotno-node": "^2.15.13",
|
|
38
|
+
"react": "^18.3.1",
|
|
39
|
+
"react-dom": "^18",
|
|
40
|
+
"sharp": "^0.34.5",
|
|
41
|
+
"ssim.js": "^3.5.0",
|
|
42
|
+
"typescript": "~5.9.3",
|
|
43
|
+
"vite": "^7.3.1",
|
|
44
|
+
"vitest": "^4.0.18"
|
|
45
|
+
}
|
|
46
|
+
}
|