@polotno/pdf-export 0.1.36 → 0.1.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/index.js +2 -2
- package/lib/pdf-import/coordinate-transform.d.ts +51 -0
- package/lib/pdf-import/coordinate-transform.js +99 -0
- package/lib/pdf-import/element-builder.d.ts +21 -0
- package/lib/pdf-import/element-builder.js +163 -0
- package/lib/pdf-import/font-mapper.d.ts +17 -0
- package/lib/pdf-import/font-mapper.js +142 -0
- package/lib/pdf-import/index.d.ts +35 -0
- package/lib/pdf-import/index.js +105 -0
- package/lib/pdf-import/parser.d.ts +29 -0
- package/lib/pdf-import/parser.js +285 -0
- package/lib/pdf-import/text-analysis.d.ts +17 -0
- package/lib/pdf-import/text-analysis.js +186 -0
- package/lib/pdf-import/types.d.ts +101 -0
- package/lib/scripts/compare-json.d.ts +1 -0
- package/lib/scripts/compare-json.js +141 -0
- package/lib/text/fonts.d.ts +1 -0
- package/lib/text/fonts.js +49 -3
- package/lib/text.d.ts +0 -10
- package/lib/text.js +161 -862
- package/package.json +1 -1
- package/lib/browser-entry.d.ts +0 -7
- package/lib/browser-entry.js +0 -11
- package/lib/core/index.d.ts +0 -26
- package/lib/core/index.js +0 -87
- package/lib/platform/adapter.d.ts +0 -37
- package/lib/platform/adapter.js +0 -13
- package/lib/platform/browser-polyfill.js +0 -5
- package/lib/platform/browser.d.ts +0 -7
- package/lib/platform/browser.js +0 -145
- package/lib/platform/node.d.ts +0 -7
- package/lib/platform/node.js +0 -142
- /package/lib/{platform/browser-polyfill.d.ts → pdf-import/types.js} +0 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import { createCanvas } from 'canvas';
|
|
4
|
+
import { calculateFontSize, extractPosition } from './coordinate-transform.js';
|
|
5
|
+
/**
 * Load a PDF document from a file path or from in-memory binary data.
 *
 * @param {string|ArrayBuffer|ArrayLike<number>} source - A string is treated as a
 *   file path and read from disk; any other value is passed to `new Uint8Array(source)`.
 * @returns {Promise<object>} Resolves with the value of the pdf.js loading task's `promise`.
 */
export async function loadPDF(source) {
    // Read asynchronously so a large file does not block the event loop
    // (the previous implementation used readFileSync inside this async function).
    const bytes = typeof source === 'string'
        ? await fs.promises.readFile(source)
        : source;
    // Copy into a plain Uint8Array before handing the data to pdf.js.
    const data = new Uint8Array(bytes);
    const loadingTask = pdfjsLib.getDocument({ data });
    return loadingTask.promise;
}
|
|
27
|
+
/**
 * Report how many pages a loaded PDF document has.
 *
 * @param {{ numPages: number }} pdfDoc - Document object exposing `numPages`.
 * @returns {number} The document's `numPages` value.
 */
export function getPageCount(pdfDoc) {
    const { numPages } = pdfDoc;
    return numPages;
}
|
|
33
|
+
/**
 * Collect size and rotation information for one page.
 *
 * @param {object} page - Page object exposing `getViewport({ scale })`.
 * @param {number} pageNumber - Echoed back unchanged in the result.
 * @returns {Promise<{pageNumber: number, width: number, height: number, rotate: number}>}
 *   Viewport width/height at scale 1.0; `rotate` is the viewport rotation, or 0 when falsy.
 */
export async function extractPageMetadata(page, pageNumber) {
    const { width, height, rotation } = page.getViewport({ scale: 1.0 });
    const rotate = rotation ? rotation : 0;
    return { pageNumber, width, height, rotate };
}
|
|
45
|
+
/**
 * Extract the non-blank text items of a page together with font and position data.
 *
 * @param {object} page - Page object exposing `getTextContent(options)`.
 * @returns {Promise<Array<object>>} One entry per text item with `str`, `transform`,
 *   `width`, `height`, `fontName`, `fontSize`, `hasEOL`, `dir` and `color`.
 */
export async function extractTextItems(page) {
    const textContent = await page.getTextContent({
        includeMarkedContent: true,
    });
    const styles = textContent.styles ?? {};
    const items = [];
    for (const item of textContent.items) {
        // Entries with no text (or only whitespace) carry nothing to import.
        if (!item.str || item.str.trim().length === 0) {
            continue;
        }
        const fontSize = calculateFontSize(item.transform);
        // Use the item's colour when it is an array; otherwise default to black.
        const color = Array.isArray(item.color) ? item.color : [0, 0, 0];
        // A missing style entry used to throw and abort the whole page;
        // fall back to the raw font identifier instead.
        const fontName = styles[item.fontName]?.fontFamily ?? item.fontName;
        items.push({
            str: item.str,
            transform: item.transform,
            width: item.width,
            height: item.height,
            fontName,
            fontSize,
            hasEOL: item.hasEOL || false,
            dir: item.dir || 'ltr',
            color,
        });
    }
    return items;
}
|
|
81
|
+
/**
 * Multiply two 6-element affine matrices `[a, b, c, d, e, f]`.
 *
 * The linear part is the 2x2 product of `m1` and `m2`; the offset of `m1`
 * is pushed through `m2`'s linear part and `m2`'s offset is added.
 *
 * @param {number[]} m1 - Left-hand matrix.
 * @param {number[]} m2 - Right-hand matrix.
 * @returns {number[]} The combined 6-element matrix.
 */
function multiplyMatrices(m1, m2) {
    const [p, q, r, s, u, v] = m1;
    const [w, x, y, z, tx, ty] = m2;
    const topRow = [p * w + q * y, p * x + q * z];
    const bottomRow = [r * w + s * y, r * x + s * z];
    const offset = [u * w + v * y + tx, u * x + v * z + ty];
    return [...topRow, ...bottomRow, ...offset];
}
|
|
96
|
+
/**
 * Work out the transform that applies to an image paint operation.
 *
 * Scans the operator list backwards from the image, remembering the nearest
 * "pure translation" matrix (a ≈ 1, b ≈ 0, c ≈ 0, |d| ≈ 1) and the nearest
 * matrix of any other shape. When both are found, the result takes a-d from
 * the latter and the offset (e, f) from the former.
 *
 * NOTE(review): the numeric op codes (32/11 stop the scan, 35/12 are read as
 * transforms) are not defined in this block; verify them against the OPS
 * table of the pdfjs-dist version in use.
 *
 * @param {{fnArray: number[], argsArray: Array}} ops - Operator list.
 * @param {number} imageIndex - Index of the image operation within `ops`.
 * @returns {number[]} 6-element matrix; identity when nothing was found.
 */
function findTransformForImage(ops, imageIndex) {
    const EPSILON = 0.001;
    const stopsScan = (code) => code === 32 || code === 11;
    const carriesMatrix = (code) => code === 35 || code === 12;
    const looksLikeTranslation = ([a, b, c, d]) => Math.abs(a - 1) < EPSILON
        && Math.abs(b) < EPSILON
        && Math.abs(c) < EPSILON
        && Math.abs(Math.abs(d) - 1) < EPSILON;

    let translation = null;
    let scale = null;
    let cursor = imageIndex - 1;
    // Keep walking until the start of the list, a stop op, or both matrices are known.
    while (cursor >= 0 && !(translation && scale)) {
        const code = ops.fnArray[cursor];
        if (stopsScan(code)) {
            break;
        }
        if (carriesMatrix(code)) {
            const matrix = ops.argsArray[cursor];
            if (Array.isArray(matrix) && matrix.length >= 6) {
                // Only the matrix closest to the image is kept for each category.
                if (looksLikeTranslation(matrix)) {
                    if (translation === null) {
                        translation = matrix;
                    }
                }
                else if (scale === null) {
                    scale = matrix;
                }
            }
        }
        cursor -= 1;
    }

    if (translation && scale) {
        const [a, b, c, d] = scale;
        return [a, b, c, d, translation[4], translation[5]];
    }
    // Fall back to whichever matrix exists, then to identity.
    return scale ?? translation ?? [1, 0, 0, 1, 0, 0];
}
|
|
155
|
+
/**
 * Poll `page.objs` until the named object is available, or give up after a timeout.
 *
 * @param {{objs: {has: Function, get: Function}}} page - Object whose `objs` store is polled.
 * @param {string} objectName - Key looked up via `objs.has` / `objs.get`.
 * @param {number} [timeoutMs=5000] - Maximum time to keep polling, in milliseconds.
 * @returns {Promise<*>} The first truthy value returned by `objs.get(objectName)`.
 * @throws {Error} When no truthy object was obtained before the timeout elapsed.
 */
async function waitForObject(page, objectName, timeoutMs = 5000) {
    const retryDelayMs = 50;
    const startTime = Date.now();
    while (Date.now() - startTime < timeoutMs) {
        try {
            if (page.objs.has(objectName)) {
                const obj = page.objs.get(objectName);
                if (obj) {
                    return obj;
                }
            }
        }
        catch {
            // A failing lookup is treated the same as "not available yet".
        }
        // The pause must sit outside the try block: previously a throwing lookup
        // skipped it and the loop spun synchronously until the timeout expired,
        // blocking the event loop for the whole duration.
        await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
    }
    throw new Error(`Timeout waiting for ${objectName} after ${timeoutMs}ms`);
}
|
|
179
|
+
/**
 * Expand decoded pdf.js pixel data to RGBA so it can be copied into canvas ImageData.
 *
 * pdf.js `ImageKind` values: 1 = GRAYSCALE_1BPP (packed bits, rows padded to whole
 * bytes), 2 = RGB_24BPP, 3 = RGBA_32BPP. Any other kind is returned unchanged.
 *
 * @param {{width: number, height: number, data: ArrayLike<number>, kind?: number}} image
 * @returns {ArrayLike<number>} RGBA bytes (4 per pixel) for kinds 1 and 2, else `image.data`.
 */
function toRgbaPixels(image) {
    const { width, height, data, kind } = image;
    if (kind === 2) {
        const rgba = new Uint8ClampedArray(width * height * 4);
        for (let src = 0, dst = 0; dst < rgba.length; src += 3, dst += 4) {
            rgba[dst] = data[src];
            rgba[dst + 1] = data[src + 1];
            rgba[dst + 2] = data[src + 2];
            rgba[dst + 3] = 255;
        }
        return rgba;
    }
    if (kind === 1) {
        const rgba = new Uint8ClampedArray(width * height * 4);
        const bytesPerRow = (width + 7) >> 3;
        for (let row = 0; row < height; row++) {
            for (let col = 0; col < width; col++) {
                const packed = data[row * bytesPerRow + (col >> 3)];
                // Most significant bit is the leftmost pixel; a set bit is white.
                const value = (packed & (0x80 >> (col & 7))) ? 255 : 0;
                const dst = (row * width + col) * 4;
                rgba[dst] = value;
                rgba[dst + 1] = value;
                rgba[dst + 2] = value;
                rgba[dst + 3] = 255;
            }
        }
        return rgba;
    }
    return data;
}

/**
 * Extract the images painted on a page and re-encode each one to a buffer.
 *
 * Failures are logged with `console.warn` and never thrown: a failing image is
 * dropped from the result, and a failure to read the operator list yields `[]`.
 *
 * @param {object} page - Page object exposing `getOperatorList()`, `objs` and
 *   (optionally) `commonObjs`.
 * @returns {Promise<Array<{buffer: Buffer, mimeType: string, width: number, height: number, transform: number[]}>>}
 */
export async function extractImages(page) {
    try {
        const ops = await page.getOperatorList();
        // First pass: collect image paint operations (op codes 85 and 86).
        const imageOperations = [];
        for (let i = 0; i < ops.fnArray.length; i++) {
            const fn = ops.fnArray[i];
            if (fn !== 85 && fn !== 86) {
                continue;
            }
            const ref = ops.argsArray[i]?.[0];
            if (ref) {
                imageOperations.push({ ref, index: i });
            }
        }
        if (imageOperations.length === 0) {
            return [];
        }
        // Second pass: resolve and encode all images concurrently.
        const results = await Promise.all(imageOperations.map(async ({ ref, index }) => {
            const label = typeof ref === 'string' ? ref : '<inline image>';
            try {
                // Inline image operations carry the image object itself rather than
                // a name; looking that up in the object store could only time out.
                let image = ref;
                if (typeof ref === 'string') {
                    // pdf.js keeps objects whose id starts with "g_" in the
                    // document-wide store instead of the per-page one.
                    const store = ref.startsWith('g_') && page.commonObjs
                        ? page.commonObjs
                        : page.objs;
                    image = await waitForObject({ objs: store }, ref, 5000);
                }
                if (!image || !image.data) {
                    return null;
                }
                const canvas = createCanvas(image.width, image.height);
                const ctx = canvas.getContext('2d');
                const imageData = ctx.createImageData(image.width, image.height);
                // ImageData is always RGBA; RGB and 1-bit sources must be expanded
                // first or the pixels end up misaligned.
                imageData.data.set(toRgbaPixels(image));
                ctx.putImageData(imageData, 0, 0);
                let buffer;
                let mimeType;
                // NOTE(review): pdf.js defines ImageKind 1 as GRAYSCALE_1BPP, not
                // "source was JPEG". The JPEG branch is kept so existing output
                // mime types do not change — confirm the intended behaviour.
                if (image.kind === 1) {
                    buffer = canvas.toBuffer('image/jpeg', { quality: 0.9 });
                    mimeType = 'image/jpeg';
                }
                else {
                    buffer = canvas.toBuffer('image/png');
                    mimeType = 'image/png';
                }
                const transform = findTransformForImage(ops, index);
                return {
                    buffer,
                    mimeType,
                    width: image.width,
                    height: image.height,
                    transform,
                };
            }
            catch (error) {
                console.warn(`Failed to extract image ${label}:`, error instanceof Error ? error.message : error);
                return null;
            }
        }));
        // Drop the images that could not be extracted.
        return results.filter((img) => img !== null);
    }
    catch (error) {
        console.warn('Failed to extract images from page:', error);
        return [];
    }
}
|
|
256
|
+
/**
 * Load one page and gather its metadata, text items and images.
 *
 * The three extraction steps are started together and awaited as a group.
 *
 * @param {object} pdfDoc - Document object exposing `getPage(pageNumber)`.
 * @param {number} pageNumber - Page to load, forwarded to `getPage`.
 * @returns {Promise<{metadata: object, textItems: Array, images: Array}>}
 */
export async function parsePage(pdfDoc, pageNumber) {
    const page = await pdfDoc.getPage(pageNumber);
    const metadataTask = extractPageMetadata(page, pageNumber);
    const textTask = extractTextItems(page);
    const imagesTask = extractImages(page);
    const [metadata, textItems, images] = await Promise.all([metadataTask, textTask, imagesTask]);
    return { metadata, textItems, images };
}
|
|
272
|
+
/**
 * Parse a whole PDF document, or a selection of its pages.
 *
 * @param {string|ArrayBuffer|ArrayLike<number>} source - Forwarded to `loadPDF`.
 * @param {number[]} [pageNumbers] - 1-based page numbers to parse. Entries that are
 *   not integers within `1..numPages` are ignored. When omitted or empty, every
 *   page is parsed.
 * @returns {Promise<Array<object>>} One `parsePage` result per selected page, in order.
 */
export async function parsePDF(source, pageNumbers) {
    const pdfDoc = await loadPDF(source);
    try {
        const totalPages = getPageCount(pdfDoc);
        // Non-integer entries are skipped like out-of-range ones; previously they
        // passed the range check and made the page lookup fail.
        const pagesToParse = pageNumbers && pageNumbers.length > 0
            ? pageNumbers.filter((n) => Number.isInteger(n) && n >= 1 && n <= totalPages)
            : Array.from({ length: totalPages }, (_, i) => i + 1);
        // Pages are parsed concurrently.
        return await Promise.all(pagesToParse.map((pageNum) => parsePage(pdfDoc, pageNum)));
    }
    finally {
        // Release the pdf.js document once parsing is done (or has failed);
        // it was previously never destroyed.
        await pdfDoc.destroy();
    }
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { PDFTextItem, TextBlock, PDFImportOptions } from './types.js';
|
|
2
|
+
/**
 * Order text items for reading: top of the page first, then left to right.
 */
export declare function sortTextItems(items: Array<PDFTextItem>, pageHeight: number): Array<PDFTextItem>;
|
|
6
|
+
/**
 * Group text items into text blocks.
 */
export declare function clusterTextItems(items: Array<PDFTextItem>, pageHeight: number, pageWidth: number, options: PDFImportOptions): Array<TextBlock>;
|
|
10
|
+
/**
 * Guess a block's alignment from where it sits across the page width.
 */
export declare function detectAlignment(block: TextBlock, pageWidth: number): 'left' | 'center' | 'right';
|
|
14
|
+
/**
 * Run alignment detection over every block.
 */
export declare function applyAlignmentDetection(blocks: Array<TextBlock>, pageWidth: number): Array<TextBlock>;
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import { extractPosition, extractRotation, pdfToPolotnoX, pdfToPolotnoY, pdfColorToHex, } from './coordinate-transform.js';
|
|
2
|
+
import { mapFont } from './font-mapper.js';
|
|
3
|
+
/**
 * Order text items for reading: top of the page first, then left to right.
 *
 * Items whose top-origin Y values differ by less than 5 are treated as being
 * on the same line and ordered by X. The array is sorted in place and the
 * same array is returned.
 *
 * @param {Array<object>} items - Text items with a `transform` matrix.
 * @param {number} pageHeight - Used to flip PDF Y (bottom origin) to top origin.
 * @returns {Array<object>} The `items` array, now sorted.
 */
export function sortTextItems(items, pageHeight) {
    const SAME_LINE_TOLERANCE = 5;
    const byReadingOrder = (first, second) => {
        const { x: firstX, y: firstPdfY } = extractPosition(first.transform);
        const { x: secondX, y: secondPdfY } = extractPosition(second.transform);
        const firstTop = pageHeight - firstPdfY;
        const secondTop = pageHeight - secondPdfY;
        const onSameLine = Math.abs(firstTop - secondTop) < SAME_LINE_TOLERANCE;
        return onSameLine ? firstX - secondX : firstTop - secondTop;
    };
    return items.sort(byReadingOrder);
}
|
|
23
|
+
/**
 * Decide whether two consecutive text items belong to the same text block.
 *
 * Items are kept together only when all of the following hold:
 * - their vertical distance does not exceed `item1.fontSize`
 *   (`options.textClusterThreshold.vertical` is not consulted here);
 * - if they are on the same line (vertical distance < 5), the horizontal gap
 *   after `item1` does not exceed `options.textClusterThreshold.horizontal`
 *   (default 10);
 * - they share the same `fontName`;
 * - their font sizes differ by at most 1.
 *
 * @param {object} item1 - Earlier item.
 * @param {object} item2 - Later item.
 * @param {number} pageHeight - Used to flip PDF Y to top origin.
 * @param {object} options - Import options.
 * @returns {boolean} True when the items should be merged.
 */
function shouldMerge(item1, item2, pageHeight, options) {
    const first = extractPosition(item1.transform);
    const second = extractPosition(item2.transform);
    const maxVerticalGap = item1.fontSize;
    const maxHorizontalGap = options.textClusterThreshold?.horizontal ?? 10;

    const verticalGap = Math.abs((pageHeight - first.y) - (pageHeight - second.y));
    const horizontalGap = Math.abs(second.x - (first.x + item1.width));

    const tooFarVertically = verticalGap > maxVerticalGap;
    const tooFarOnSameLine = verticalGap < 5 && horizontalGap > maxHorizontalGap;
    const fontDiffers = item1.fontName !== item2.fontName;
    const sizeDiffers = Math.abs((item1.fontSize || 0) - (item2.fontSize || 0)) > 1;

    return !(tooFarVertically || tooFarOnSameLine || fontDiffers || sizeDiffers);
}
|
|
55
|
+
/**
 * Group text items into text blocks.
 *
 * Items are sorted into reading order, items whose font size is below
 * `options.minTextBlockSize` (default 8) are discarded, and each remaining
 * item either joins the group of the item before it or starts a new group.
 *
 * @param {Array<object>} items - Text items (sorted in place by `sortTextItems`).
 * @param {number} pageHeight - Page height.
 * @param {number} pageWidth - Not used by this function.
 * @param {object} options - Import options.
 * @returns {Array<object>} One text block per group, in reading order.
 */
export function clusterTextItems(items, pageHeight, pageWidth, options) {
    if (items.length === 0) {
        return [];
    }
    const ordered = sortTextItems(items, pageHeight);
    const minSize = options.minTextBlockSize ?? 8;
    const candidates = ordered.filter((item) => (item.fontSize || 0) >= minSize);

    const groups = [];
    let previous = null;
    for (const item of candidates) {
        const joinsPrevious = previous !== null
            && shouldMerge(previous, item, pageHeight, options);
        if (joinsPrevious) {
            groups[groups.length - 1].push(item);
        }
        else {
            groups.push([item]);
        }
        previous = item;
    }
    return groups.map((group) => createTextBlock(group, pageHeight, options));
}
|
|
93
|
+
/**
 * Build a single text block from a group of text items.
 *
 * Text is concatenated in order: a space is inserted between items on the
 * same line when the gap exceeds 2 and neither side already supplies one, and
 * a newline is inserted when the next item is on a different line. Font,
 * colour and rotation come from the first item of the group.
 *
 * @param {Array<object>} items - Non-empty group of text items.
 * @param {number} pageHeight - Page height used for coordinate conversion.
 * @param {object} options - Import options; `fontMapping` is passed to `mapFont`.
 * @returns {object} Text block with text, geometry, font and colour properties.
 */
function createTextBlock(items, pageHeight, options) {
    const positions = items.map((item) => extractPosition(item.transform));
    const lastIndex = items.length - 1;

    let text = '';
    items.forEach((item, index) => {
        text += item.str;
        if (index >= lastIndex) {
            return;
        }
        const nextItem = items[index + 1];
        const here = positions[index];
        const there = positions[index + 1];
        const sameLine = Math.abs((pageHeight - here.y) - (pageHeight - there.y)) < 5;
        if (!sameLine) {
            // Line change: separate with a single newline.
            if (!text.endsWith('\n')) {
                text += '\n';
            }
            return;
        }
        const gap = there.x - (here.x + item.width);
        const alreadySpaced = nextItem.str.startsWith(' ') || item.str.endsWith(' ');
        if (gap > 2 && !alreadySpaced) {
            text += ' ';
        }
    });

    // Bounding box over every item in the group.
    const box = items.reduce((acc, item, index) => ({
        minX: Math.min(acc.minX, positions[index].x),
        maxX: Math.max(acc.maxX, positions[index].x + item.width),
        minY: Math.min(acc.minY, positions[index].y),
        maxY: Math.max(acc.maxY, positions[index].y + item.height),
    }), { minX: Infinity, maxX: -Infinity, minY: Infinity, maxY: -Infinity });
    const width = box.maxX - box.minX;
    const height = box.maxY - box.minY;

    // The first item stands in for the whole group's styling.
    const [firstItem] = items;
    const fontSize = firstItem.fontSize || 12;
    const rotation = extractRotation(firstItem.transform);
    const mappedFont = mapFont(firstItem.fontName, options.fontMapping);
    const color = pdfColorToHex(firstItem.color || [0, 0, 0]);

    const x = pdfToPolotnoX(box.minX);
    const y = pdfToPolotnoY(box.minY, height, pageHeight);

    return {
        text,
        x,
        y,
        width,
        height,
        fontName: mappedFont.family,
        fontSize,
        fontWeight: mappedFont.weight,
        fontStyle: mappedFont.style,
        color,
        rotation,
        align: 'left', // Placeholder; alignment detection may overwrite it later
    };
}
|
|
159
|
+
/**
 * Guess a block's alignment from where it sits across the page width.
 *
 * A block whose centre lies within 10% of the page width from the page centre
 * is 'center'; otherwise a block whose right edge lies within 10% of the page
 * width from the right page edge is 'right'; anything else is 'left'.
 *
 * @param {{x: number, width: number}} block - Block geometry.
 * @param {number} pageWidth - Page width.
 * @returns {'left'|'center'|'right'} Detected alignment.
 */
export function detectAlignment(block, pageWidth) {
    const tolerance = pageWidth * 0.1;
    const centreOffset = Math.abs((block.x + block.width / 2) - pageWidth / 2);
    if (centreOffset < tolerance) {
        return 'center';
    }
    const rightEdgeOffset = Math.abs(block.x + block.width - pageWidth);
    return rightEdgeOffset < tolerance ? 'right' : 'left';
}
|
|
178
|
+
/**
 * Run alignment detection over every block.
 *
 * @param {Array<object>} blocks - Text blocks.
 * @param {number} pageWidth - Page width passed to `detectAlignment`.
 * @returns {Array<object>} New block objects (shallow copies) with `align` set.
 */
export function applyAlignmentDetection(blocks, pageWidth) {
    return blocks.map((block) => {
        const align = detectAlignment(block, pageWidth);
        return { ...block, align };
    });
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
 * Settings that control PDF to JSON conversion.
 */
export interface PDFImportOptions {
    /** Pages to import — presumably 1-based page numbers; confirm against the caller. */
    pageNumbers?: Array<number>;
    imageMode?: 'dataURL' | 'upload';
    /** NOTE(review): presumably used when `imageMode` is 'upload' — confirm. */
    imageUploadFn?: (buffer: Buffer, mimeType: string) => Promise<string>;
    /** Passed to `mapFont` when text blocks are built. */
    fontMapping?: Record<string, string>;
    /** Text items with a smaller font size are dropped during clustering (default 8). */
    minTextBlockSize?: number;
    textClusterThreshold?: {
        /** Declared, but not read by the visible clustering code. */
        vertical?: number;
        /** Largest same-line gap between items that still merges them (default 10). */
        horizontal?: number;
    };
    outputUnit?: 'px' | 'cm' | 'in';
    dpi?: number;
}
|
|
17
|
+
/**
 * A single text item as read from the PDF, with position and font information.
 */
export interface PDFTextItem {
    /** The item's text. */
    str: string;
    /** Transformation matrix from which position, rotation and font size are derived. */
    transform: Array<number>;
    width: number;
    height: number;
    fontName: string;
    fontSize?: number;
    hasEOL?: boolean;
    /** Text direction; the parser defaults it to 'ltr'. */
    dir?: string;
    /** Colour components; the parser defaults it to `[0, 0, 0]`. */
    color?: Array<number>;
}
|
|
31
|
+
/**
 * A group of text items merged into one block.
 */
export interface TextBlock {
    /** Combined text of the group; lines are separated by '\n'. */
    text: string;
    x: number;
    y: number;
    width: number;
    height: number;
    /** Font family produced by the font mapper. */
    fontName: string;
    fontSize: number;
    fontWeight: string;
    fontStyle: string;
    /** Colour string produced by `pdfColorToHex`. */
    color: string;
    rotation: number;
    /** 'left' on creation; alignment detection sets 'left', 'center' or 'right'. */
    align?: string;
}
|
|
48
|
+
/**
 * An image pulled out of a PDF page, already encoded to a buffer.
 */
export interface PDFImageObject {
    /** Encoded image bytes. */
    buffer: Buffer;
    /** Mime type matching the encoding of `buffer`. */
    mimeType: string;
    width: number;
    height: number;
    /** 6-element transformation matrix found for the image. */
    transform: Array<number>;
}
|
|
58
|
+
/**
 * An image element in the shape expected by the Polotno JSON output.
 */
export interface ImageBlock {
    /** Image source string. */
    src: string;
    x: number;
    y: number;
    width: number;
    height: number;
    rotation: number;
}
|
|
69
|
+
/**
 * Font details derived from a PDF font name.
 */
export interface ParsedFont {
    /** Font family name. */
    family: string;
    /** Font weight, as a string. */
    weight: string;
    /** Font style, as a string. */
    style: string;
}
|
|
77
|
+
/**
 * Page information needed when converting coordinates.
 */
export interface CoordinateContext {
    pageHeight: number;
    pageWidth: number;
    /** Optional rotation value. */
    rotation?: number;
}
|
|
85
|
+
/**
 * Size and rotation information for one PDF page.
 */
export interface PDFPageMetadata {
    pageNumber: number;
    /** Viewport width at scale 1.0. */
    width: number;
    /** Viewport height at scale 1.0. */
    height: number;
    /** Viewport rotation; the parser stores 0 when none is reported. */
    rotate?: number;
}
|
|
94
|
+
/**
 * Everything extracted from a single PDF page.
 */
export interface ParsedPage {
    metadata: PDFPageMetadata;
    textItems: Array<PDFTextItem>;
    images: Array<PDFImageObject>;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|