@polotno/pdf-export 0.1.36 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,285 @@
1
+ import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
2
+ import fs from 'fs';
3
+ import { createCanvas } from 'canvas';
4
+ import { calculateFontSize, extractPosition } from './coordinate-transform.js';
5
/**
 * Open a PDF document with pdf.js.
 *
 * @param source A string is treated as a file path and read synchronously
 *   from disk; any other value is handed to the `Uint8Array` constructor.
 * @returns The value the pdf.js loading task's `promise` resolves to.
 */
export async function loadPDF(source) {
    const isFilePath = typeof source === 'string';
    const bytes = new Uint8Array(isFilePath ? fs.readFileSync(source) : source);
    const task = pdfjsLib.getDocument({
        data: bytes,
        // useSystemFonts: false,
        // standardFontDataUrl: undefined,
        // useWorkerFetch: false,
        // isEvalSupported: true,
    });
    return await task.promise;
}
27
/**
 * Report how many pages a loaded document has.
 *
 * @param pdfDoc Document object exposing a `numPages` property.
 * @returns The value of `pdfDoc.numPages`.
 */
export function getPageCount(pdfDoc) {
    const { numPages } = pdfDoc;
    return numPages;
}
33
+ /**
34
+ * Extract metadata from a single PDF page
35
+ */
36
+ export async function extractPageMetadata(page, pageNumber) {
37
+ const viewport = page.getViewport({ scale: 1.0 });
38
+ return {
39
+ pageNumber,
40
+ width: viewport.width,
41
+ height: viewport.height,
42
+ rotate: viewport.rotation || 0,
43
+ };
44
+ }
45
/**
 * Extract the visible text runs of a page together with font metadata.
 *
 * @param page Page object exposing `getTextContent(options)`.
 * @returns One record per non-blank text run with `str`, `transform`, `width`,
 *   `height`, `fontName`, `fontSize`, `hasEOL`, `dir` and `color`.
 *
 * `fontName` is the `fontFamily` of the run's entry in `textContent.styles`.
 * When that entry is missing, the raw `item.fontName` is used instead of
 * throwing, so one unresolved font no longer discards the whole page's text.
 */
export async function extractTextItems(page) {
    const textContent = await page.getTextContent({
        includeMarkedContent: true,
    });
    const styles = textContent.styles ?? {};
    const items = [];
    for (const item of textContent.items) {
        // Entries without `str` (e.g. the marked-content markers requested above)
        // and whitespace-only runs carry no text to import.
        if (!item.str || item.str.trim().length === 0) {
            continue;
        }
        // Use the run's own colour when it is an array, otherwise default to black.
        // NOTE(review): confirm that pdf.js text items ever carry a `color` field.
        const color = item.color && Array.isArray(item.color) ? item.color : [0, 0, 0];
        items.push({
            str: item.str,
            transform: item.transform,
            width: item.width,
            height: item.height,
            fontName: styles[item.fontName]?.fontFamily ?? item.fontName,
            fontSize: calculateFontSize(item.transform),
            hasEOL: item.hasEOL || false,
            dir: item.dir || 'ltr',
            color,
        });
    }
    return items;
}
81
/**
 * Multiply two 2D affine matrices given as `[a, b, c, d, e, f]` arrays.
 *
 * Each (x, y) pair of `m1` is run through the linear part of `m2`; the
 * translation of `m2` is then added to the transformed translation of `m1`.
 *
 * @returns A new six-element array; neither input is modified.
 */
function multiplyMatrices(m1, m2) {
    const [a, b, c, d, tx, ty] = m2;
    // Row vector (x, y) times the 2x2 linear part of m2.
    const throughM2 = (x, y) => [x * a + y * c, x * b + y * d];
    const [r0, r1] = throughM2(m1[0], m1[1]);
    const [r2, r3] = throughM2(m1[2], m1[3]);
    const [r4, r5] = throughM2(m1[4], m1[5]);
    return [r0, r1, r2, r3, r4 + tx, r5 + ty];
}
96
/**
 * Work out the placement matrix for the image painted at `imageIndex`.
 *
 * The operator list is scanned backwards from the image. Among operators with
 * code 35 or 12 whose argument is an array of at least six entries, the nearest
 * "translation-like" matrix (a ~ 1, b ~ 0, c ~ 0, |d| ~ 1) and the nearest
 * matrix of any other shape are remembered. The scan stops at an operator with
 * code 32 or 11, or as soon as both kinds have been found.
 *
 * NOTE(review): the original comments call 32/11 "restore" and 35/12
 * "transform"; in pdf.js's OPS table 11 and 12 appear to be restore/transform
 * while 32 and 35 look like endText/setHScale — confirm against the pdf.js
 * version in use.
 *
 * @returns `[a, b, c, d]` of the scale matrix combined with `[e, f]` of the
 *   translation matrix when both exist; otherwise whichever one exists (the
 *   original array, not a copy); otherwise the identity matrix.
 */
function findTransformForImage(ops, imageIndex) {
    const EPSILON = 0.001;
    const near = (value, target) => Math.abs(value - target) < EPSILON;
    let translation = null;
    let scale = null;
    for (let idx = imageIndex - 1; idx >= 0 && !(translation && scale); idx--) {
        const opCode = ops.fnArray[idx];
        if (opCode === 32 || opCode === 11) {
            break;
        }
        if (opCode !== 35 && opCode !== 12) {
            continue;
        }
        const matrix = ops.argsArray[idx];
        if (!matrix || !Array.isArray(matrix) || matrix.length < 6) {
            continue;
        }
        const translationLike = near(matrix[0], 1) && near(matrix[1], 0) &&
            near(matrix[2], 0) && near(Math.abs(matrix[3]), 1);
        // Only the first (nearest) match of each kind is kept.
        if (translationLike) {
            translation = translation || matrix;
        }
        else {
            scale = scale || matrix;
        }
    }
    if (translation && scale) {
        // Linear part from the scale matrix, position from the translation matrix.
        return [scale[0], scale[1], scale[2], scale[3], translation[4], translation[5]];
    }
    return scale || translation || [1, 0, 0, 1, 0, 0];
}
155
+ /**
156
+ * Wait for a PDF object to be resolved with timeout and retry logic
157
+ */
158
+ async function waitForObject(page, objectName, timeoutMs = 5000) {
159
+ const startTime = Date.now();
160
+ const retryDelay = 50; // ms between retries
161
+ while (Date.now() - startTime < timeoutMs) {
162
+ try {
163
+ // Check if object exists in the objects dictionary
164
+ if (page.objs.has(objectName)) {
165
+ const obj = page.objs.get(objectName);
166
+ if (obj) {
167
+ return obj;
168
+ }
169
+ }
170
+ // Wait a bit before retrying
171
+ await new Promise(resolve => setTimeout(resolve, retryDelay));
172
+ }
173
+ catch (error) {
174
+ // Continue retrying
175
+ }
176
+ }
177
+ throw new Error(`Timeout waiting for ${objectName} after ${timeoutMs}ms`);
178
+ }
179
+ /**
180
+ * Extract images from a PDF page with enhanced async handling
181
+ */
182
+ export async function extractImages(page) {
183
+ try {
184
+ const ops = await page.getOperatorList();
185
+ // First pass: collect all image names and their indices
186
+ const imageOperations = [];
187
+ for (let i = 0; i < ops.fnArray.length; i++) {
188
+ const fn = ops.fnArray[i];
189
+ const args = ops.argsArray[i];
190
+ // Check for image operations
191
+ // OPS.paintImageXObject = 85, OPS.paintInlineImageXObject = 86
192
+ if (fn === 85 || fn === 86) {
193
+ const imageName = args[0];
194
+ if (imageName) {
195
+ imageOperations.push({ name: imageName, index: i, opCode: fn });
196
+ }
197
+ }
198
+ }
199
+ // If no images found, return early
200
+ if (imageOperations.length === 0) {
201
+ return [];
202
+ }
203
+ // Second pass: extract all images in parallel with proper async handling
204
+ const imagePromises = imageOperations.map(async ({ name, index, opCode }) => {
205
+ try {
206
+ // Wait for the object to be resolved with timeout
207
+ const image = await waitForObject(page, name, 5000);
208
+ if (!image || !image.data) {
209
+ return null;
210
+ }
211
+ // PDF.js returns raw RGBA pixel data, we need to encode it as PNG/JPEG
212
+ const canvas = createCanvas(image.width, image.height);
213
+ const ctx = canvas.getContext('2d');
214
+ // Create ImageData from the raw pixel data
215
+ const imageData = ctx.createImageData(image.width, image.height);
216
+ imageData.data.set(image.data);
217
+ ctx.putImageData(imageData, 0, 0);
218
+ // Encode to buffer (PNG by default, JPEG if original was JPEG)
219
+ let buffer;
220
+ let mimeType;
221
+ if (image.kind === 1) {
222
+ // Original was JPEG - encode as JPEG with 90% quality
223
+ buffer = canvas.toBuffer('image/jpeg', { quality: 0.9 });
224
+ mimeType = 'image/jpeg';
225
+ }
226
+ else {
227
+ // Encode as PNG (lossless)
228
+ buffer = canvas.toBuffer('image/png');
229
+ mimeType = 'image/png';
230
+ }
231
+ // Get transformation matrix
232
+ const transform = findTransformForImage(ops, index);
233
+ return {
234
+ buffer,
235
+ mimeType,
236
+ width: image.width,
237
+ height: image.height,
238
+ transform,
239
+ };
240
+ }
241
+ catch (error) {
242
+ console.warn(`Failed to extract image ${name}:`, error instanceof Error ? error.message : error);
243
+ return null;
244
+ }
245
+ });
246
+ // Wait for all images to be processed
247
+ const results = await Promise.all(imagePromises);
248
+ // Filter out null results (failed extractions)
249
+ return results.filter((img) => img !== null);
250
+ }
251
+ catch (error) {
252
+ console.warn('Failed to extract images from page:', error);
253
+ return [];
254
+ }
255
+ }
256
/**
 * Load one page and gather its metadata, text runs and images.
 *
 * @param pdfDoc Document object exposing `getPage(pageNumber)`.
 * @param pageNumber Page to load; also recorded in the metadata.
 * @returns `{ metadata, textItems, images }` once all three extractions,
 *   which are started together, have finished.
 */
export async function parsePage(pdfDoc, pageNumber) {
    const page = await pdfDoc.getPage(pageNumber);
    const pending = [
        extractPageMetadata(page, pageNumber),
        extractTextItems(page),
        extractImages(page),
    ];
    const [metadata, textItems, images] = await Promise.all(pending);
    return { metadata, textItems, images };
}
272
/**
 * Parse a whole PDF, or a selection of its pages.
 *
 * @param source File path or PDF bytes, as accepted by `loadPDF`.
 * @param pageNumbers Optional 1-based page numbers. Entries that are not
 *   integers or fall outside `1..pageCount` are ignored. When omitted or empty
 *   every page is parsed.
 * @returns The parsed pages, in the order requested.
 *
 * The pdf.js document is destroyed before returning (also on failure) so its
 * worker resources are released; previously it was left open. Non-integer page
 * numbers used to reach `getPage` and make the whole call reject.
 */
export async function parsePDF(source, pageNumbers) {
    const pdfDoc = await loadPDF(source);
    try {
        const totalPages = getPageCount(pdfDoc);
        const pagesToParse = pageNumbers && pageNumbers.length > 0
            ? pageNumbers.filter(n => Number.isInteger(n) && n >= 1 && n <= totalPages)
            : Array.from({ length: totalPages }, (_, i) => i + 1);
        // Pages are independent of each other, so parse them concurrently.
        return await Promise.all(pagesToParse.map(pageNum => parsePage(pdfDoc, pageNum)));
    }
    finally {
        try {
            await pdfDoc.destroy();
        }
        catch (error) {
            // A cleanup failure must not mask the parse result or the original error.
            console.warn('Failed to release PDF document:', error instanceof Error ? error.message : error);
        }
    }
}
@@ -0,0 +1,17 @@
1
+ import type { PDFTextItem, TextBlock, PDFImportOptions } from './types.js';
2
+ /**
3
+ * Sort text items by position (top-to-bottom, left-to-right).
+ *
+ * Items whose vertical positions (measured from the top, i.e.
+ * `pageHeight - y`) lie within a 5-unit threshold are treated as one line
+ * and ordered by X; otherwise the item nearer the top comes first.
+ *
+ * The implementation shipped alongside this declaration sorts `items` in
+ * place and returns that same array.
+ *
+ * @param items Text items to order (mutated).
+ * @param pageHeight Page height used to flip the bottom-origin PDF Y axis.
+ * @returns The `items` array, now sorted.
4
+ */
5
+ export declare function sortTextItems(items: PDFTextItem[], pageHeight: number): PDFTextItem[];
6
+ /**
7
+ * Cluster text items into text blocks.
+ *
+ * The implementation shipped alongside this declaration sorts the items via
+ * `sortTextItems` (which reorders `items` in place), drops items whose
+ * `fontSize` is below `options.minTextBlockSize` (default 8), and then
+ * groups each item with its predecessor when they are close together and
+ * share font name and size. Every returned block has `align: 'left'`.
+ *
+ * @param items Text items of one page.
+ * @param pageHeight Page height used for the Y-axis flip.
+ * @param pageWidth Accepted for the signature; the implementation does not read it.
+ * @param options Import options (`minTextBlockSize`, `textClusterThreshold`, `fontMapping`).
+ * @returns The blocks in reading order; empty when no item survives the size filter.
8
+ */
9
+ export declare function clusterTextItems(items: PDFTextItem[], pageHeight: number, pageWidth: number, options: PDFImportOptions): TextBlock[];
10
+ /**
11
+ * Detect text alignment based on position within page.
+ *
+ * With a tolerance of 10% of `pageWidth`: returns `'center'` when the
+ * block's horizontal midpoint is within the tolerance of the page midpoint;
+ * otherwise `'right'` when the block's right edge is within the tolerance of
+ * `pageWidth`; otherwise `'left'`. The centre test is evaluated first.
+ *
+ * @param block Block whose `x` and `width` are inspected.
+ * @param pageWidth Page width in the same units as the block.
12
+ */
13
+ export declare function detectAlignment(block: TextBlock, pageWidth: number): 'left' | 'center' | 'right';
14
+ /**
15
+ * Apply alignment detection to all blocks.
+ *
+ * Returns a new array of shallow copies in which `align` is replaced by the
+ * result of `detectAlignment`; the input blocks are not modified.
+ *
+ * @param blocks Blocks to annotate.
+ * @param pageWidth Page width passed through to `detectAlignment`.
16
+ */
17
+ export declare function applyAlignmentDetection(blocks: TextBlock[], pageWidth: number): TextBlock[];
@@ -0,0 +1,186 @@
1
+ import { extractPosition, extractRotation, pdfToPolotnoX, pdfToPolotnoY, pdfColorToHex, } from './coordinate-transform.js';
2
+ import { mapFont } from './font-mapper.js';
3
/**
 * Sort text items into reading order (top-to-bottom, left-to-right).
 *
 * Items are first ordered by their distance from the top of the page
 * (`pageHeight - y`), then walked once to form lines: an item joins the current
 * line while it is less than 5 units below the line's first item. Each line is
 * finally ordered by X.
 *
 * The previous version fed a "same line if within 5 units" test straight into
 * `Array.prototype.sort`. That comparator is not transitive (A~B and B~C do not
 * imply A~C), so the resulting order was implementation-defined for items that
 * drift vertically. Bucketing gives a deterministic result.
 *
 * @param items Text items to order. The array is reordered in place, as before.
 * @param pageHeight Page height used to flip the bottom-origin PDF Y axis.
 * @returns The same `items` array, now sorted.
 */
export function sortTextItems(items, pageHeight) {
    const LINE_THRESHOLD = 5;
    // Resolve each position once instead of on every comparison.
    const entries = items.map(item => {
        const { x, y } = extractPosition(item.transform);
        return { item, x, top: pageHeight - y };
    });
    // Array.prototype.sort is stable, so ties keep their input order.
    entries.sort((a, b) => a.top - b.top);
    const ordered = [];
    let line = [];
    let lineTop = 0;
    const flushLine = () => {
        line.sort((a, b) => a.x - b.x);
        for (const entry of line) {
            ordered.push(entry.item);
        }
        line = [];
    };
    for (const entry of entries) {
        if (line.length > 0 && Math.abs(entry.top - lineTop) >= LINE_THRESHOLD) {
            flushLine();
        }
        if (line.length === 0) {
            lineTop = entry.top;
        }
        line.push(entry);
    }
    flushLine();
    // Write the result back so callers relying on in-place sorting still work.
    ordered.forEach((item, i) => {
        items[i] = item;
    });
    return items;
}
23
/**
 * Decide whether `item2` belongs in the same text block as `item1`.
 *
 * The items are merged only when all of the following hold:
 * - their vertical distance does not exceed `item1.fontSize`;
 * - if they sit on the same line (vertical distance below 5), the gap between
 *   the end of `item1` and the start of `item2` does not exceed
 *   `options.textClusterThreshold.horizontal` (default 10);
 * - they use the same `fontName`;
 * - their font sizes differ by at most 1.
 *
 * NOTE(review): `options.textClusterThreshold.vertical` is not read here; the
 * vertical limit is always the first item's font size.
 */
function shouldMerge(item1, item2, pageHeight, options) {
    const first = extractPosition(item1.transform);
    const second = extractPosition(item2.transform);
    const maxVerticalGap = item1.fontSize;
    const maxHorizontalGap = options.textClusterThreshold?.horizontal ?? 10;
    // Compare in top-origin coordinates.
    const verticalGap = Math.abs((pageHeight - first.y) - (pageHeight - second.y));
    const horizontalGap = Math.abs(second.x - (first.x + item1.width));
    const tooFarVertically = verticalGap > maxVerticalGap;
    const tooFarOnSameLine = verticalGap < 5 && horizontalGap > maxHorizontalGap;
    const fontDiffers = item1.fontName !== item2.fontName;
    const sizeDiffers = Math.abs((item1.fontSize || 0) - (item2.fontSize || 0)) > 1;
    return !(tooFarVertically || tooFarOnSameLine || fontDiffers || sizeDiffers);
}
55
/**
 * Group a page's text items into text blocks.
 *
 * Steps: sort the items with `sortTextItems`, discard those whose `fontSize`
 * is below `options.minTextBlockSize` (default 8), then start a new group every
 * time `shouldMerge` rejects an item against its immediate predecessor. Each
 * group is turned into a block by `createTextBlock`.
 *
 * @param items Text items of one page.
 * @param pageHeight Page height, forwarded to the helpers.
 * @param pageWidth Unused by this function.
 * @param options Import options.
 * @returns The blocks in reading order, or an empty array when nothing remains.
 */
export function clusterTextItems(items, pageHeight, pageWidth, options) {
    if (items.length === 0) {
        return [];
    }
    const ordered = sortTextItems(items, pageHeight);
    const minSize = options.minTextBlockSize ?? 8;
    const candidates = ordered.filter(item => (item.fontSize || 0) >= minSize);
    if (candidates.length === 0) {
        return [];
    }
    // Partition into runs of mutually mergeable neighbours.
    const groups = [[candidates[0]]];
    for (let i = 1; i < candidates.length; i++) {
        const item = candidates[i];
        if (shouldMerge(candidates[i - 1], item, pageHeight, options)) {
            groups[groups.length - 1].push(item);
        }
        else {
            groups.push([item]);
        }
    }
    return groups.map(group => createTextBlock(group, pageHeight, options));
}
93
/**
 * Turn a group of text items into a single text block.
 *
 * Text: item strings are concatenated. Between two neighbours on the same line
 * (vertical distance below 5) a space is inserted when the gap exceeds 2 and
 * neither side already supplies one; between neighbours on different lines a
 * newline is inserted unless the text already ends with one.
 *
 * Geometry: the bounding box spans from the smallest item origin to the largest
 * `origin + width/height`. Its position is converted with `pdfToPolotnoX` /
 * `pdfToPolotnoY`.
 *
 * Style: font size (default 12), rotation, mapped font and colour (default
 * black) are all taken from the first item.
 */
function createTextBlock(items, pageHeight, options) {
    const positions = items.map(item => extractPosition(item.transform));
    const lastIndex = items.length - 1;
    let text = '';
    items.forEach((item, i) => {
        text += item.str;
        if (i >= lastIndex) {
            return;
        }
        const next = items[i + 1];
        const here = positions[i];
        const there = positions[i + 1];
        const sameLine = Math.abs((pageHeight - here.y) - (pageHeight - there.y)) < 5;
        if (sameLine) {
            const gap = there.x - (here.x + item.width);
            if (gap > 2 && !(next.str.startsWith(' ') || item.str.endsWith(' '))) {
                text += ' ';
            }
        }
        else if (!text.endsWith('\n')) {
            text += '\n';
        }
    });
    // Bounding box in PDF coordinates.
    let minX = Infinity;
    let minY = Infinity;
    let maxX = -Infinity;
    let maxY = -Infinity;
    items.forEach((item, i) => {
        const { x, y } = positions[i];
        minX = Math.min(minX, x);
        maxX = Math.max(maxX, x + item.width);
        minY = Math.min(minY, y);
        maxY = Math.max(maxY, y + item.height);
    });
    const width = maxX - minX;
    const height = maxY - minY;
    // The first item stands in for the whole block's styling.
    const [head] = items;
    const fontSize = head.fontSize || 12;
    const rotation = extractRotation(head.transform);
    const font = mapFont(head.fontName, options.fontMapping);
    const color = pdfColorToHex(head.color || [0, 0, 0]);
    const x = pdfToPolotnoX(minX);
    const y = pdfToPolotnoY(minY, height, pageHeight);
    return {
        text,
        x,
        y,
        width,
        height,
        fontName: font.family,
        fontSize,
        fontWeight: font.weight,
        fontStyle: font.style,
        color,
        rotation,
        align: 'left', // Placeholder alignment
    };
}
159
/**
 * Guess a block's alignment from where it sits on the page.
 *
 * Using a tolerance of 10% of the page width: a block whose midpoint is within
 * the tolerance of the page midpoint is `'center'`; failing that, a block whose
 * right edge is within the tolerance of the page's right edge is `'right'`;
 * anything else is `'left'`.
 */
export function detectAlignment(block, pageWidth) {
    const tolerance = pageWidth * 0.1;
    const midpointOffset = Math.abs(block.x + block.width / 2 - pageWidth / 2);
    if (midpointOffset < tolerance) {
        return 'center';
    }
    const rightEdgeOffset = Math.abs(block.x + block.width - pageWidth);
    return rightEdgeOffset < tolerance ? 'right' : 'left';
}
178
/**
 * Produce copies of the given blocks with `align` filled in.
 *
 * Each result is a shallow copy of its source block whose `align` is whatever
 * `detectAlignment` reports for that block; the input blocks are left untouched.
 */
export function applyAlignmentDetection(blocks, pageWidth) {
    const withAlignment = (block) => {
        const align = detectAlignment(block, pageWidth);
        return { ...block, align };
    };
    return blocks.map(block => withAlignment(block));
}
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Options for PDF to JSON conversion.
+ *
+ * Fields read by the clustering code in this package:
+ * - `fontMapping`: passed to `mapFont` when a text block's font is resolved.
+ * - `minTextBlockSize`: text items with a smaller `fontSize` are dropped
+ *   before clustering (default 8).
+ * - `textClusterThreshold.horizontal`: largest same-line gap between two
+ *   items that may still be merged (default 10).
+ * - `textClusterThreshold.vertical`: declared, but the merge check currently
+ *   uses the first item's font size as the vertical limit instead.
+ *
+ * NOTE(review): `pageNumbers`, `imageMode`, `imageUploadFn`, `outputUnit` and
+ * `dpi` are consumed outside the code visible here; presumably `pageNumbers`
+ * is 1-based like the page numbers accepted by `parsePDF` — confirm.
3
+ */
4
+ export interface PDFImportOptions {
5
+ pageNumbers?: number[];
6
+ imageMode?: 'dataURL' | 'upload';
7
+ imageUploadFn?: (buffer: Buffer, mimeType: string) => Promise<string>;
8
+ fontMapping?: Record<string, string>;
9
+ minTextBlockSize?: number;
10
+ textClusterThreshold?: {
11
+ vertical?: number;
12
+ horizontal?: number;
13
+ };
14
+ outputUnit?: 'px' | 'cm' | 'in';
15
+ dpi?: number;
16
+ }
17
+ /**
18
+ * Raw text item extracted from PDF with position and font metadata.
+ *
+ * As produced by `extractTextItems`:
+ * - `str`, `transform`, `width`, `height` are copied from the pdf.js text item.
+ * - `fontName` is the `fontFamily` of the item's entry in the text content's
+ *   `styles` table, not the raw pdf.js font id.
+ * - `fontSize` is derived from `transform` via `calculateFontSize`.
+ * - `hasEOL` defaults to `false`, `dir` to `'ltr'`, `color` to `[0, 0, 0]`.
+ *
+ * The clustering code reads the position from `transform` via
+ * `extractPosition` and treats a missing `fontSize` as 0 when filtering.
19
+ */
20
+ export interface PDFTextItem {
21
+ str: string;
22
+ transform: number[];
23
+ width: number;
24
+ height: number;
25
+ fontName: string;
26
+ fontSize?: number;
27
+ hasEOL?: boolean;
28
+ dir?: string;
29
+ color?: number[];
30
+ }
31
+ /**
32
+ * Clustered text block representing a semantic unit.
+ *
+ * As produced by `createTextBlock`:
+ * - `text`: the items' strings joined, with spaces inserted across same-line
+ *   gaps and newlines between lines.
+ * - `x`, `y`: bounding-box position after `pdfToPolotnoX` / `pdfToPolotnoY`.
+ * - `width`, `height`: extent of the items' bounding box.
+ * - `fontName`, `fontWeight`, `fontStyle`: result of `mapFont` for the first
+ *   item's font.
+ * - `fontSize`: the first item's font size (12 when missing).
+ * - `color`: result of `pdfColorToHex` for the first item's colour.
+ * - `rotation`: result of `extractRotation` for the first item's transform.
+ * - `align`: `'left'` on creation; `applyAlignmentDetection` replaces it with
+ *   `'left'`, `'center'` or `'right'`.
33
+ */
34
+ export interface TextBlock {
35
+ text: string;
36
+ x: number;
37
+ y: number;
38
+ width: number;
39
+ height: number;
40
+ fontName: string;
41
+ fontSize: number;
42
+ fontWeight: string;
43
+ fontStyle: string;
44
+ color: string;
45
+ rotation: number;
46
+ align?: string;
47
+ }
48
+ /**
49
+ * Raw image object extracted from PDF.
+ *
+ * As produced by `extractImages`:
+ * - `buffer`: the image re-encoded from a canvas.
+ * - `mimeType`: MIME type matching the encoding of `buffer`.
+ * - `width`, `height`: dimensions of the decoded image as reported by pdf.js.
+ * - `transform`: six-element matrix chosen by `findTransformForImage` from the
+ *   operators preceding the image (identity when none is found).
50
+ */
51
+ export interface PDFImageObject {
52
+ buffer: Buffer;
53
+ mimeType: string;
54
+ width: number;
55
+ height: number;
56
+ transform: number[];
57
+ }
58
+ /**
59
+ * Polotno image element ready for JSON.
+ *
+ * NOTE(review): nothing in the code visible here constructs this type.
+ * Presumably `src` is a data URL or an uploaded URL depending on
+ * `PDFImportOptions.imageMode` — confirm against the converter.
60
+ */
61
+ export interface ImageBlock {
62
+ src: string;
63
+ x: number;
64
+ y: number;
65
+ width: number;
66
+ height: number;
67
+ rotation: number;
68
+ }
69
+ /**
70
+ * Parsed font information from PDF font name.
+ *
+ * NOTE(review): `createTextBlock` reads `family`, `weight` and `style` from
+ * the value returned by `mapFont`, which presumably has this shape — confirm
+ * in `font-mapper`.
71
+ */
72
+ export interface ParsedFont {
73
+ family: string;
74
+ weight: string;
75
+ style: string;
76
+ }
77
+ /**
78
+ * Coordinate transformation context.
+ *
+ * NOTE(review): not referenced by the code visible here; presumably consumed
+ * by the helpers in `coordinate-transform` — confirm.
79
+ */
80
+ export interface CoordinateContext {
81
+ pageHeight: number;
82
+ pageWidth: number;
83
+ rotation?: number;
84
+ }
85
+ /**
86
+ * PDF page metadata.
+ *
+ * As produced by `extractPageMetadata`:
+ * - `pageNumber`: the page number the caller asked for, stored unchanged.
+ * - `width`, `height`: size of the page's viewport at scale 1.0.
+ * - `rotate`: the viewport's `rotation`, or 0 when that value is falsy.
87
+ */
88
+ export interface PDFPageMetadata {
89
+ pageNumber: number;
90
+ width: number;
91
+ height: number;
92
+ rotate?: number;
93
+ }
94
+ /**
95
+ * Result of PDF parsing for one page.
+ *
+ * Returned by `parsePage`: `metadata` from `extractPageMetadata`, `textItems`
+ * from `extractTextItems` and `images` from `extractImages`, all for the same
+ * page.
96
+ */
97
+ export interface ParsedPage {
98
+ metadata: PDFPageMetadata;
99
+ textItems: PDFTextItem[];
100
+ images: PDFImageObject[];
101
+ }
@@ -0,0 +1 @@
1
+ export {};