@polotno/pdf-export 0.1.38 → 0.1.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/README.md +61 -8
  2. package/lib/index.d.ts +66 -8
  3. package/lib/index.js +25 -145
  4. package/package.json +17 -18
  5. package/lib/compare-render.d.ts +0 -1
  6. package/lib/compare-render.js +0 -185
  7. package/lib/figure.d.ts +0 -10
  8. package/lib/figure.js +0 -54
  9. package/lib/filters.d.ts +0 -2
  10. package/lib/filters.js +0 -163
  11. package/lib/ghostscript.d.ts +0 -21
  12. package/lib/ghostscript.js +0 -132
  13. package/lib/group.d.ts +0 -5
  14. package/lib/group.js +0 -5
  15. package/lib/image.d.ts +0 -38
  16. package/lib/image.js +0 -279
  17. package/lib/line.d.ts +0 -10
  18. package/lib/line.js +0 -66
  19. package/lib/pdf-import/coordinate-transform.d.ts +0 -51
  20. package/lib/pdf-import/coordinate-transform.js +0 -99
  21. package/lib/pdf-import/element-builder.d.ts +0 -21
  22. package/lib/pdf-import/element-builder.js +0 -163
  23. package/lib/pdf-import/font-mapper.d.ts +0 -17
  24. package/lib/pdf-import/font-mapper.js +0 -142
  25. package/lib/pdf-import/index.d.ts +0 -35
  26. package/lib/pdf-import/index.js +0 -105
  27. package/lib/pdf-import/parser.d.ts +0 -29
  28. package/lib/pdf-import/parser.js +0 -285
  29. package/lib/pdf-import/text-analysis.d.ts +0 -17
  30. package/lib/pdf-import/text-analysis.js +0 -186
  31. package/lib/pdf-import/types.d.ts +0 -101
  32. package/lib/pdf-import/types.js +0 -1
  33. package/lib/scripts/compare-json.d.ts +0 -1
  34. package/lib/scripts/compare-json.js +0 -141
  35. package/lib/spot-colors.d.ts +0 -38
  36. package/lib/spot-colors.js +0 -141
  37. package/lib/svg-render.d.ts +0 -9
  38. package/lib/svg-render.js +0 -63
  39. package/lib/svg.d.ts +0 -12
  40. package/lib/svg.js +0 -224
  41. package/lib/text/fonts.d.ts +0 -16
  42. package/lib/text/fonts.js +0 -113
  43. package/lib/text/index.d.ts +0 -8
  44. package/lib/text/index.js +0 -42
  45. package/lib/text/layout.d.ts +0 -22
  46. package/lib/text/layout.js +0 -522
  47. package/lib/text/parser.d.ts +0 -46
  48. package/lib/text/parser.js +0 -415
  49. package/lib/text/render.d.ts +0 -8
  50. package/lib/text/render.js +0 -237
  51. package/lib/text/types.d.ts +0 -91
  52. package/lib/text/types.js +0 -1
  53. package/lib/text.d.ts +0 -39
  54. package/lib/text.js +0 -576
  55. package/lib/utils.d.ts +0 -16
  56. package/lib/utils.js +0 -124
@@ -1,17 +0,0 @@
1
- import type { ParsedFont } from './types.js';
2
- /**
3
- * Parse a PDF font name into a font family, a numeric weight string and a style ("italic" or "normal")
4
- * Examples:
5
- * "Arial-BoldItalic" → { family: "Arial", weight: "700", style: "italic" }
6
- * "Helvetica" → { family: "Arial", weight: "400", style: "normal" }
7
- * "TimesNewRomanPS-BoldMT" → { family: "Times New Roman", weight: "700", style: "normal" }
8
- */
9
- export declare function parseFontName(pdfFontName: string): ParsedFont;
10
- /**
11
- * Return parsedFont with its family replaced by the matching customMapping entry, or parsedFont unchanged when nothing matches
12
- */
13
- export declare function applyCustomMapping(parsedFont: ParsedFont, customMapping?: Record<string, string>): ParsedFont;
14
- /**
15
- * Main entry point: parse pdfFontName and then apply the optional customMapping overrides to the result
16
- */
17
- export declare function mapFont(pdfFontName: string, customMapping?: Record<string, string>): ParsedFont;
@@ -1,142 +0,0 @@
/**
 * Map of common PDF font names to Google Fonts or web-safe equivalents.
 *
 * The table is a frozen, prototype-less object so that a lookup with an
 * arbitrary font name (for example "constructor" or "toString") can only
 * ever return one of the strings listed here, never an inherited member
 * of Object.prototype.
 */
const FONT_MAPPING = Object.freeze(Object.assign(Object.create(null), {
    // Serif fonts
    'Times': 'Times New Roman',
    'Times-Roman': 'Times New Roman',
    'Times-Bold': 'Times New Roman',
    'Times-Italic': 'Times New Roman',
    'Times-BoldItalic': 'Times New Roman',
    'TimesNewRoman': 'Times New Roman',
    'TimesNewRomanPS': 'Times New Roman',
    'Georgia': 'Georgia',
    'Garamond': 'Garamond',
    // Sans-serif fonts
    'Helvetica': 'Arial',
    'Helvetica-Bold': 'Arial',
    'Helvetica-Oblique': 'Arial',
    'Helvetica-BoldOblique': 'Arial',
    'Arial': 'Arial',
    'ArialMT': 'Arial',
    'Arial-BoldMT': 'Arial',
    'Verdana': 'Verdana',
    'Tahoma': 'Tahoma',
    'Trebuchet': 'Trebuchet MS',
    'Calibri': 'Calibri',
    'Roboto': 'Roboto',
    // Monospace fonts
    'Courier': 'Courier New',
    'Courier-Bold': 'Courier New',
    'Courier-Oblique': 'Courier New',
    'Courier-BoldOblique': 'Courier New',
    'CourierNew': 'Courier New',
    'Consolas': 'Consolas',
    'Monaco': 'Monaco',
    // Other common fonts
    'Symbol': 'Symbol',
    'ZapfDingbats': 'Zapf Dingbats',
    'ComicSansMS': 'Comic Sans MS',
    'Impact': 'Impact',
}));
/**
 * Font weight keywords and their numeric equivalents.
 *
 * Order matters: the table is scanned from top to bottom with a substring
 * test and the scan stops at the first hit, so every compound keyword
 * ("ExtraLight", "SemiBold", "ExtraBold", ...) must be listed before the
 * shorter keyword it contains ("Light", "Bold").
 */
const WEIGHT_MAPPING = {
    'Thin': '100',
    'ExtraLight': '200',
    'UltraLight': '200',
    'Light': '300',
    'Normal': '400',
    'Regular': '400',
    'Medium': '500',
    'SemiBold': '600',
    'DemiBold': '600',
    'ExtraBold': '800',
    'UltraBold': '800',
    'Bold': '700',
    'Black': '900',
    'Heavy': '900',
};
/**
 * Parse PDF font name to extract family, weight, and style
 * Examples:
 *   "Arial-BoldItalic" → { family: "Arial", weight: "700", style: "italic" }
 *   "Helvetica" → { family: "Arial", weight: "400", style: "normal" }
 *   "TimesNewRomanPS-BoldMT" → { family: "Times New Roman", weight: "700", style: "normal" }
 *   "Bitter-ExtraBold" → { family: "Bitter", weight: "800", style: "normal" }
 *
 * @param pdfFontName - Font name as reported by the PDF
 * @returns Object with `family`, `weight` (numeric string, "400" when no
 *   weight keyword is present) and `style` ("italic" or "normal")
 */
export function parseFontName(pdfFontName) {
    // Own-property lookup so inherited object members are never mistaken for a mapping.
    const lookupFamily = (key) => Object.prototype.hasOwnProperty.call(FONT_MAPPING, key)
        ? FONT_MAPPING[key]
        : undefined;
    // Remove common suffixes and prefixes
    let cleanName = pdfFontName
        .replace(/^SUBSET\+/, '') // Remove subset prefix
        .replace(/PS$/, '') // Remove PostScript suffix
        .replace(/MT$/, '') // Remove MT suffix
        .replace(/,/g, ''); // Remove commas
    // "Italic"/"Oblique" count anywhere; the short "It" form only counts as a
    // suffix ("MyriadPro-It", "Foo-BoldIt") so that family names that merely
    // contain or end with the letters "it" ("Bitter", "Kanit") stay upright.
    const shortItalicSuffix = /(?:[-_]it|It)$/;
    const hasItalic = /Italic|Oblique/i.test(cleanName) || shortItalicSuffix.test(cleanName);
    const style = hasItalic ? 'italic' : 'normal';
    // Pick the longest weight keyword present, so "ExtraBold" wins over the
    // "Bold" it contains regardless of the order of the keyword table.
    let weight = '400'; // Default to normal weight
    const keyword = Object.keys(WEIGHT_MAPPING)
        .filter((candidate) => cleanName.includes(candidate))
        .sort((a, b) => b.length - a.length)[0];
    if (keyword !== undefined) {
        weight = WEIGHT_MAPPING[keyword];
        // Remove the weight keyword from name
        cleanName = cleanName.replace(keyword, '');
    }
    // Remove style indicators from name
    cleanName = cleanName
        .replace(/[-_]?(?:Bold|Italic|Oblique|Regular|Normal)/gi, '')
        .replace(shortItalicSuffix, '')
        .replace(/^[-_]+|[-_]+$/g, '') // Remove leading/trailing separators
        .trim();
    // Look up in mapping table: the untouched PDF name first, then the cleaned name
    const directMatch = lookupFamily(pdfFontName);
    let family = directMatch || lookupFamily(cleanName) || cleanName;
    // If the raw name had no mapping and the family has no spaces, try to split camelCase
    if (!directMatch && !family.includes(' ')) {
        family = splitCamelCase(family);
    }
    // Fallback to Roboto if font is empty or looks like a generic placeholder.
    // NOTE(review): the pattern below matches the six-letter "ABCDEF+" tag of
    // subsetted fonts, so every subsetted font resolves to Roboto (weight and
    // style are still taken from the name) — confirm this is intended.
    if (!family || family.length < 2 || /^[A-Z]{6}\+/.test(pdfFontName)) {
        family = 'Roboto';
    }
    return {
        family,
        weight,
        style,
    };
}
/**
 * Insert spaces at camelCase word boundaries of a font name.
 * Example: "TimesNewRoman" → "Times New Roman"
 *
 * @param text - Name to split
 * @returns The name with a space inserted at each boundary, trimmed
 */
function splitCamelCase(text) {
    const joinWithSpace = (_match, left, right) => `${left} ${right}`;
    // Boundary 1: a lowercase letter directly followed by an uppercase letter.
    const afterLowercase = text.replace(/([a-z])([A-Z])/g, joinWithSpace);
    // Boundary 2: an uppercase letter followed by the start of a capitalised
    // word, which separates an acronym from the word after it ("PDFFont").
    const afterAcronym = afterLowercase.replace(/([A-Z])([A-Z][a-z])/g, joinWithSpace);
    return afterAcronym.trim();
}
/**
 * Apply custom font mapping from user options
 *
 * @param parsedFont - Parsed font whose `family` is used as the lookup key
 * @param customMapping - Optional table of family name → replacement family
 * @returns A copy of `parsedFont` with the replacement family when the table
 *   has its own non-empty string entry for `parsedFont.family`; otherwise the
 *   same `parsedFont` object that was passed in
 */
export function applyCustomMapping(parsedFont, customMapping) {
    if (!customMapping) {
        return parsedFont;
    }
    // Only own entries count: a family called "constructor" or "toString" must
    // not pick up the members every object inherits.
    const hasEntry = Object.prototype.hasOwnProperty.call(customMapping, parsedFont.family);
    const mappedFamily = hasEntry ? customMapping[parsedFont.family] : undefined;
    if (typeof mappedFamily === 'string' && mappedFamily) {
        return {
            ...parsedFont,
            family: mappedFamily,
        };
    }
    return parsedFont;
}
/**
 * Main function to map PDF font to Polotno-compatible font
 *
 * The custom mapping is consulted with, in order: the PDF font name exactly
 * as given, its base name (subset tag removed, cut at the first "-" or ","),
 * and finally the family produced by parseFontName. Checking the PDF-side
 * names first lets a mapping such as { 'Helvetica': 'Roboto' } take effect
 * even though parseFontName translates "Helvetica" to another family before
 * the mapping would otherwise be looked at.
 *
 * @param pdfFontName - Font name as reported by the PDF
 * @param customMapping - Optional table of font name → replacement family
 * @returns Parsed font with the family overridden when a mapping matches
 */
export function mapFont(pdfFontName, customMapping) {
    const parsed = parseFontName(pdfFontName);
    if (customMapping) {
        const baseName = pdfFontName.replace(/^[A-Z]{6}\+/, '').split(/[-,]/)[0];
        for (const key of [pdfFontName, baseName]) {
            const target = Object.prototype.hasOwnProperty.call(customMapping, key)
                ? customMapping[key]
                : undefined;
            if (typeof target === 'string' && target) {
                return { ...parsed, family: target };
            }
        }
    }
    // Fall back to a mapping keyed by the parsed family name.
    return applyCustomMapping(parsed, customMapping);
}
@@ -1,35 +0,0 @@
1
- import type { PDFImportOptions } from './types.js';
2
- import type { PolotnoJSON } from '../index.js';
3
- /**
4
- * Convert a PDF into a Polotno JSON document with one Polotno page per parsed PDF page
5
- * @param source - Path of a PDF file (string) or a Buffer holding the PDF bytes
6
- * @param options - Conversion options; omitted settings fall back to built-in defaults
7
- * @returns Promise resolving to the Polotno JSON object
8
- *
9
- * @example
10
- * ```typescript
11
- * // Basic usage with embedded images
12
- * const json = await pdfToJSON('document.pdf');
13
- *
14
- * // With custom options
15
- * const json = await pdfToJSON('document.pdf', {
16
- * imageMode: 'dataURL',
17
- * minTextBlockSize: 10,
18
- * fontMapping: {
19
- * 'Helvetica': 'Roboto',
20
- * 'Times': 'Merriweather'
21
- * }
22
- * });
23
- *
24
- * // With image upload
25
- * const json = await pdfToJSON('document.pdf', {
26
- * imageMode: 'upload',
27
- * imageUploadFn: async (buffer, mimeType) => {
28
- * // Upload to your storage and return URL
29
- * return 'https://your-cdn.com/image.jpg';
30
- * }
31
- * });
32
- * ```
33
- */
34
- export declare function pdfToJSON(source: string | Buffer, options?: PDFImportOptions): Promise<PolotnoJSON>;
35
- export type { PDFImportOptions } from './types.js';
@@ -1,105 +0,0 @@
1
- import { parsePDF } from './parser.js';
2
- import { clusterTextItems, applyAlignmentDetection } from './text-analysis.js';
3
- import { buildTextElement, processImages } from './element-builder.js';
4
- import { convertUnits } from './coordinate-transform.js';
/**
 * Generate a random ID for Polotno pages
 *
 * @returns A string of 10 characters, each drawn with Math.random from the
 *   letters A-Z, a-z and the digits 0-9
 */
function randomId() {
    const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
    const pickCharacter = () => alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    return Array.from({ length: 10 }, pickCharacter).join('');
}
/**
 * Convert PDF to Polotno JSON
 *
 * Defaults: imageMode 'dataURL', minTextBlockSize 8, textClusterThreshold
 * { vertical: 20, horizontal: 20 }, outputUnit 'px', dpi 72. Options that are
 * missing or explicitly `undefined` keep their default, and a partial
 * `textClusterThreshold` only overrides the axes it names.
 *
 * @param source - PDF file path or buffer
 * @param options - Conversion options
 * @returns Polotno JSON object: document width/height taken from the first
 *   parsed page, an empty `fonts` list and one entry in `pages` per parsed page
 * @throws Error when imageMode is "upload" without imageUploadFn, or when no
 *   page could be parsed
 *
 * @example
 * ```typescript
 * // Basic usage with embedded images
 * const json = await pdfToJSON('document.pdf');
 *
 * // With image upload
 * const json = await pdfToJSON('document.pdf', {
 *   imageMode: 'upload',
 *   imageUploadFn: async (buffer, mimeType) => {
 *     // Upload to your storage and return URL
 *     return 'https://your-cdn.com/image.jpg';
 *   }
 * });
 * ```
 */
export async function pdfToJSON(source, options = {}) {
    // Drop undefined entries first so that `{ dpi: undefined }` cannot erase a default.
    const supplied = Object.fromEntries(Object.entries(options || {}).filter(([, value]) => value !== undefined));
    // Set default options
    const opts = {
        imageMode: 'dataURL',
        minTextBlockSize: 8,
        outputUnit: 'px',
        dpi: 72,
        ...supplied,
        // Merged per axis, after the spread, so a partial threshold keeps the other default.
        textClusterThreshold: {
            vertical: 20,
            horizontal: 20,
            ...supplied.textClusterThreshold,
        },
    };
    // Validate options
    if (opts.imageMode === 'upload' && !opts.imageUploadFn) {
        throw new Error('imageUploadFn is required when imageMode is "upload"');
    }
    // Parse PDF
    const pages = await parsePDF(source, opts.pageNumbers);
    if (pages.length === 0) {
        throw new Error('No pages found in PDF or invalid page numbers specified');
    }
    // Get document dimensions from first page
    const firstPage = pages[0];
    const documentWidth = convertUnits(firstPage.metadata.width, opts.outputUnit, opts.dpi);
    const documentHeight = convertUnits(firstPage.metadata.height, opts.outputUnit, opts.dpi);
    // Build Polotno JSON structure
    const polotnoJSON = {
        width: documentWidth,
        height: documentHeight,
        fonts: [], // No custom fonts in MVP
        pages: [],
    };
    // Process each page in order
    for (const page of pages) {
        const pageWidth = page.metadata.width;
        const pageHeight = page.metadata.height;
        // Cluster text items into text blocks
        let textBlocks = clusterTextItems(page.textItems, pageHeight, pageWidth, opts);
        // Apply alignment detection
        textBlocks = applyAlignmentDetection(textBlocks, pageWidth);
        // Build text elements
        const textElements = textBlocks.map(block => buildTextElement(block));
        // Process images
        const imageElements = await processImages(page.images, pageHeight, opts);
        // Text elements are listed before image elements in the page's children
        const children = [
            ...textElements,
            ...imageElements,
        ];
        // Add page to JSON
        polotnoJSON.pages.push({
            background: 'white',
            children,
            id: randomId(),
        });
    }
    return polotnoJSON;
}
@@ -1,29 +0,0 @@
1
- import type { PDFTextItem, PDFImageObject, ParsedPage, PDFPageMetadata } from './types.js';
2
- /**
3
- * Load a PDF document with pdf.js from a file path (string) or from a Buffer holding the PDF bytes
4
- */
5
- export declare function loadPDF(source: string | Buffer): Promise<any>;
6
- /**
7
- * Get the number of pages (numPages) of a loaded PDF document
8
- */
9
- export declare function getPageCount(pdfDoc: any): number;
10
- /**
11
- * Extract page number, width, height and rotation of a single PDF page from its scale-1.0 viewport
12
- */
13
- export declare function extractPageMetadata(page: any, pageNumber: number): Promise<PDFPageMetadata>;
14
- /**
15
- * Extract the non-empty text items of a PDF page with their transform, size, font, direction and color
16
- */
17
- export declare function extractTextItems(page: any): Promise<PDFTextItem[]>;
18
- /**
19
- * Extract the images painted on a PDF page as encoded buffers; images that fail are logged and left out
20
- */
21
- export declare function extractImages(page: any): Promise<PDFImageObject[]>;
22
- /**
23
- * Parse a single PDF page and extract its metadata, text items and images
24
- */
25
- export declare function parsePage(pdfDoc: any, pageNumber: number): Promise<ParsedPage>;
26
- /**
27
- * Parse the entire PDF document, or only the given page numbers (numbers outside 1..page count are ignored)
28
- */
29
- export declare function parsePDF(source: string | Buffer, pageNumbers?: number[]): Promise<ParsedPage[]>;
@@ -1,285 +0,0 @@
1
- import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
2
- import fs from 'fs';
3
- import { createCanvas } from 'canvas';
4
- import { calculateFontSize, extractPosition } from './coordinate-transform.js';
/**
 * Load PDF document from file path or buffer
 *
 * @param source - Path of a PDF file, or a buffer holding the PDF bytes
 * @returns Promise resolving to the document produced by pdf.js `getDocument`;
 *   it rejects when the file cannot be read or the PDF cannot be loaded
 */
export async function loadPDF(source) {
    // Read asynchronously so a large file does not block the event loop.
    const bytes = typeof source === 'string'
        ? await fs.promises.readFile(source)
        : source;
    // pdf.js is always handed a fresh Uint8Array copy, never the caller's buffer.
    const data = new Uint8Array(bytes);
    const loadingTask = pdfjsLib.getDocument({ data });
    return await loadingTask.promise;
}
/**
 * Get number of pages in PDF document
 *
 * @param pdfDoc - Loaded PDF document
 * @returns The document's `numPages` value
 */
export function getPageCount(pdfDoc) {
    const { numPages } = pdfDoc;
    return numPages;
}
/**
 * Extract metadata from a single PDF page
 *
 * @param page - PDF page exposing `getViewport`
 * @param pageNumber - Number stored as-is in the result
 * @returns `{ pageNumber, width, height, rotate }`, where width, height and
 *   rotation come from the page's viewport at scale 1.0 and a missing or
 *   zero rotation is reported as 0
 */
export async function extractPageMetadata(page, pageNumber) {
    const { width, height, rotation } = page.getViewport({ scale: 1.0 });
    const rotate = rotation || 0;
    return { pageNumber, width, height, rotate };
}
/**
 * Extract text items from a PDF page with position and font metadata
 *
 * Items without text, or whose text is only whitespace, are skipped.
 *
 * @param page - PDF page exposing `getTextContent`
 * @returns Array of items with `str`, `transform`, `width`, `height`,
 *   `fontName`, `fontSize`, `hasEOL`, `dir` and `color`
 */
export async function extractTextItems(page) {
    const textContent = await page.getTextContent({
        includeMarkedContent: true
    });
    const styles = textContent.styles || {};
    const items = [];
    for (const item of textContent.items) {
        // Skip empty text
        if (!item.str || item.str.trim().length === 0) {
            continue;
        }
        // Prefer the family recorded in the style table; when the table has no
        // (usable) entry for this item, fall back to the item's own font name
        // instead of throwing.
        const styleEntry = styles[item.fontName];
        const fontFamily = styleEntry && styleEntry.fontFamily;
        items.push({
            str: item.str,
            transform: item.transform,
            width: item.width,
            height: item.height,
            fontName: fontFamily || item.fontName,
            // Font size is derived from the transformation matrix
            fontSize: calculateFontSize(item.transform),
            hasEOL: item.hasEOL || false,
            dir: item.dir || 'ltr',
            // Use the item's color when it is an array, otherwise default to black
            color: Array.isArray(item.color) ? item.color : [0, 0, 0],
        });
    }
    return items;
}
/**
 * Multiply two transformation matrices
 *
 * Both matrices are six-element arrays [a, b, c, d, e, f]. The first four
 * entries of the result are the 2x2 product of the two linear parts; the last
 * two are m1's offset (e, f) pushed through m2's linear part plus m2's offset.
 *
 * @param m1 - Left matrix
 * @param m2 - Right matrix
 * @returns New six-element array
 */
function multiplyMatrices(m1, m2) {
    const left = { a: m1[0], b: m1[1], c: m1[2], d: m1[3], e: m1[4], f: m1[5] };
    const right = { a: m2[0], b: m2[1], c: m2[2], d: m2[3], e: m2[4], f: m2[5] };
    // Push the row vector (x, y) through the linear part of the right matrix.
    const throughRight = (x, y) => [
        x * right.a + y * right.c,
        x * right.b + y * right.d,
    ];
    const [a, b] = throughRight(left.a, left.b);
    const [c, d] = throughRight(left.c, left.d);
    const [offsetX, offsetY] = throughRight(left.e, left.f);
    return [a, b, c, d, offsetX + right.e, offsetY + right.f];
}
/**
 * Find the transform matrix for an image by scanning the operator list
 * backwards from the image for:
 * - the nearest pure translation matrix (position)
 * - the nearest other matrix (scale/flip/rotation, i.e. dimensions)
 *
 * When both are found, the linear part of the scale matrix is combined with
 * the offset of the translation matrix. With only one of them, that matrix is
 * returned as-is; with neither, the identity matrix is returned.
 *
 * @param ops - Operator list with parallel `fnArray` and `argsArray`
 * @param imageIndex - Index of the image operation inside `ops.fnArray`
 * @returns Six-element matrix [a, b, c, d, e, f]
 */
function findTransformForImage(ops, imageIndex) {
    // NOTE(review): these opcodes are treated as "restore" (stop scanning) and
    // "transform". In the pdf.js operator table restore/transform appear to be
    // 11/12, while 32/35 look like endText/setHScale — confirm against the
    // installed pdfjs-dist.
    const STOP_OPS = [32, 11];
    const TRANSFORM_OPS = [35, 12];
    const TOLERANCE = 0.001;
    const isNear = (value, target) => Math.abs(value - target) < TOLERANCE;
    let translation = null;
    let scale = null;
    for (let cursor = imageIndex - 1; cursor >= 0; cursor -= 1) {
        const opCode = ops.fnArray[cursor];
        // A stop opcode means we have left the image's graphics state.
        if (STOP_OPS.includes(opCode)) {
            break;
        }
        if (!TRANSFORM_OPS.includes(opCode)) {
            continue;
        }
        const matrix = ops.argsArray[cursor];
        if (!matrix || !Array.isArray(matrix) || matrix.length < 6) {
            continue;
        }
        const [a, b, c, d] = matrix;
        // Pure translation: a is 1, b and c are 0, d is 1 or -1 (a vertical flip still counts).
        const isTranslation = isNear(a, 1) && isNear(b, 0) && isNear(c, 0) && isNear(Math.abs(d), 1);
        // Only the match closest to the image is kept for each category.
        if (isTranslation) {
            translation = translation || matrix;
        }
        else {
            scale = scale || matrix;
        }
        if (translation && scale) {
            break;
        }
    }
    if (translation && scale) {
        // Linear part from the scale matrix, position from the translation matrix.
        const [a, b, c, d] = scale;
        return [a, b, c, d, translation[4], translation[5]];
    }
    // Single-matrix fallbacks, then identity.
    return scale || translation || [1, 0, 0, 1, 0, 0];
}
/**
 * Wait for a PDF object to be resolved with timeout and retry logic
 *
 * Polls `page.objs` every 50 ms until the object is present and truthy.
 * Errors thrown by a lookup are treated as "not ready yet". The pause sits
 * outside the try block so that every iteration yields to the event loop,
 * including iterations in which the lookup threw.
 *
 * @param page - PDF page exposing `objs.has` and `objs.get`
 * @param objectName - Name of the object to wait for
 * @param timeoutMs - Maximum time to keep polling, in milliseconds
 * @returns The resolved object
 * @throws Error when the object is still unavailable after `timeoutMs`
 */
async function waitForObject(page, objectName, timeoutMs = 5000) {
    const retryDelay = 50; // ms between retries
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
        try {
            // Check if object exists in the objects dictionary
            if (page.objs.has(objectName)) {
                const obj = page.objs.get(objectName);
                if (obj) {
                    return obj;
                }
            }
        }
        catch (error) {
            // Not available yet — fall through to the pause and retry
        }
        // Wait a bit before retrying
        await new Promise(resolve => setTimeout(resolve, retryDelay));
    }
    throw new Error(`Timeout waiting for ${objectName} after ${timeoutMs}ms`);
}
/**
 * Convert decoded image pixels to the RGBA layout a canvas ImageData expects.
 * The layout is inferred from the byte count: 4 bytes per pixel is used as-is,
 * 3 bytes per pixel is treated as RGB, and one bit per pixel (rows padded to
 * whole bytes) is treated as black/white with a set bit meaning white.
 * Returns null when the byte count matches none of these.
 */
function toRgbaPixels(image) {
    const { width, height, data } = image;
    const pixelCount = width * height;
    if (data.length === pixelCount * 4) {
        return data;
    }
    const rgba = new Uint8ClampedArray(pixelCount * 4);
    if (data.length === pixelCount * 3) {
        for (let src = 0, dst = 0; src < data.length; src += 3, dst += 4) {
            rgba[dst] = data[src];
            rgba[dst + 1] = data[src + 1];
            rgba[dst + 2] = data[src + 2];
            rgba[dst + 3] = 255;
        }
        return rgba;
    }
    const rowBytes = (width + 7) >> 3;
    if (data.length === rowBytes * height) {
        for (let y = 0; y < height; y++) {
            for (let x = 0; x < width; x++) {
                const bit = (data[y * rowBytes + (x >> 3)] >> (7 - (x & 7))) & 1;
                const dst = (y * width + x) * 4;
                const value = bit ? 255 : 0;
                rgba[dst] = value;
                rgba[dst + 1] = value;
                rgba[dst + 2] = value;
                rgba[dst + 3] = 255;
            }
        }
        return rgba;
    }
    return null;
}
/**
 * Extract images from a PDF page with enhanced async handling
 *
 * Every image paint operation (opcodes 85 and 86) is resolved, drawn onto a
 * canvas and encoded. A failure on one image is logged and that image is left
 * out; a failure of the whole page is logged and yields an empty array.
 *
 * @param page - PDF page exposing `getOperatorList` and `objs`
 * @returns Array of `{ buffer, mimeType, width, height, transform }`
 */
export async function extractImages(page) {
    try {
        const ops = await page.getOperatorList();
        // First pass: collect all image operands and their indices
        const imageOperations = [];
        for (let i = 0; i < ops.fnArray.length; i++) {
            const fn = ops.fnArray[i];
            // OPS.paintImageXObject = 85, OPS.paintInlineImageXObject = 86
            if (fn !== 85 && fn !== 86) {
                continue;
            }
            const args = ops.argsArray[i];
            const ref = args ? args[0] : undefined;
            if (ref) {
                imageOperations.push({ ref, index: i });
            }
        }
        // If no images found, return early
        if (imageOperations.length === 0) {
            return [];
        }
        // Second pass: extract all images in parallel
        const imagePromises = imageOperations.map(async ({ ref, index }) => {
            const label = typeof ref === 'string' ? ref : '(inline image)';
            try {
                // A string operand is an object name that has to be waited for.
                // NOTE(review): a non-string operand is assumed to be the decoded
                // image itself (as for inline images) — confirm against pdf.js.
                const image = typeof ref === 'string'
                    ? await waitForObject(page, ref, 5000)
                    : ref;
                if (!image || !image.data) {
                    return null;
                }
                // The decoded pixels are not always RGBA; normalise before copying
                // them into the canvas, otherwise the picture comes out garbled.
                const pixels = toRgbaPixels(image);
                if (!pixels) {
                    throw new Error(`unsupported pixel layout (${image.data.length} bytes for ${image.width}x${image.height})`);
                }
                const canvas = createCanvas(image.width, image.height);
                const ctx = canvas.getContext('2d');
                const imageData = ctx.createImageData(image.width, image.height);
                imageData.data.set(pixels);
                ctx.putImageData(imageData, 0, 0);
                // Encode to buffer: JPEG (90% quality) when image.kind is 1, PNG otherwise.
                // NOTE(review): `kind` appears to describe the pixel layout rather
                // than the original file format — confirm the JPEG branch is wanted.
                let buffer;
                let mimeType;
                if (image.kind === 1) {
                    buffer = canvas.toBuffer('image/jpeg', { quality: 0.9 });
                    mimeType = 'image/jpeg';
                }
                else {
                    // Encode as PNG (lossless)
                    buffer = canvas.toBuffer('image/png');
                    mimeType = 'image/png';
                }
                // Get transformation matrix
                const transform = findTransformForImage(ops, index);
                return {
                    buffer,
                    mimeType,
                    width: image.width,
                    height: image.height,
                    transform,
                };
            }
            catch (error) {
                console.warn(`Failed to extract image ${label}:`, error instanceof Error ? error.message : error);
                return null;
            }
        });
        // Wait for all images to be processed
        const results = await Promise.all(imagePromises);
        // Filter out null results (failed extractions)
        return results.filter((img) => img !== null);
    }
    catch (error) {
        console.warn('Failed to extract images from page:', error);
        return [];
    }
}
/**
 * Parse a single PDF page and extract all content
 *
 * Metadata, text items and images are extracted concurrently.
 *
 * @param pdfDoc - Loaded PDF document exposing `getPage`
 * @param pageNumber - Page number passed to `pdfDoc.getPage`
 * @returns `{ metadata, textItems, images }` for the page
 */
export async function parsePage(pdfDoc, pageNumber) {
    const page = await pdfDoc.getPage(pageNumber);
    const pending = [
        extractPageMetadata(page, pageNumber),
        extractTextItems(page),
        extractImages(page),
    ];
    const [metadata, textItems, images] = await Promise.all(pending);
    return { metadata, textItems, images };
}
/**
 * Parse entire PDF document or specific pages
 *
 * @param source - PDF file path or buffer
 * @param pageNumbers - Optional list of page numbers to parse. Entries that
 *   are not integers in the range 1..page count are ignored. When the list is
 *   missing or empty, every page is parsed.
 * @returns Parsed pages, in the order of `pageNumbers` (or in document order)
 */
export async function parsePDF(source, pageNumbers) {
    const pdfDoc = await loadPDF(source);
    try {
        const totalPages = getPageCount(pdfDoc);
        // Determine which pages to parse. Fractional or NaN entries are dropped
        // here; passing them on would make the whole parse reject.
        const pagesToParse = pageNumbers && pageNumbers.length > 0
            ? pageNumbers.filter(n => Number.isInteger(n) && n >= 1 && n <= totalPages)
            : Array.from({ length: totalPages }, (_, i) => i + 1);
        // Parse all pages in parallel
        return await Promise.all(pagesToParse.map(pageNum => parsePage(pdfDoc, pageNum)));
    }
    finally {
        // The document is created here and never handed out, so release it here.
        // A failure while releasing must not replace the parse result or error.
        if (typeof pdfDoc.destroy === 'function') {
            try {
                await pdfDoc.destroy();
            }
            catch (error) {
                console.warn('Failed to release PDF document:', error instanceof Error ? error.message : error);
            }
        }
    }
}
@@ -1,17 +0,0 @@
1
- import type { PDFTextItem, TextBlock, PDFImportOptions } from './types.js';
2
- /**
3
- * Sort text items by position (top-to-bottom, left-to-right); takes the page height and returns a PDFTextItem array
4
- */
5
- export declare function sortTextItems(items: PDFTextItem[], pageHeight: number): PDFTextItem[];
6
- /**
7
- * Cluster a page's text items into text blocks, given the page height, the page width and the import options
8
- */
9
- export declare function clusterTextItems(items: PDFTextItem[], pageHeight: number, pageWidth: number, options: PDFImportOptions): TextBlock[];
10
- /**
11
- * Detect text alignment ('left', 'center' or 'right') of a block based on its position within a page of the given width
12
- */
13
- export declare function detectAlignment(block: TextBlock, pageWidth: number): 'left' | 'center' | 'right';
14
- /**
15
- * Apply alignment detection to all blocks of a page of the given width and return the resulting blocks
16
- */
17
- export declare function applyAlignmentDetection(blocks: TextBlock[], pageWidth: number): TextBlock[];