@polotno/pdf-export 0.1.38 → 0.1.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -8
- package/lib/index.d.ts +66 -8
- package/lib/index.js +25 -145
- package/package.json +17 -18
- package/lib/compare-render.d.ts +0 -1
- package/lib/compare-render.js +0 -185
- package/lib/figure.d.ts +0 -10
- package/lib/figure.js +0 -54
- package/lib/filters.d.ts +0 -2
- package/lib/filters.js +0 -163
- package/lib/ghostscript.d.ts +0 -21
- package/lib/ghostscript.js +0 -132
- package/lib/group.d.ts +0 -5
- package/lib/group.js +0 -5
- package/lib/image.d.ts +0 -38
- package/lib/image.js +0 -279
- package/lib/line.d.ts +0 -10
- package/lib/line.js +0 -66
- package/lib/pdf-import/coordinate-transform.d.ts +0 -51
- package/lib/pdf-import/coordinate-transform.js +0 -99
- package/lib/pdf-import/element-builder.d.ts +0 -21
- package/lib/pdf-import/element-builder.js +0 -163
- package/lib/pdf-import/font-mapper.d.ts +0 -17
- package/lib/pdf-import/font-mapper.js +0 -142
- package/lib/pdf-import/index.d.ts +0 -35
- package/lib/pdf-import/index.js +0 -105
- package/lib/pdf-import/parser.d.ts +0 -29
- package/lib/pdf-import/parser.js +0 -285
- package/lib/pdf-import/text-analysis.d.ts +0 -17
- package/lib/pdf-import/text-analysis.js +0 -186
- package/lib/pdf-import/types.d.ts +0 -101
- package/lib/pdf-import/types.js +0 -1
- package/lib/scripts/compare-json.d.ts +0 -1
- package/lib/scripts/compare-json.js +0 -141
- package/lib/spot-colors.d.ts +0 -38
- package/lib/spot-colors.js +0 -141
- package/lib/svg-render.d.ts +0 -9
- package/lib/svg-render.js +0 -63
- package/lib/svg.d.ts +0 -12
- package/lib/svg.js +0 -224
- package/lib/text/fonts.d.ts +0 -16
- package/lib/text/fonts.js +0 -113
- package/lib/text/index.d.ts +0 -8
- package/lib/text/index.js +0 -42
- package/lib/text/layout.d.ts +0 -22
- package/lib/text/layout.js +0 -522
- package/lib/text/parser.d.ts +0 -46
- package/lib/text/parser.js +0 -415
- package/lib/text/render.d.ts +0 -8
- package/lib/text/render.js +0 -237
- package/lib/text/types.d.ts +0 -91
- package/lib/text/types.js +0 -1
- package/lib/text.d.ts +0 -39
- package/lib/text.js +0 -576
- package/lib/utils.d.ts +0 -16
- package/lib/utils.js +0 -124
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import type { ParsedFont } from './types.js';
|
|
2
|
-
/**
 * Derive a font family, numeric weight and style from a raw PDF font name.
 *
 * @param pdfFontName - Font name as it appears in the PDF.
 * @returns The parsed font, for example:
 *   - "Arial-BoldItalic" → { family: "Arial", weight: "700", style: "italic" }
 *   - "Helvetica" → { family: "Arial", weight: "400", style: "normal" }
 *   - "TimesNewRomanPS-BoldMT" → { family: "Times New Roman", weight: "700", style: "normal" }
 */
export declare function parseFontName(pdfFontName: string): ParsedFont;
|
|
10
|
-
/**
 * Override the family of an already-parsed font with a caller-supplied mapping.
 *
 * @param parsedFont - Font description to adjust.
 * @param customMapping - Optional lookup from family name to replacement family.
 * @returns A font whose `family` is replaced when the mapping has an entry for
 *   it; otherwise the font that was passed in.
 */
export declare function applyCustomMapping(parsedFont: ParsedFont, customMapping?: Record<string, string>): ParsedFont;
|
|
14
|
-
/**
 * Entry point for font mapping: parse a PDF font name and then apply the
 * optional user mapping to the result.
 *
 * @param pdfFontName - Font name as it appears in the PDF.
 * @param customMapping - Optional family overrides supplied by the caller.
 * @returns The Polotno-compatible font description.
 */
export declare function mapFont(pdfFontName: string, customMapping?: Record<string, string>): ParsedFont;
|
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
/**
 * Map of common PDF font names to Google Fonts or web-safe equivalents.
 * Keys are matched exactly: first against the raw PDF name, then against the
 * name with weight/style markers stripped.
 */
const FONT_MAPPING = {
    // Serif fonts
    'Times': 'Times New Roman',
    'Times-Roman': 'Times New Roman',
    'Times-Bold': 'Times New Roman',
    'Times-Italic': 'Times New Roman',
    'Times-BoldItalic': 'Times New Roman',
    'TimesNewRoman': 'Times New Roman',
    'TimesNewRomanPS': 'Times New Roman',
    'Georgia': 'Georgia',
    'Garamond': 'Garamond',
    // Sans-serif fonts
    'Helvetica': 'Arial',
    'Helvetica-Bold': 'Arial',
    'Helvetica-Oblique': 'Arial',
    'Helvetica-BoldOblique': 'Arial',
    'Arial': 'Arial',
    'ArialMT': 'Arial',
    'Arial-BoldMT': 'Arial',
    'Verdana': 'Verdana',
    'Tahoma': 'Tahoma',
    'Trebuchet': 'Trebuchet MS',
    'Calibri': 'Calibri',
    'Roboto': 'Roboto',
    // Monospace fonts
    'Courier': 'Courier New',
    'Courier-Bold': 'Courier New',
    'Courier-Oblique': 'Courier New',
    'Courier-BoldOblique': 'Courier New',
    'CourierNew': 'Courier New',
    'Consolas': 'Consolas',
    'Monaco': 'Monaco',
    // Other common fonts
    'Symbol': 'Symbol',
    'ZapfDingbats': 'Zapf Dingbats',
    'ComicSansMS': 'Comic Sans MS',
    'Impact': 'Impact',
};
/**
 * Font weight keywords and their numeric equivalents
 */
const WEIGHT_MAPPING = {
    'Thin': '100',
    'ExtraLight': '200',
    'UltraLight': '200',
    'Light': '300',
    'Normal': '400',
    'Regular': '400',
    'Medium': '500',
    'SemiBold': '600',
    'DemiBold': '600',
    'Bold': '700',
    'ExtraBold': '800',
    'UltraBold': '800',
    'Black': '900',
    'Heavy': '900',
};
/**
 * Weight keywords ordered longest-first, so a compound keyword ("ExtraBold",
 * "SemiBold", "UltraLight") is tested before the shorter keyword it contains
 * ("Bold", "Light"). The sort is stable, so equal-length keywords keep the
 * order in which WEIGHT_MAPPING declares them.
 */
const WEIGHT_KEYWORDS = Object.entries(WEIGHT_MAPPING).sort(([a], [b]) => b.length - a.length);
/**
 * Read a family from FONT_MAPPING using own properties only, so names such as
 * "constructor" or "__proto__" cannot resolve to Object.prototype members.
 */
function lookupMappedFamily(name) {
    return Object.prototype.hasOwnProperty.call(FONT_MAPPING, name)
        ? FONT_MAPPING[name]
        : undefined;
}
/**
 * Parse PDF font name to extract family, weight, and style
 * Examples:
 * "Arial-BoldItalic" → { family: "Arial", weight: "700", style: "italic" }
 * "Helvetica" → { family: "Arial", weight: "400", style: "normal" }
 * "TimesNewRomanPS-BoldMT" → { family: "Times New Roman", weight: "700", style: "normal" }
 * "Arial-ExtraBold" → { family: "Arial", weight: "800", style: "normal" }
 *
 * @param pdfFontName Font name as reported for the PDF text; a non-string
 *   value is treated as an empty name and yields the Roboto fallback.
 * @returns Object with `family`, `weight` (numeric string) and `style`
 *   ("italic" or "normal").
 */
export function parseFontName(pdfFontName) {
    const rawName = typeof pdfFontName === 'string' ? pdfFontName : '';
    // A subset tag is six uppercase letters followed by "+"; the literal
    // "SUBSET+" that used to be stripped is one instance of that pattern.
    const subsetTag = /^[A-Z]{6}\+/;
    let cleanName = rawName
        .replace(subsetTag, '') // Remove subset prefix
        .replace(/PS$/, '') // Remove PostScript suffix
        .replace(/MT$/, '') // Remove MT suffix
        .replace(/,/g, ''); // Remove commas
    // "Italic"/"Oblique" are matched in any case. The "It" abbreviation is
    // matched case-sensitively and only as a suffix, so families that merely
    // end in or contain "it" (Outfit, Bitter) are not treated as italic.
    const hasItalic = /Italic|Oblique/i.test(cleanName) || /It$/.test(cleanName);
    const style = hasItalic ? 'italic' : 'normal';
    // First matching weight keyword wins; default is the normal weight.
    let weight = '400';
    for (const [keyword, numeric] of WEIGHT_KEYWORDS) {
        if (cleanName.includes(keyword)) {
            weight = numeric;
            // Remove the weight keyword from name
            cleanName = cleanName.replace(keyword, '');
            break;
        }
    }
    // Remove style indicators from name
    cleanName = cleanName
        .replace(/[-_]?(Bold|Italic|Oblique|Regular|Normal)/gi, '')
        .replace(/[-_]?It$/, '') // abbreviated italic suffix, e.g. "-It"
        .replace(/^[-_]+|[-_]+$/g, '') // Remove leading/trailing separators
        .trim();
    // Look up in mapping table: exact PDF name first, then the cleaned name.
    const exactMatch = lookupMappedFamily(rawName);
    let family = exactMatch || lookupMappedFamily(cleanName) || cleanName;
    // If the raw name had no mapping and the family has no spaces, try to split camelCase
    if (!exactMatch && !family.includes(' ')) {
        family = splitCamelCase(family);
    }
    // Fallback to Roboto if the family is empty, a single character, or the
    // raw name carried a subset tag (kept as in the previous behaviour).
    if (!family || family.length < 2 || subsetTag.test(rawName)) {
        family = 'Roboto';
    }
    return {
        family,
        weight,
        style,
    };
}
/**
 * Split camelCase font names into spaced names
 * Example: "TimesNewRoman" → "Times New Roman"
 */
function splitCamelCase(text) {
    return text
        .replace(/([a-z])([A-Z])/g, '$1 $2') // lower→Upper boundary
        .replace(/([A-Z])([A-Z][a-z])/g, '$1 $2') // end of an acronym run
        .trim();
}
|
|
119
|
-
/**
 * Apply custom font mapping from user options.
 *
 * @param parsedFont Parsed font whose `family` is used as the lookup key.
 * @param customMapping Optional object mapping a family name to its
 *   replacement family.
 * @returns A copy of `parsedFont` with `family` replaced when the mapping has
 *   its own non-empty string entry for that family; otherwise `parsedFont`
 *   itself (same object).
 */
export function applyCustomMapping(parsedFont, customMapping) {
    if (!customMapping) {
        return parsedFont;
    }
    const family = parsedFont.family;
    // Own-property check: a family such as "constructor" or "toString" must
    // not pick up members inherited from Object.prototype.
    const mappedFamily = Object.prototype.hasOwnProperty.call(customMapping, family)
        ? customMapping[family]
        : undefined;
    if (typeof mappedFamily !== 'string' || mappedFamily === '') {
        return parsedFont;
    }
    return {
        ...parsedFont,
        family: mappedFamily,
    };
}
|
|
136
|
-
/**
 * Main function to map PDF font to Polotno-compatible font.
 *
 * The custom mapping is first applied to the parsed family, exactly as
 * before. When that finds nothing, the mapping is also consulted with the raw
 * PDF name and with the part of it before the first hyphen. Without this, an
 * entry keyed by a PDF name such as 'Helvetica' could never match, because
 * parseFontName has already translated that name to another family ('Arial').
 *
 * @param pdfFontName Font name as reported for the PDF text.
 * @param customMapping Optional user mapping from font name to family.
 * @returns Parsed font with any applicable custom family applied.
 */
export function mapFont(pdfFontName, customMapping) {
    const parsed = parseFontName(pdfFontName);
    const viaFamily = applyCustomMapping(parsed, customMapping);
    // applyCustomMapping hands back the same object when nothing matched.
    if (viaFamily !== parsed || !customMapping || typeof pdfFontName !== 'string') {
        return viaFamily;
    }
    const candidates = [pdfFontName, pdfFontName.split('-')[0]];
    for (const key of candidates) {
        if (!Object.prototype.hasOwnProperty.call(customMapping, key)) {
            continue;
        }
        const replacement = customMapping[key];
        if (typeof replacement === 'string' && replacement !== '') {
            return { ...parsed, family: replacement };
        }
    }
    return parsed;
}
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import type { PDFImportOptions } from './types.js';
|
|
2
|
-
import type { PolotnoJSON } from '../index.js';
|
|
3
|
-
/**
 * Convert a PDF into a Polotno JSON document.
 *
 * @param source - Path of the PDF file, or a buffer holding its bytes.
 * @param options - Optional conversion settings.
 * @returns Promise resolving to the Polotno JSON object.
 *
 * @example
 * ```typescript
 * // Basic usage with embedded images
 * const json = await pdfToJSON('document.pdf');
 *
 * // With custom options
 * const json = await pdfToJSON('document.pdf', {
 *   imageMode: 'dataURL',
 *   minTextBlockSize: 10,
 *   fontMapping: {
 *     'Helvetica': 'Roboto',
 *     'Times': 'Merriweather'
 *   }
 * });
 *
 * // With image upload
 * const json = await pdfToJSON('document.pdf', {
 *   imageMode: 'upload',
 *   imageUploadFn: async (buffer, mimeType) => {
 *     // Upload to your storage and return URL
 *     return 'https://your-cdn.com/image.jpg';
 *   }
 * });
 * ```
 */
export declare function pdfToJSON(source: string | Buffer, options?: PDFImportOptions): Promise<PolotnoJSON>;
|
|
35
|
-
export type { PDFImportOptions } from './types.js';
|
package/lib/pdf-import/index.js
DELETED
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
import { parsePDF } from './parser.js';
|
|
2
|
-
import { clusterTextItems, applyAlignmentDetection } from './text-analysis.js';
|
|
3
|
-
import { buildTextElement, processImages } from './element-builder.js';
|
|
4
|
-
import { convertUnits } from './coordinate-transform.js';
|
|
5
|
-
/**
 * Build a 10-character identifier for a Polotno page, drawing each character
 * at random from the ASCII letters and digits.
 */
function randomId() {
    const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
    const picks = Array.from({ length: 10 }, () => alphabet.charAt(Math.floor(Math.random() * alphabet.length)));
    return picks.join('');
}
|
|
16
|
-
/**
 * Convert PDF to Polotno JSON
 * @param source - PDF file path or buffer
 * @param options - Conversion options. Defaults: imageMode 'dataURL',
 *   minTextBlockSize 8, textClusterThreshold { vertical: 20, horizontal: 20 },
 *   outputUnit 'px', dpi 72. A partial textClusterThreshold is merged with
 *   the defaults key by key.
 * @returns Polotno JSON object
 * @throws Error when imageMode is "upload" without an imageUploadFn function,
 *   or when no pages could be parsed.
 *
 * @example
 * ```typescript
 * // Basic usage with embedded images
 * const json = await pdfToJSON('document.pdf');
 *
 * // With custom options
 * const json = await pdfToJSON('document.pdf', {
 *   imageMode: 'dataURL',
 *   minTextBlockSize: 10,
 *   fontMapping: {
 *     'Helvetica': 'Roboto',
 *     'Times': 'Merriweather'
 *   }
 * });
 *
 * // With image upload
 * const json = await pdfToJSON('document.pdf', {
 *   imageMode: 'upload',
 *   imageUploadFn: async (buffer, mimeType) => {
 *     // Upload to your storage and return URL
 *     return 'https://your-cdn.com/image.jpg';
 *   }
 * });
 * ```
 */
export async function pdfToJSON(source, options = {}) {
    // The default parameter only covers `undefined`; tolerate an explicit null.
    const userOptions = options || {};
    // Set default options
    const opts = {
        imageMode: 'dataURL',
        minTextBlockSize: 8,
        outputUnit: 'px',
        dpi: 72,
        ...userOptions,
        // Merge the nested thresholds separately: a plain spread of the user
        // options would replace the whole object, leaving the other axis
        // undefined when only one of the two is supplied.
        textClusterThreshold: {
            vertical: 20,
            horizontal: 20,
            ...userOptions.textClusterThreshold,
        },
    };
    // Validate options
    if (opts.imageMode === 'upload' && typeof opts.imageUploadFn !== 'function') {
        throw new Error('imageUploadFn is required when imageMode is "upload"');
    }
    // Parse PDF
    const pages = await parsePDF(source, opts.pageNumbers);
    if (pages.length === 0) {
        throw new Error('No pages found in PDF or invalid page numbers specified');
    }
    // Document dimensions come from the first parsed page only.
    const firstPage = pages[0];
    const documentWidth = convertUnits(firstPage.metadata.width, opts.outputUnit, opts.dpi);
    const documentHeight = convertUnits(firstPage.metadata.height, opts.outputUnit, opts.dpi);
    // Build Polotno JSON structure
    const polotnoJSON = {
        width: documentWidth,
        height: documentHeight,
        fonts: [], // No custom fonts in MVP
        pages: [],
    };
    // Process each page in order
    for (const page of pages) {
        const pageWidth = page.metadata.width;
        const pageHeight = page.metadata.height;
        // Cluster text items into text blocks, then detect their alignment
        const clustered = clusterTextItems(page.textItems, pageHeight, pageWidth, opts);
        const textBlocks = applyAlignmentDetection(clustered, pageWidth);
        // Build text elements
        const textElements = textBlocks.map(block => buildTextElement(block));
        // Process images
        const imageElements = await processImages(page.images, pageHeight, opts);
        // Text elements first, images after them
        polotnoJSON.pages.push({
            background: 'white',
            children: [...textElements, ...imageElements],
            id: randomId(),
        });
    }
    return polotnoJSON;
}
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import type { PDFTextItem, PDFImageObject, ParsedPage, PDFPageMetadata } from './types.js';
|
|
2
|
-
/**
 * Open a PDF document.
 *
 * @param source - Path of the PDF file, or a buffer holding its bytes.
 * @returns Promise resolving to the loaded document object.
 */
export declare function loadPDF(source: string | Buffer): Promise<any>;
|
|
6
|
-
/**
 * Report how many pages a loaded PDF document contains.
 *
 * @param pdfDoc - Document object obtained from {@link loadPDF}.
 * @returns The page count.
 */
export declare function getPageCount(pdfDoc: any): number;
|
|
10
|
-
/**
 * Collect the metadata of one PDF page.
 *
 * @param page - Page object of the loaded document.
 * @param pageNumber - Number recorded in the returned metadata.
 * @returns Promise resolving to the page metadata.
 */
export declare function extractPageMetadata(page: any, pageNumber: number): Promise<PDFPageMetadata>;
|
|
14
|
-
/**
 * Read the text items of a PDF page together with their position and font
 * metadata.
 *
 * @param page - Page object of the loaded document.
 * @returns Promise resolving to the extracted text items.
 */
export declare function extractTextItems(page: any): Promise<PDFTextItem[]>;
|
|
18
|
-
/**
 * Extract the images painted on a PDF page.
 *
 * @param page - Page object of the loaded document.
 * @returns Promise resolving to the extracted image objects.
 */
export declare function extractImages(page: any): Promise<PDFImageObject[]>;
|
|
22
|
-
/**
 * Parse one page of a loaded PDF document and gather all of its content.
 *
 * @param pdfDoc - Document object obtained from {@link loadPDF}.
 * @param pageNumber - Number of the page to parse.
 * @returns Promise resolving to the parsed page.
 */
export declare function parsePage(pdfDoc: any, pageNumber: number): Promise<ParsedPage>;
|
|
26
|
-
/**
 * Parse a whole PDF document, or only selected pages of it.
 *
 * @param source - Path of the PDF file, or a buffer holding its bytes.
 * @param pageNumbers - Optional list of page numbers to parse; all pages are
 *   parsed when it is omitted.
 * @returns Promise resolving to the parsed pages.
 */
export declare function parsePDF(source: string | Buffer, pageNumbers?: number[]): Promise<ParsedPage[]>;
|
package/lib/pdf-import/parser.js
DELETED
|
@@ -1,285 +0,0 @@
|
|
|
1
|
-
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
2
|
-
import fs from 'fs';
|
|
3
|
-
import { createCanvas } from 'canvas';
|
|
4
|
-
import { calculateFontSize, extractPosition } from './coordinate-transform.js';
|
|
5
|
-
/**
 * Load PDF document from file path or buffer.
 *
 * @param source File path (string) or a buffer with the PDF bytes.
 * @returns Promise resolving to the document produced by pdf.js
 *   `getDocument`; it rejects when the file cannot be read or loaded.
 */
export async function loadPDF(source) {
    // Read from disk asynchronously so a large file does not block the event
    // loop inside this async function; a buffer is used as given.
    const bytes = typeof source === 'string'
        ? await fs.promises.readFile(source)
        : source;
    // Hand pdf.js its own Uint8Array copy of the bytes.
    const data = new Uint8Array(bytes);
    const loadingTask = pdfjsLib.getDocument({ data });
    return await loadingTask.promise;
}
|
|
27
|
-
/**
 * Return the `numPages` value of a loaded PDF document.
 */
export function getPageCount(pdfDoc) {
    const { numPages } = pdfDoc;
    return numPages;
}
|
|
33
|
-
/**
 * Describe a page by its number plus the width, height and rotation of its
 * viewport at scale 1.0. A missing or zero rotation is reported as 0.
 */
export async function extractPageMetadata(page, pageNumber) {
    const { width, height, rotation } = page.getViewport({ scale: 1.0 });
    const rotate = rotation || 0;
    return { pageNumber, width, height, rotate };
}
|
|
45
|
-
/**
 * Extract text items from a PDF page with position and font metadata.
 *
 * Items without text, or with whitespace-only text, are skipped. The font
 * size comes from the item's transformation matrix via calculateFontSize.
 *
 * @param page Page object exposing `getTextContent`.
 * @returns Promise resolving to an array of items with `str`, `transform`,
 *   `width`, `height`, `fontName`, `fontSize`, `hasEOL`, `dir` and `color`.
 */
export async function extractTextItems(page) {
    const textContent = await page.getTextContent({
        includeMarkedContent: true
    });
    const styles = textContent.styles || {};
    const items = [];
    for (const item of textContent.items) {
        // Skip empty text
        if (!item.str || item.str.trim().length === 0) {
            continue;
        }
        // Extract font size from transformation matrix
        const fontSize = calculateFontSize(item.transform);
        // Resolve the font family through the style table. If the table has no
        // usable entry, fall back to the item's own font identifier rather
        // than throwing on the missing entry.
        const styleEntry = styles[item.fontName];
        const fontName = styleEntry && styleEntry.fontFamily
            ? styleEntry.fontFamily
            : item.fontName;
        // Use the item's color when it is an array, otherwise default to black
        const color = item.color && Array.isArray(item.color) ? item.color : [0, 0, 0];
        items.push({
            str: item.str,
            transform: item.transform,
            width: item.width,
            height: item.height,
            fontName,
            fontSize,
            hasEOL: item.hasEOL || false,
            dir: item.dir || 'ltr',
            color,
        });
    }
    return items;
}
|
|
81
|
-
/**
 * Compose two six-element affine matrices [a, b, c, d, e, f].
 *
 * Each (x, y) pair of the first matrix is pushed through the linear part of
 * the second; the translation pair additionally receives the second matrix's
 * translation.
 */
function multiplyMatrices(m1, m2) {
    const throughLinear = (x, y) => [x * m2[0] + y * m2[2], x * m2[1] + y * m2[3]];
    const [a, b] = throughLinear(m1[0], m1[1]);
    const [c, d] = throughLinear(m1[2], m1[3]);
    const [tx, ty] = throughLinear(m1[4], m1[5]);
    return [a, b, c, d, tx + m2[4], ty + m2[5]];
}
|
|
96
|
-
/**
 * Work out the transform that applies to the image operation at `imageIndex`.
 *
 * The operator list is scanned backwards from the image. The nearest "pure
 * translation" matrix (a ≈ 1, b ≈ 0, c ≈ 0, |d| ≈ 1) supplies the position and
 * the nearest other matrix supplies scale/rotation. When both are found the
 * result takes a..d from the latter and e, f from the former; otherwise
 * whichever was found is returned as-is, and identity if neither was.
 *
 * NOTE(review): codes 11 and 12 are presumably pdf.js OPS.restore and
 * OPS.transform; what 32 and 35 stand for could not be confirmed here.
 */
function findTransformForImage(ops, imageIndex) {
    const EPSILON = 0.001;
    const STOP_CODES = [32, 11];
    const TRANSFORM_CODES = [35, 12];
    const isPureTranslation = ([a, b, c, d]) => Math.abs(a - 1) < EPSILON &&
        Math.abs(b) < EPSILON &&
        Math.abs(c) < EPSILON &&
        Math.abs(Math.abs(d) - 1) < EPSILON;
    let position = null;
    let shape = null;
    let cursor = imageIndex;
    while (--cursor >= 0) {
        const code = ops.fnArray[cursor];
        // Leaving the image's graphics state: stop searching.
        if (STOP_CODES.includes(code)) {
            break;
        }
        if (!TRANSFORM_CODES.includes(code)) {
            continue;
        }
        const matrix = ops.argsArray[cursor];
        if (!(matrix && Array.isArray(matrix) && matrix.length >= 6)) {
            continue;
        }
        // Only the matrix nearest to the image is kept for each role.
        if (isPureTranslation(matrix)) {
            if (!position) {
                position = matrix;
            }
        }
        else if (!shape) {
            shape = matrix;
        }
        if (position && shape) {
            break;
        }
    }
    if (position && shape) {
        return [shape[0], shape[1], shape[2], shape[3], position[4], position[5]];
    }
    // Single match of either role, else the identity matrix.
    return shape || position || [1, 0, 0, 1, 0, 0];
}
|
|
155
|
-
/**
 * Wait for a PDF object to be resolved with timeout and retry logic.
 *
 * Polls `page.objs` every 50 ms until the named object is present and truthy.
 * A lookup that throws is treated as "not ready yet".
 *
 * @param page Page object exposing `objs.has` and `objs.get`.
 * @param objectName Name of the object to wait for.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @returns Promise resolving to the object.
 * @throws Error if the object is not available within `timeoutMs`.
 */
async function waitForObject(page, objectName, timeoutMs = 5000) {
    const retryDelay = 50; // ms between retries
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
        try {
            // Check if object exists in the objects dictionary
            if (page.objs.has(objectName)) {
                const obj = page.objs.get(objectName);
                if (obj) {
                    return obj;
                }
            }
        }
        catch (error) {
            // Lookup failed; fall through to the delay and try again.
        }
        // The delay sits outside the try block so that a throwing lookup
        // still yields to the event loop instead of spinning until timeout.
        await new Promise(resolve => setTimeout(resolve, retryDelay));
    }
    throw new Error(`Timeout waiting for ${objectName} after ${timeoutMs}ms`);
}
|
|
179
|
-
/**
 * Expand the pixel data of a decoded image into 4-byte-per-pixel RGBA.
 *
 * NOTE(review): the layouts follow pdf.js `ImageKind` (GRAYSCALE_1BPP = 1,
 * RGB_24BPP = 2, RGBA_32BPP = 3) — confirm against the installed pdfjs-dist.
 * Any other kind is copied unchanged, as the code did before.
 */
function toRgbaPixels(image) {
    const { width, height, data, kind } = image;
    const rgba = new Uint8ClampedArray(width * height * 4);
    if (kind === 2) {
        // Three bytes per pixel: add an opaque alpha channel.
        for (let src = 0, dst = 0; dst < rgba.length; src += 3, dst += 4) {
            rgba[dst] = data[src];
            rgba[dst + 1] = data[src + 1];
            rgba[dst + 2] = data[src + 2];
            rgba[dst + 3] = 255;
        }
        return rgba;
    }
    if (kind === 1) {
        // One bit per pixel, most significant bit first, rows padded to a
        // whole byte; a set bit is white.
        const rowBytes = (width + 7) >> 3;
        for (let y = 0; y < height; y++) {
            for (let x = 0; x < width; x++) {
                const packed = data[y * rowBytes + (x >> 3)];
                const value = (packed >> (7 - (x & 7))) & 1 ? 255 : 0;
                const dst = (y * width + x) * 4;
                rgba[dst] = value;
                rgba[dst + 1] = value;
                rgba[dst + 2] = value;
                rgba[dst + 3] = 255;
            }
        }
        return rgba;
    }
    rgba.set(data);
    return rgba;
}
/**
 * Extract images from a PDF page with enhanced async handling.
 *
 * Collects every image paint operation (codes 85 and 86) from the operator
 * list, resolves each image, re-encodes its pixels as PNG through a canvas
 * and pairs the result with the transform found for that operation.
 *
 * @param page Page object exposing `getOperatorList` and `objs`.
 * @returns Promise resolving to an array of `{ buffer, mimeType, width,
 *   height, transform }`. Images that fail are logged with console.warn and
 *   left out; if the operator list cannot be read the result is `[]`.
 */
export async function extractImages(page) {
    try {
        const ops = await page.getOperatorList();
        // First pass: collect all image operands and their indices
        const imageOperations = [];
        for (let i = 0; i < ops.fnArray.length; i++) {
            const fn = ops.fnArray[i];
            const args = ops.argsArray[i];
            // OPS.paintImageXObject = 85, OPS.paintInlineImageXObject = 86
            if (fn === 85 || fn === 86) {
                const imageName = args[0];
                if (imageName) {
                    imageOperations.push({ name: imageName, index: i, opCode: fn });
                }
            }
        }
        // If no images found, return early
        if (imageOperations.length === 0) {
            return [];
        }
        // Second pass: extract all images in parallel
        const imagePromises = imageOperations.map(async ({ name, index }) => {
            const label = typeof name === 'string' ? name : '(inline image)';
            try {
                // A string operand names an object that must be awaited.
                // NOTE(review): a non-string operand is presumably the inline
                // image data itself; looking it up by name could only time out.
                const image = typeof name === 'string'
                    ? await waitForObject(page, name, 5000)
                    : name;
                if (!image || !image.data) {
                    return null;
                }
                const canvas = createCanvas(image.width, image.height);
                const ctx = canvas.getContext('2d');
                // The canvas needs RGBA; the source may use a narrower layout.
                const imageData = ctx.createImageData(image.width, image.height);
                imageData.data.set(toRgbaPixels(image));
                ctx.putImageData(imageData, 0, 0);
                // Always encode losslessly: `image.kind` describes the pixel
                // layout, not the original file format, so it cannot be used
                // to detect a JPEG source.
                const mimeType = 'image/png';
                const buffer = canvas.toBuffer('image/png');
                // Get transformation matrix
                const transform = findTransformForImage(ops, index);
                return {
                    buffer,
                    mimeType,
                    width: image.width,
                    height: image.height,
                    transform,
                };
            }
            catch (error) {
                console.warn(`Failed to extract image ${label}:`, error instanceof Error ? error.message : error);
                return null;
            }
        });
        // Wait for all images to be processed
        const results = await Promise.all(imagePromises);
        // Filter out null results (failed extractions)
        return results.filter((img) => img !== null);
    }
    catch (error) {
        console.warn('Failed to extract images from page:', error);
        return [];
    }
}
|
|
256
|
-
/**
 * Fetch one page of the document and gather its metadata, text items and
 * images. The three extractions are started together and awaited as a group.
 */
export async function parsePage(pdfDoc, pageNumber) {
    const page = await pdfDoc.getPage(pageNumber);
    const pending = [
        extractPageMetadata(page, pageNumber),
        extractTextItems(page),
        extractImages(page),
    ];
    const [metadata, textItems, images] = await Promise.all(pending);
    return { metadata, textItems, images };
}
|
|
272
|
-
/**
 * Parse entire PDF document or specific pages.
 *
 * @param source File path or buffer of the PDF.
 * @param pageNumbers Optional 1-based page numbers. Entries that are not
 *   integers within 1..page count are ignored. When omitted or empty, every
 *   page is parsed.
 * @returns Promise resolving to the parsed pages, in the order requested.
 */
export async function parsePDF(source, pageNumbers) {
    const pdfDoc = await loadPDF(source);
    try {
        const totalPages = getPageCount(pdfDoc);
        // Determine which pages to parse
        const pagesToParse = pageNumbers && pageNumbers.length > 0
            ? pageNumbers.filter(n => Number.isInteger(n) && n >= 1 && n <= totalPages)
            : Array.from({ length: totalPages }, (_, i) => i + 1);
        // Parse all pages in parallel
        return await Promise.all(pagesToParse.map(pageNum => parsePage(pdfDoc, pageNum)));
    }
    finally {
        // Release the document once parsing has finished or failed.
        // NOTE(review): assumes the parsed results no longer need the open
        // document — confirm against callers.
        if (pdfDoc && typeof pdfDoc.destroy === 'function') {
            try {
                await pdfDoc.destroy();
            }
            catch (cleanupError) {
                // Best-effort cleanup: never mask the parse result or error.
            }
        }
    }
}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import type { PDFTextItem, TextBlock, PDFImportOptions } from './types.js';
|
|
2
|
-
/**
 * Order text items by position: top-to-bottom, then left-to-right.
 *
 * @param items - Text items to sort.
 * @param pageHeight - Height of the page the items belong to.
 * @returns The text items in reading order.
 */
export declare function sortTextItems(items: PDFTextItem[], pageHeight: number): PDFTextItem[];
|
|
6
|
-
/**
 * Group the text items of a page into text blocks.
 *
 * @param items - Text items extracted from the page.
 * @param pageHeight - Height of the page.
 * @param pageWidth - Width of the page.
 * @param options - Import options in effect for the conversion.
 * @returns The resulting text blocks.
 */
export declare function clusterTextItems(items: PDFTextItem[], pageHeight: number, pageWidth: number, options: PDFImportOptions): TextBlock[];
|
|
10
|
-
/**
 * Decide how a text block is aligned, judging by its position within the page.
 *
 * @param block - Text block to inspect.
 * @param pageWidth - Width of the page containing the block.
 * @returns 'left', 'center' or 'right'.
 */
export declare function detectAlignment(block: TextBlock, pageWidth: number): 'left' | 'center' | 'right';
|
|
14
|
-
/**
 * Run alignment detection over every text block of a page.
 *
 * @param blocks - Text blocks to process.
 * @param pageWidth - Width of the page containing the blocks.
 * @returns The text blocks after alignment detection.
 */
export declare function applyAlignmentDetection(blocks: TextBlock[], pageWidth: number): TextBlock[];
|