@cj-tech-master/excelts 8.1.2 → 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/README_zh.md +2 -2
- package/dist/browser/modules/excel/cell.js +11 -7
- package/dist/browser/modules/excel/column.js +7 -6
- package/dist/browser/modules/excel/row.js +5 -1
- package/dist/browser/modules/excel/stream/worksheet-reader.js +3 -2
- package/dist/browser/modules/excel/utils/cell-format.js +64 -2
- package/dist/browser/modules/pdf/excel-bridge.d.ts +4 -3
- package/dist/browser/modules/pdf/excel-bridge.js +18 -5
- package/dist/browser/modules/pdf/index.d.ts +3 -3
- package/dist/browser/modules/pdf/index.js +3 -3
- package/dist/browser/modules/pdf/pdf.d.ts +7 -6
- package/dist/browser/modules/pdf/pdf.js +7 -6
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +8 -7
- package/dist/browser/modules/pdf/reader/pdf-reader.js +81 -74
- package/dist/browser/modules/pdf/render/constants.d.ts +30 -0
- package/dist/browser/modules/pdf/render/constants.js +30 -0
- package/dist/browser/modules/pdf/render/layout-engine.d.ts +2 -1
- package/dist/browser/modules/pdf/render/layout-engine.js +359 -156
- package/dist/browser/modules/pdf/render/page-renderer.d.ts +2 -2
- package/dist/browser/modules/pdf/render/page-renderer.js +245 -107
- package/dist/browser/modules/pdf/render/pdf-exporter.d.ts +3 -2
- package/dist/browser/modules/pdf/render/pdf-exporter.js +145 -105
- package/dist/browser/modules/pdf/render/style-converter.js +27 -26
- package/dist/browser/modules/pdf/types.d.ts +8 -0
- package/dist/browser/utils/utils.base.d.ts +5 -0
- package/dist/browser/utils/utils.base.js +10 -0
- package/dist/cjs/modules/excel/cell.js +11 -7
- package/dist/cjs/modules/excel/column.js +7 -6
- package/dist/cjs/modules/excel/row.js +5 -1
- package/dist/cjs/modules/excel/stream/worksheet-reader.js +3 -2
- package/dist/cjs/modules/excel/utils/cell-format.js +64 -2
- package/dist/cjs/modules/pdf/excel-bridge.js +18 -5
- package/dist/cjs/modules/pdf/index.js +3 -3
- package/dist/cjs/modules/pdf/pdf.js +7 -6
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +81 -74
- package/dist/cjs/modules/pdf/render/constants.js +33 -0
- package/dist/cjs/modules/pdf/render/layout-engine.js +359 -156
- package/dist/cjs/modules/pdf/render/page-renderer.js +245 -107
- package/dist/cjs/modules/pdf/render/pdf-exporter.js +145 -105
- package/dist/cjs/modules/pdf/render/style-converter.js +27 -26
- package/dist/cjs/utils/utils.base.js +11 -0
- package/dist/esm/modules/excel/cell.js +11 -7
- package/dist/esm/modules/excel/column.js +7 -6
- package/dist/esm/modules/excel/row.js +5 -1
- package/dist/esm/modules/excel/stream/worksheet-reader.js +3 -2
- package/dist/esm/modules/excel/utils/cell-format.js +64 -2
- package/dist/esm/modules/pdf/excel-bridge.js +18 -5
- package/dist/esm/modules/pdf/index.js +3 -3
- package/dist/esm/modules/pdf/pdf.js +7 -6
- package/dist/esm/modules/pdf/reader/pdf-reader.js +81 -74
- package/dist/esm/modules/pdf/render/constants.js +30 -0
- package/dist/esm/modules/pdf/render/layout-engine.js +359 -156
- package/dist/esm/modules/pdf/render/page-renderer.js +245 -107
- package/dist/esm/modules/pdf/render/pdf-exporter.js +145 -105
- package/dist/esm/modules/pdf/render/style-converter.js +27 -26
- package/dist/esm/utils/utils.base.js +10 -0
- package/dist/iife/excelts.iife.js +1022 -677
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +48 -48
- package/dist/types/modules/pdf/excel-bridge.d.ts +4 -3
- package/dist/types/modules/pdf/index.d.ts +3 -3
- package/dist/types/modules/pdf/pdf.d.ts +7 -6
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +8 -7
- package/dist/types/modules/pdf/render/constants.d.ts +30 -0
- package/dist/types/modules/pdf/render/layout-engine.d.ts +2 -1
- package/dist/types/modules/pdf/render/page-renderer.d.ts +2 -2
- package/dist/types/modules/pdf/render/pdf-exporter.d.ts +3 -2
- package/dist/types/modules/pdf/types.d.ts +8 -0
- package/dist/types/utils/utils.base.d.ts +5 -0
- package/package.json +1 -1
|
@@ -16,18 +16,18 @@
|
|
|
16
16
|
* - Cross-reference tables and streams (PDF 1.5+)
|
|
17
17
|
* - Incremental updates and xref recovery
|
|
18
18
|
*
|
|
19
|
-
* @example
|
|
19
|
+
* @example Text extraction:
|
|
20
20
|
* ```typescript
|
|
21
21
|
* import { readPdf } from "excelts/pdf";
|
|
22
22
|
*
|
|
23
|
-
* const pdf = readPdf(pdfBytes);
|
|
23
|
+
* const pdf = await readPdf(pdfBytes);
|
|
24
24
|
* console.log(pdf.text); // All text from all pages
|
|
25
25
|
* console.log(pdf.pages[0].text); // Text from page 1
|
|
26
26
|
* ```
|
|
27
27
|
*
|
|
28
28
|
* @example Image extraction:
|
|
29
29
|
* ```typescript
|
|
30
|
-
* const pdf = readPdf(pdfBytes);
|
|
30
|
+
* const pdf = await readPdf(pdfBytes);
|
|
31
31
|
* for (const image of pdf.pages[0].images) {
|
|
32
32
|
* console.log(image.format, image.width, image.height);
|
|
33
33
|
* fs.writeFileSync(`image.${image.format}`, image.data);
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
*
|
|
37
37
|
* @example Metadata:
|
|
38
38
|
* ```typescript
|
|
39
|
-
* const pdf = readPdf(pdfBytes);
|
|
39
|
+
* const pdf = await readPdf(pdfBytes);
|
|
40
40
|
* console.log(pdf.metadata.title);
|
|
41
41
|
* console.log(pdf.metadata.author);
|
|
42
42
|
* console.log(pdf.metadata.pageCount);
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
*
|
|
45
45
|
* @example Encrypted PDF:
|
|
46
46
|
* ```typescript
|
|
47
|
-
* const pdf = readPdf(pdfBytes, { password: "secret" });
|
|
47
|
+
* const pdf = await readPdf(pdfBytes, { password: "secret" });
|
|
48
48
|
* ```
|
|
49
49
|
*/
|
|
50
50
|
import { PdfDocument } from "./pdf-document.js";
|
|
@@ -56,19 +56,36 @@ import { extractAnnotationsFromPage } from "./annotation-extractor.js";
|
|
|
56
56
|
import { extractFormFields } from "./form-extractor.js";
|
|
57
57
|
import { extractMetadata } from "./metadata-reader.js";
|
|
58
58
|
import { PdfStructureError } from "../errors.js";
|
|
59
|
+
import { yieldToEventLoop } from "../../../utils/utils.base.js";
|
|
59
60
|
// =============================================================================
|
|
60
61
|
// Public API
|
|
61
62
|
// =============================================================================
|
|
62
63
|
/**
|
|
63
64
|
* Read a PDF file and extract text, images, and metadata.
|
|
65
|
+
* Yields to the event loop between pages to avoid blocking.
|
|
64
66
|
*
|
|
65
67
|
* @param data - Raw PDF file bytes
|
|
66
68
|
* @param options - Extraction options
|
|
67
|
-
* @returns
|
|
69
|
+
* @returns Promise of extracted content
|
|
68
70
|
* @throws {PdfStructureError} If the PDF structure is invalid
|
|
69
71
|
* @throws {PdfError} If decryption fails (wrong password)
|
|
70
72
|
*/
|
|
71
|
-
export function readPdf(data, options) {
|
|
73
|
+
export async function readPdf(data, options) {
|
|
74
|
+
const { doc, opts, metadata, pagesInfo, pageIndicesToProcess } = prepareRead(data, options);
|
|
75
|
+
const pages = [];
|
|
76
|
+
for (let i = 0; i < pageIndicesToProcess.length; i++) {
|
|
77
|
+
const pageIdx = pageIndicesToProcess[i];
|
|
78
|
+
pages.push(processPage(pagesInfo[pageIdx].dict, pageIdx, doc, opts));
|
|
79
|
+
if (i < pageIndicesToProcess.length - 1) {
|
|
80
|
+
await yieldToEventLoop();
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
return finalizeRead(pages, pagesInfo.length, metadata, opts, doc);
|
|
84
|
+
}
|
|
85
|
+
/**
|
|
86
|
+
* Shared setup: parse document, handle encryption, extract metadata, resolve pages.
|
|
87
|
+
*/
|
|
88
|
+
function prepareRead(data, options) {
|
|
72
89
|
const opts = {
|
|
73
90
|
password: options?.password ?? "",
|
|
74
91
|
pages: options?.pages,
|
|
@@ -78,86 +95,81 @@ export function readPdf(data, options) {
|
|
|
78
95
|
extractAnnotations: options?.extractAnnotations ?? true,
|
|
79
96
|
extractFormFields: options?.extractFormFields ?? true
|
|
80
97
|
};
|
|
81
|
-
// Parse document structure
|
|
82
98
|
const doc = new PdfDocument(data);
|
|
83
|
-
// Handle encryption
|
|
84
99
|
if (isEncrypted(doc)) {
|
|
85
100
|
const success = initDecryption(doc, opts.password);
|
|
86
101
|
if (!success) {
|
|
87
102
|
throw new PdfStructureError("Failed to decrypt PDF: incorrect password");
|
|
88
103
|
}
|
|
89
104
|
}
|
|
90
|
-
// Extract metadata
|
|
91
105
|
const metadata = opts.extractMetadata ? extractMetadata(doc) : createEmptyMetadata();
|
|
92
|
-
// Get pages (with object identity for correct decryption)
|
|
93
106
|
const pagesInfo = doc.getPagesWithObjInfo();
|
|
94
107
|
const pageIndicesToProcess = opts.pages
|
|
95
108
|
? opts.pages.map(p => p - 1).filter(p => p >= 0 && p < pagesInfo.length)
|
|
96
109
|
: Array.from({ length: pagesInfo.length }, (_, i) => i);
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
catch (err) {
|
|
114
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
115
|
-
warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
|
|
116
|
-
}
|
|
110
|
+
return { doc, opts, metadata, pagesInfo, pageIndicesToProcess };
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Process a single page: extract text, images, annotations, and dimensions.
|
|
114
|
+
*/
|
|
115
|
+
function processPage(pageDict, pageIdx, doc, opts) {
|
|
116
|
+
const pageNumber = pageIdx + 1;
|
|
117
|
+
const warnings = [];
|
|
118
|
+
let text = "";
|
|
119
|
+
let textLines = [];
|
|
120
|
+
let textFragments = [];
|
|
121
|
+
if (opts.extractText) {
|
|
122
|
+
try {
|
|
123
|
+
textFragments = extractTextFromPage(pageDict, doc);
|
|
124
|
+
text = reconstructText(textFragments);
|
|
125
|
+
textLines = reconstructTextLines(textFragments);
|
|
117
126
|
}
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
try {
|
|
122
|
-
images = extractImagesFromPage(pageDict, doc);
|
|
123
|
-
}
|
|
124
|
-
catch (err) {
|
|
125
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
126
|
-
warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
|
|
127
|
-
}
|
|
127
|
+
catch (err) {
|
|
128
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
129
|
+
warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
|
|
128
130
|
}
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
131
|
+
}
|
|
132
|
+
let images = [];
|
|
133
|
+
if (opts.extractImages) {
|
|
134
|
+
try {
|
|
135
|
+
images = extractImagesFromPage(pageDict, doc);
|
|
136
|
+
}
|
|
137
|
+
catch (err) {
|
|
138
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
139
|
+
warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
let annotations = [];
|
|
143
|
+
if (opts.extractAnnotations) {
|
|
144
|
+
try {
|
|
145
|
+
annotations = extractAnnotationsFromPage(pageDict, doc);
|
|
146
|
+
}
|
|
147
|
+
catch (err) {
|
|
148
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
149
|
+
warnings.push(`Annotation extraction failed on page ${pageNumber}: ${msg}`);
|
|
139
150
|
}
|
|
140
|
-
// Get page dimensions
|
|
141
|
-
const { width, height } = getPageDimensions(pageDict, doc);
|
|
142
|
-
pages.push({
|
|
143
|
-
pageNumber,
|
|
144
|
-
text,
|
|
145
|
-
textLines,
|
|
146
|
-
textFragments,
|
|
147
|
-
images,
|
|
148
|
-
annotations,
|
|
149
|
-
width,
|
|
150
|
-
height,
|
|
151
|
-
warnings
|
|
152
|
-
});
|
|
153
151
|
}
|
|
154
|
-
|
|
152
|
+
const { width, height } = getPageDimensions(pageDict, doc);
|
|
153
|
+
return {
|
|
154
|
+
pageNumber,
|
|
155
|
+
text,
|
|
156
|
+
textLines,
|
|
157
|
+
textFragments,
|
|
158
|
+
images,
|
|
159
|
+
annotations,
|
|
160
|
+
width,
|
|
161
|
+
height,
|
|
162
|
+
warnings
|
|
163
|
+
};
|
|
164
|
+
}
|
|
165
|
+
/**
|
|
166
|
+
* Finalize: concatenate text, update metadata page count, extract form fields.
|
|
167
|
+
*/
|
|
168
|
+
function finalizeRead(pages, totalPageCount, metadata, opts, doc) {
|
|
155
169
|
const allText = pages.map(p => p.text).join("\n\n");
|
|
156
|
-
// Update page count in metadata
|
|
157
170
|
if (opts.extractMetadata) {
|
|
158
|
-
metadata.pageCount = pagesInfo.length;
|
|
171
|
+
metadata.pageCount = totalPageCount;
|
|
159
172
|
}
|
|
160
|
-
// Extract form fields (document-level, not per-page)
|
|
161
173
|
let formFields = [];
|
|
162
174
|
if (opts.extractFormFields) {
|
|
163
175
|
try {
|
|
@@ -167,12 +179,7 @@ export function readPdf(data, options) {
|
|
|
167
179
|
// Non-fatal — just return empty
|
|
168
180
|
}
|
|
169
181
|
}
|
|
170
|
-
return {
|
|
171
|
-
text: allText,
|
|
172
|
-
pages,
|
|
173
|
-
metadata,
|
|
174
|
-
formFields
|
|
175
|
-
};
|
|
182
|
+
return { text: allText, pages, metadata, formFields };
|
|
176
183
|
}
|
|
177
184
|
// =============================================================================
|
|
178
185
|
// Helpers
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared rendering constants used by both the layout engine and page renderer.
|
|
3
|
+
*
|
|
4
|
+
* Keeping these in one place ensures row-height computation and text rendering
|
|
5
|
+
* use exactly the same values, preventing clipped or overlapping content.
|
|
6
|
+
*/
|
|
7
|
+
/** Horizontal cell padding in points (left + right = 2 × CELL_PADDING_H). */
|
|
8
|
+
export declare const CELL_PADDING_H = 3;
|
|
9
|
+
/** Vertical cell padding in points (top + bottom = 2 × CELL_PADDING_V). */
|
|
10
|
+
export declare const CELL_PADDING_V = 2;
|
|
11
|
+
/**
|
|
12
|
+
* Line-height multiplier applied to the font size.
|
|
13
|
+
*
|
|
14
|
+
* Excel's default row height for an 11pt font is 15pt, which after removing
|
|
15
|
+
* vertical padding (2 × 2 = 4pt) leaves 11pt × 1.0 — but Excel also adds
|
|
16
|
+
* internal leading. A factor of 1.2 matches standard PDF/typographic practice
|
|
17
|
+
* and keeps text readable without inflating row heights.
|
|
18
|
+
*/
|
|
19
|
+
export declare const LINE_HEIGHT_FACTOR = 1.2;
|
|
20
|
+
/** Width of one indent level in points (~3 characters at 11pt). */
|
|
21
|
+
export declare const INDENT_WIDTH = 10;
|
|
22
|
+
/**
|
|
23
|
+
* Excel column widths are measured in characters of the default font's digit width.
|
|
24
|
+
* For Calibri 11pt (the default), maxDigitWidth ≈ 7 pixels at 96 DPI.
|
|
25
|
+
* Excel adds 5 pixels of padding per column (4px text margin + 1px gridline).
|
|
26
|
+
* To convert to PDF points: (charWidth × 7 + 5) × (72/96).
|
|
27
|
+
*/
|
|
28
|
+
export declare const MAX_DIGIT_WIDTH_PX = 7;
|
|
29
|
+
export declare const EXCEL_COLUMN_PADDING_PX = 5;
|
|
30
|
+
export declare const PX_TO_PT: number;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared rendering constants used by both the layout engine and page renderer.
|
|
3
|
+
*
|
|
4
|
+
* Keeping these in one place ensures row-height computation and text rendering
|
|
5
|
+
* use exactly the same values, preventing clipped or overlapping content.
|
|
6
|
+
*/
|
|
7
|
+
/** Horizontal cell padding in points (left + right = 2 × CELL_PADDING_H). */
|
|
8
|
+
export const CELL_PADDING_H = 3;
|
|
9
|
+
/** Vertical cell padding in points (top + bottom = 2 × CELL_PADDING_V). */
|
|
10
|
+
export const CELL_PADDING_V = 2;
|
|
11
|
+
/**
|
|
12
|
+
* Line-height multiplier applied to the font size.
|
|
13
|
+
*
|
|
14
|
+
* Excel's default row height for an 11pt font is 15pt, which after removing
|
|
15
|
+
* vertical padding (2 × 2 = 4pt) leaves 11pt × 1.0 — but Excel also adds
|
|
16
|
+
* internal leading. A factor of 1.2 matches standard PDF/typographic practice
|
|
17
|
+
* and keeps text readable without inflating row heights.
|
|
18
|
+
*/
|
|
19
|
+
export const LINE_HEIGHT_FACTOR = 1.2;
|
|
20
|
+
/** Width of one indent level in points (~3 characters at 11pt). */
|
|
21
|
+
export const INDENT_WIDTH = 10;
|
|
22
|
+
/**
|
|
23
|
+
* Excel column widths are measured in characters of the default font's digit width.
|
|
24
|
+
* For Calibri 11pt (the default), maxDigitWidth ≈ 7 pixels at 96 DPI.
|
|
25
|
+
* Excel adds 5 pixels of padding per column (4px text margin + 1px gridline).
|
|
26
|
+
* To convert to PDF points: (charWidth × 7 + 5) × (72/96).
|
|
27
|
+
*/
|
|
28
|
+
export const MAX_DIGIT_WIDTH_PX = 7;
|
|
29
|
+
export const EXCEL_COLUMN_PADDING_PX = 5;
|
|
30
|
+
export const PX_TO_PT = 72 / 96; // 0.75
|
|
@@ -20,6 +20,7 @@ import type { PdfSheetData, ResolvedPdfOptions, LayoutPage } from "../types.js";
|
|
|
20
20
|
import type { FontManager } from "../font/font-manager.js";
|
|
21
21
|
/**
|
|
22
22
|
* Compute the layout for a sheet across one or more PDF pages.
|
|
23
|
+
* Yields to the event loop between each output page.
|
|
23
24
|
*/
|
|
24
|
-
export declare function layoutSheet(sheet: PdfSheetData, options: ResolvedPdfOptions, fontManager: FontManager): LayoutPage[];
|
|
25
|
+
export declare function layoutSheet(sheet: PdfSheetData, options: ResolvedPdfOptions, fontManager: FontManager): Promise<LayoutPage[]>;
|
|
25
26
|
export declare function paginateRows(rowHeights: number[], availableHeight: number, repeatRowCount: number, rowBreaks: Set<number>): number[][];
|