@cj-tech-master/excelts 8.1.2 → 9.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +2 -2
  2. package/README_zh.md +2 -2
  3. package/dist/browser/modules/excel/cell.js +11 -7
  4. package/dist/browser/modules/excel/column.js +7 -6
  5. package/dist/browser/modules/excel/row.js +5 -1
  6. package/dist/browser/modules/excel/stream/worksheet-reader.js +3 -2
  7. package/dist/browser/modules/excel/utils/cell-format.js +64 -2
  8. package/dist/browser/modules/pdf/excel-bridge.d.ts +4 -3
  9. package/dist/browser/modules/pdf/excel-bridge.js +18 -5
  10. package/dist/browser/modules/pdf/index.d.ts +3 -3
  11. package/dist/browser/modules/pdf/index.js +3 -3
  12. package/dist/browser/modules/pdf/pdf.d.ts +7 -6
  13. package/dist/browser/modules/pdf/pdf.js +7 -6
  14. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +8 -7
  15. package/dist/browser/modules/pdf/reader/pdf-reader.js +81 -74
  16. package/dist/browser/modules/pdf/render/constants.d.ts +30 -0
  17. package/dist/browser/modules/pdf/render/constants.js +30 -0
  18. package/dist/browser/modules/pdf/render/layout-engine.d.ts +2 -1
  19. package/dist/browser/modules/pdf/render/layout-engine.js +359 -156
  20. package/dist/browser/modules/pdf/render/page-renderer.d.ts +2 -2
  21. package/dist/browser/modules/pdf/render/page-renderer.js +245 -107
  22. package/dist/browser/modules/pdf/render/pdf-exporter.d.ts +3 -2
  23. package/dist/browser/modules/pdf/render/pdf-exporter.js +145 -105
  24. package/dist/browser/modules/pdf/render/style-converter.js +27 -26
  25. package/dist/browser/modules/pdf/types.d.ts +8 -0
  26. package/dist/browser/utils/utils.base.d.ts +5 -0
  27. package/dist/browser/utils/utils.base.js +10 -0
  28. package/dist/cjs/modules/excel/cell.js +11 -7
  29. package/dist/cjs/modules/excel/column.js +7 -6
  30. package/dist/cjs/modules/excel/row.js +5 -1
  31. package/dist/cjs/modules/excel/stream/worksheet-reader.js +3 -2
  32. package/dist/cjs/modules/excel/utils/cell-format.js +64 -2
  33. package/dist/cjs/modules/pdf/excel-bridge.js +18 -5
  34. package/dist/cjs/modules/pdf/index.js +3 -3
  35. package/dist/cjs/modules/pdf/pdf.js +7 -6
  36. package/dist/cjs/modules/pdf/reader/pdf-reader.js +81 -74
  37. package/dist/cjs/modules/pdf/render/constants.js +33 -0
  38. package/dist/cjs/modules/pdf/render/layout-engine.js +359 -156
  39. package/dist/cjs/modules/pdf/render/page-renderer.js +245 -107
  40. package/dist/cjs/modules/pdf/render/pdf-exporter.js +145 -105
  41. package/dist/cjs/modules/pdf/render/style-converter.js +27 -26
  42. package/dist/cjs/utils/utils.base.js +11 -0
  43. package/dist/esm/modules/excel/cell.js +11 -7
  44. package/dist/esm/modules/excel/column.js +7 -6
  45. package/dist/esm/modules/excel/row.js +5 -1
  46. package/dist/esm/modules/excel/stream/worksheet-reader.js +3 -2
  47. package/dist/esm/modules/excel/utils/cell-format.js +64 -2
  48. package/dist/esm/modules/pdf/excel-bridge.js +18 -5
  49. package/dist/esm/modules/pdf/index.js +3 -3
  50. package/dist/esm/modules/pdf/pdf.js +7 -6
  51. package/dist/esm/modules/pdf/reader/pdf-reader.js +81 -74
  52. package/dist/esm/modules/pdf/render/constants.js +30 -0
  53. package/dist/esm/modules/pdf/render/layout-engine.js +359 -156
  54. package/dist/esm/modules/pdf/render/page-renderer.js +245 -107
  55. package/dist/esm/modules/pdf/render/pdf-exporter.js +145 -105
  56. package/dist/esm/modules/pdf/render/style-converter.js +27 -26
  57. package/dist/esm/utils/utils.base.js +10 -0
  58. package/dist/iife/excelts.iife.js +1022 -677
  59. package/dist/iife/excelts.iife.js.map +1 -1
  60. package/dist/iife/excelts.iife.min.js +48 -48
  61. package/dist/types/modules/pdf/excel-bridge.d.ts +4 -3
  62. package/dist/types/modules/pdf/index.d.ts +3 -3
  63. package/dist/types/modules/pdf/pdf.d.ts +7 -6
  64. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +8 -7
  65. package/dist/types/modules/pdf/render/constants.d.ts +30 -0
  66. package/dist/types/modules/pdf/render/layout-engine.d.ts +2 -1
  67. package/dist/types/modules/pdf/render/page-renderer.d.ts +2 -2
  68. package/dist/types/modules/pdf/render/pdf-exporter.d.ts +3 -2
  69. package/dist/types/modules/pdf/types.d.ts +8 -0
  70. package/dist/types/utils/utils.base.d.ts +5 -0
  71. package/package.json +1 -1
@@ -16,18 +16,18 @@
16
16
  * - Cross-reference tables and streams (PDF 1.5+)
17
17
  * - Incremental updates and xref recovery
18
18
  *
19
- * @example Basic text extraction:
19
+ * @example Text extraction:
20
20
  * ```typescript
21
21
  * import { readPdf } from "excelts/pdf";
22
22
  *
23
- * const pdf = readPdf(pdfBytes);
23
+ * const pdf = await readPdf(pdfBytes);
24
24
  * console.log(pdf.text); // All text from all pages
25
25
  * console.log(pdf.pages[0].text); // Text from page 1
26
26
  * ```
27
27
  *
28
28
  * @example Image extraction:
29
29
  * ```typescript
30
- * const pdf = readPdf(pdfBytes);
30
+ * const pdf = await readPdf(pdfBytes);
31
31
  * for (const image of pdf.pages[0].images) {
32
32
  * console.log(image.format, image.width, image.height);
33
33
  * fs.writeFileSync(`image.${image.format}`, image.data);
@@ -36,7 +36,7 @@
36
36
  *
37
37
  * @example Metadata:
38
38
  * ```typescript
39
- * const pdf = readPdf(pdfBytes);
39
+ * const pdf = await readPdf(pdfBytes);
40
40
  * console.log(pdf.metadata.title);
41
41
  * console.log(pdf.metadata.author);
42
42
  * console.log(pdf.metadata.pageCount);
@@ -44,7 +44,7 @@
44
44
  *
45
45
  * @example Encrypted PDF:
46
46
  * ```typescript
47
- * const pdf = readPdf(pdfBytes, { password: "secret" });
47
+ * const pdf = await readPdf(pdfBytes, { password: "secret" });
48
48
  * ```
49
49
  */
50
50
  import { PdfDocument } from "./pdf-document.js";
@@ -56,19 +56,36 @@ import { extractAnnotationsFromPage } from "./annotation-extractor.js";
56
56
  import { extractFormFields } from "./form-extractor.js";
57
57
  import { extractMetadata } from "./metadata-reader.js";
58
58
  import { PdfStructureError } from "../errors.js";
59
+ import { yieldToEventLoop } from "../../../utils/utils.base.js";
59
60
  // =============================================================================
60
61
  // Public API
61
62
  // =============================================================================
62
63
  /**
63
64
  * Read a PDF file and extract text, images, and metadata.
65
+ * Yields to the event loop between pages to avoid blocking.
64
66
  *
65
67
  * @param data - Raw PDF file bytes
66
68
  * @param options - Extraction options
67
- * @returns Extracted content
69
+ * @returns Promise of extracted content
68
70
  * @throws {PdfStructureError} If the PDF structure is invalid
69
71
  * @throws {PdfError} If decryption fails (wrong password)
70
72
  */
71
- export function readPdf(data, options) {
73
+ export async function readPdf(data, options) {
74
+ const { doc, opts, metadata, pagesInfo, pageIndicesToProcess } = prepareRead(data, options);
75
+ const pages = [];
76
+ for (let i = 0; i < pageIndicesToProcess.length; i++) {
77
+ const pageIdx = pageIndicesToProcess[i];
78
+ pages.push(processPage(pagesInfo[pageIdx].dict, pageIdx, doc, opts));
79
+ if (i < pageIndicesToProcess.length - 1) {
80
+ await yieldToEventLoop();
81
+ }
82
+ }
83
+ return finalizeRead(pages, pagesInfo.length, metadata, opts, doc);
84
+ }
85
+ /**
86
+ * Shared setup: parse document, handle encryption, extract metadata, resolve pages.
87
+ */
88
+ function prepareRead(data, options) {
72
89
  const opts = {
73
90
  password: options?.password ?? "",
74
91
  pages: options?.pages,
@@ -78,86 +95,81 @@ export function readPdf(data, options) {
78
95
  extractAnnotations: options?.extractAnnotations ?? true,
79
96
  extractFormFields: options?.extractFormFields ?? true
80
97
  };
81
- // Parse document structure
82
98
  const doc = new PdfDocument(data);
83
- // Handle encryption
84
99
  if (isEncrypted(doc)) {
85
100
  const success = initDecryption(doc, opts.password);
86
101
  if (!success) {
87
102
  throw new PdfStructureError("Failed to decrypt PDF: incorrect password");
88
103
  }
89
104
  }
90
- // Extract metadata
91
105
  const metadata = opts.extractMetadata ? extractMetadata(doc) : createEmptyMetadata();
92
- // Get pages (with object identity for correct decryption)
93
106
  const pagesInfo = doc.getPagesWithObjInfo();
94
107
  const pageIndicesToProcess = opts.pages
95
108
  ? opts.pages.map(p => p - 1).filter(p => p >= 0 && p < pagesInfo.length)
96
109
  : Array.from({ length: pagesInfo.length }, (_, i) => i);
97
- // Process each page
98
- const pages = [];
99
- for (const pageIdx of pageIndicesToProcess) {
100
- const { dict: pageDict } = pagesInfo[pageIdx];
101
- const pageNumber = pageIdx + 1;
102
- const warnings = [];
103
- // Extract text
104
- let text = "";
105
- let textLines = [];
106
- let textFragments = [];
107
- if (opts.extractText) {
108
- try {
109
- textFragments = extractTextFromPage(pageDict, doc);
110
- text = reconstructText(textFragments);
111
- textLines = reconstructTextLines(textFragments);
112
- }
113
- catch (err) {
114
- const msg = err instanceof Error ? err.message : String(err);
115
- warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
116
- }
110
+ return { doc, opts, metadata, pagesInfo, pageIndicesToProcess };
111
+ }
112
+ /**
113
+ * Process a single page: extract text, images, annotations, and dimensions.
114
+ */
115
+ function processPage(pageDict, pageIdx, doc, opts) {
116
+ const pageNumber = pageIdx + 1;
117
+ const warnings = [];
118
+ let text = "";
119
+ let textLines = [];
120
+ let textFragments = [];
121
+ if (opts.extractText) {
122
+ try {
123
+ textFragments = extractTextFromPage(pageDict, doc);
124
+ text = reconstructText(textFragments);
125
+ textLines = reconstructTextLines(textFragments);
117
126
  }
118
- // Extract images
119
- let images = [];
120
- if (opts.extractImages) {
121
- try {
122
- images = extractImagesFromPage(pageDict, doc);
123
- }
124
- catch (err) {
125
- const msg = err instanceof Error ? err.message : String(err);
126
- warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
127
- }
127
+ catch (err) {
128
+ const msg = err instanceof Error ? err.message : String(err);
129
+ warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
128
130
  }
129
- // Extract annotations
130
- let annotations = [];
131
- if (opts.extractAnnotations) {
132
- try {
133
- annotations = extractAnnotationsFromPage(pageDict, doc);
134
- }
135
- catch (err) {
136
- const msg = err instanceof Error ? err.message : String(err);
137
- warnings.push(`Annotation extraction failed on page ${pageNumber}: ${msg}`);
138
- }
131
+ }
132
+ let images = [];
133
+ if (opts.extractImages) {
134
+ try {
135
+ images = extractImagesFromPage(pageDict, doc);
136
+ }
137
+ catch (err) {
138
+ const msg = err instanceof Error ? err.message : String(err);
139
+ warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
140
+ }
141
+ }
142
+ let annotations = [];
143
+ if (opts.extractAnnotations) {
144
+ try {
145
+ annotations = extractAnnotationsFromPage(pageDict, doc);
146
+ }
147
+ catch (err) {
148
+ const msg = err instanceof Error ? err.message : String(err);
149
+ warnings.push(`Annotation extraction failed on page ${pageNumber}: ${msg}`);
139
150
  }
140
- // Get page dimensions
141
- const { width, height } = getPageDimensions(pageDict, doc);
142
- pages.push({
143
- pageNumber,
144
- text,
145
- textLines,
146
- textFragments,
147
- images,
148
- annotations,
149
- width,
150
- height,
151
- warnings
152
- });
153
151
  }
154
- // Concatenate all page text
152
+ const { width, height } = getPageDimensions(pageDict, doc);
153
+ return {
154
+ pageNumber,
155
+ text,
156
+ textLines,
157
+ textFragments,
158
+ images,
159
+ annotations,
160
+ width,
161
+ height,
162
+ warnings
163
+ };
164
+ }
165
+ /**
166
+ * Finalize: concatenate text, update metadata page count, extract form fields.
167
+ */
168
+ function finalizeRead(pages, totalPageCount, metadata, opts, doc) {
155
169
  const allText = pages.map(p => p.text).join("\n\n");
156
- // Update page count in metadata
157
170
  if (opts.extractMetadata) {
158
- metadata.pageCount = pagesInfo.length;
171
+ metadata.pageCount = totalPageCount;
159
172
  }
160
- // Extract form fields (document-level, not per-page)
161
173
  let formFields = [];
162
174
  if (opts.extractFormFields) {
163
175
  try {
@@ -167,12 +179,7 @@ export function readPdf(data, options) {
167
179
  // Non-fatal — just return empty
168
180
  }
169
181
  }
170
- return {
171
- text: allText,
172
- pages,
173
- metadata,
174
- formFields
175
- };
182
+ return { text: allText, pages, metadata, formFields };
176
183
  }
177
184
  // =============================================================================
178
185
  // Helpers
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Shared rendering constants used by both the layout engine and page renderer.
3
+ *
4
+ * Keeping these in one place ensures row-height computation and text rendering
5
+ * use exactly the same values, preventing clipped or overlapping content.
6
+ */
7
+ /** Horizontal cell padding in points (left + right = 2 × CELL_PADDING_H). */
8
+ export declare const CELL_PADDING_H = 3;
9
+ /** Vertical cell padding in points (top + bottom = 2 × CELL_PADDING_V). */
10
+ export declare const CELL_PADDING_V = 2;
11
+ /**
12
+ * Line-height multiplier applied to the font size.
13
+ *
14
+ * Excel's default row height for an 11pt font is 15pt, which after removing
15
+ * vertical padding (2 × 2 = 4pt) leaves 11pt × 1.0 — but Excel also adds
16
+ * internal leading. A factor of 1.2 matches standard PDF/typographic practice
17
+ * and keeps text readable without inflating row heights.
18
+ */
19
+ export declare const LINE_HEIGHT_FACTOR = 1.2;
20
+ /** Width of one indent level in points (~3 characters at 11pt). */
21
+ export declare const INDENT_WIDTH = 10;
22
+ /**
23
+ * Excel column widths are measured in characters of the default font's digit width.
24
+ * For Calibri 11pt (the default), maxDigitWidth ≈ 7 pixels at 96 DPI.
25
+ * Excel adds 5 pixels of padding per column (4px text margin + 1px gridline).
26
+ * To convert to PDF points: (charWidth × 7 + 5) × (72/96).
27
+ */
28
+ export declare const MAX_DIGIT_WIDTH_PX = 7;
29
+ export declare const EXCEL_COLUMN_PADDING_PX = 5;
30
+ export declare const PX_TO_PT: number;
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Shared rendering constants used by both the layout engine and page renderer.
3
+ *
4
+ * Keeping these in one place ensures row-height computation and text rendering
5
+ * use exactly the same values, preventing clipped or overlapping content.
6
+ */
7
+ /** Horizontal cell padding in points (left + right = 2 × CELL_PADDING_H). */
8
+ export const CELL_PADDING_H = 3;
9
+ /** Vertical cell padding in points (top + bottom = 2 × CELL_PADDING_V). */
10
+ export const CELL_PADDING_V = 2;
11
+ /**
12
+ * Line-height multiplier applied to the font size.
13
+ *
14
+ * Excel's default row height for an 11pt font is 15pt, which after removing
15
+ * vertical padding (2 × 2 = 4pt) leaves 11pt × 1.0 — but Excel also adds
16
+ * internal leading. A factor of 1.2 matches standard PDF/typographic practice
17
+ * and keeps text readable without inflating row heights.
18
+ */
19
+ export const LINE_HEIGHT_FACTOR = 1.2;
20
+ /** Width of one indent level in points (~3 characters at 11pt). */
21
+ export const INDENT_WIDTH = 10;
22
+ /**
23
+ * Excel column widths are measured in characters of the default font's digit width.
24
+ * For Calibri 11pt (the default), maxDigitWidth ≈ 7 pixels at 96 DPI.
25
+ * Excel adds 5 pixels of padding per column (4px text margin + 1px gridline).
26
+ * To convert to PDF points: (charWidth × 7 + 5) × (72/96).
27
+ */
28
+ export const MAX_DIGIT_WIDTH_PX = 7;
29
+ export const EXCEL_COLUMN_PADDING_PX = 5;
30
+ export const PX_TO_PT = 72 / 96; // 0.75
@@ -20,6 +20,7 @@ import type { PdfSheetData, ResolvedPdfOptions, LayoutPage } from "../types.js";
20
20
  import type { FontManager } from "../font/font-manager.js";
21
21
  /**
22
22
  * Compute the layout for a sheet across one or more PDF pages.
23
+ * Yields to the event loop between each output page.
23
24
  */
24
- export declare function layoutSheet(sheet: PdfSheetData, options: ResolvedPdfOptions, fontManager: FontManager): LayoutPage[];
25
+ export declare function layoutSheet(sheet: PdfSheetData, options: ResolvedPdfOptions, fontManager: FontManager): Promise<LayoutPage[]>;
25
26
  export declare function paginateRows(rowHeights: number[], availableHeight: number, repeatRowCount: number, rowBreaks: Set<number>): number[][];