@cj-tech-master/excelts 8.1.2 → 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/README_zh.md +2 -2
- package/dist/browser/modules/excel/cell.js +11 -7
- package/dist/browser/modules/excel/column.js +7 -6
- package/dist/browser/modules/excel/row.js +5 -1
- package/dist/browser/modules/excel/stream/worksheet-reader.js +3 -2
- package/dist/browser/modules/excel/utils/cell-format.js +64 -2
- package/dist/browser/modules/pdf/excel-bridge.d.ts +4 -3
- package/dist/browser/modules/pdf/excel-bridge.js +18 -5
- package/dist/browser/modules/pdf/index.d.ts +3 -3
- package/dist/browser/modules/pdf/index.js +3 -3
- package/dist/browser/modules/pdf/pdf.d.ts +7 -6
- package/dist/browser/modules/pdf/pdf.js +7 -6
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +8 -7
- package/dist/browser/modules/pdf/reader/pdf-reader.js +81 -74
- package/dist/browser/modules/pdf/render/constants.d.ts +30 -0
- package/dist/browser/modules/pdf/render/constants.js +30 -0
- package/dist/browser/modules/pdf/render/layout-engine.d.ts +2 -1
- package/dist/browser/modules/pdf/render/layout-engine.js +359 -156
- package/dist/browser/modules/pdf/render/page-renderer.d.ts +2 -2
- package/dist/browser/modules/pdf/render/page-renderer.js +245 -107
- package/dist/browser/modules/pdf/render/pdf-exporter.d.ts +3 -2
- package/dist/browser/modules/pdf/render/pdf-exporter.js +145 -105
- package/dist/browser/modules/pdf/render/style-converter.js +27 -26
- package/dist/browser/modules/pdf/types.d.ts +8 -0
- package/dist/browser/utils/utils.base.d.ts +5 -0
- package/dist/browser/utils/utils.base.js +10 -0
- package/dist/cjs/modules/excel/cell.js +11 -7
- package/dist/cjs/modules/excel/column.js +7 -6
- package/dist/cjs/modules/excel/row.js +5 -1
- package/dist/cjs/modules/excel/stream/worksheet-reader.js +3 -2
- package/dist/cjs/modules/excel/utils/cell-format.js +64 -2
- package/dist/cjs/modules/pdf/excel-bridge.js +18 -5
- package/dist/cjs/modules/pdf/index.js +3 -3
- package/dist/cjs/modules/pdf/pdf.js +7 -6
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +81 -74
- package/dist/cjs/modules/pdf/render/constants.js +33 -0
- package/dist/cjs/modules/pdf/render/layout-engine.js +359 -156
- package/dist/cjs/modules/pdf/render/page-renderer.js +245 -107
- package/dist/cjs/modules/pdf/render/pdf-exporter.js +145 -105
- package/dist/cjs/modules/pdf/render/style-converter.js +27 -26
- package/dist/cjs/utils/utils.base.js +11 -0
- package/dist/esm/modules/excel/cell.js +11 -7
- package/dist/esm/modules/excel/column.js +7 -6
- package/dist/esm/modules/excel/row.js +5 -1
- package/dist/esm/modules/excel/stream/worksheet-reader.js +3 -2
- package/dist/esm/modules/excel/utils/cell-format.js +64 -2
- package/dist/esm/modules/pdf/excel-bridge.js +18 -5
- package/dist/esm/modules/pdf/index.js +3 -3
- package/dist/esm/modules/pdf/pdf.js +7 -6
- package/dist/esm/modules/pdf/reader/pdf-reader.js +81 -74
- package/dist/esm/modules/pdf/render/constants.js +30 -0
- package/dist/esm/modules/pdf/render/layout-engine.js +359 -156
- package/dist/esm/modules/pdf/render/page-renderer.js +245 -107
- package/dist/esm/modules/pdf/render/pdf-exporter.js +145 -105
- package/dist/esm/modules/pdf/render/style-converter.js +27 -26
- package/dist/esm/utils/utils.base.js +10 -0
- package/dist/iife/excelts.iife.js +1022 -677
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +48 -48
- package/dist/types/modules/pdf/excel-bridge.d.ts +4 -3
- package/dist/types/modules/pdf/index.d.ts +3 -3
- package/dist/types/modules/pdf/pdf.d.ts +7 -6
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +8 -7
- package/dist/types/modules/pdf/render/constants.d.ts +30 -0
- package/dist/types/modules/pdf/render/layout-engine.d.ts +2 -1
- package/dist/types/modules/pdf/render/page-renderer.d.ts +2 -2
- package/dist/types/modules/pdf/render/pdf-exporter.d.ts +3 -2
- package/dist/types/modules/pdf/types.d.ts +8 -0
- package/dist/types/utils/utils.base.d.ts +5 -0
- package/package.json +1 -1
|
@@ -522,6 +522,35 @@ function formatNumberPattern(val, fmt) {
|
|
|
522
522
|
const decimalPlaces = decFmt.replace(/[^0#?]/g, "").length;
|
|
523
523
|
// Round the value
|
|
524
524
|
const roundedVal = roundTo(scaledVal, decimalPlaces);
|
|
525
|
+
// When value is zero and the format has no required '0' digit placeholders,
|
|
526
|
+
// '?' placeholders become spaces and '#' placeholders produce nothing.
|
|
527
|
+
// This handles accounting format zero sections like "-"?? → "- " (dash + spaces).
|
|
528
|
+
if (roundedVal === 0 && !intFmt.includes("0") && !decFmt.includes("0")) {
|
|
529
|
+
let result = "";
|
|
530
|
+
for (const ch of intFmt) {
|
|
531
|
+
if (ch === "?") {
|
|
532
|
+
result += " ";
|
|
533
|
+
}
|
|
534
|
+
else if (ch !== "#" && ch !== ",") {
|
|
535
|
+
// Preserve literal characters (already unquoted at this point)
|
|
536
|
+
result += ch;
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
if (decimalPlaces > 0) {
|
|
540
|
+
// Only emit the decimal point if the decimal format has '?' or '0' placeholders.
|
|
541
|
+
// Pure '#' decimal digits produce nothing for zero values.
|
|
542
|
+
const hasDecContent = /[0?]/.test(decFmt);
|
|
543
|
+
if (hasDecContent) {
|
|
544
|
+
result += ".";
|
|
545
|
+
for (const ch of decFmt) {
|
|
546
|
+
if (ch === "?") {
|
|
547
|
+
result += " ";
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
return sign + result;
|
|
553
|
+
}
|
|
525
554
|
// Split into integer and decimal parts
|
|
526
555
|
const [intPart, decPart = ""] = roundedVal.toString().split(".");
|
|
527
556
|
// Check if format has literal characters mixed with digit placeholders (like "0-0", "000-0000")
|
|
@@ -561,16 +590,49 @@ function formatNumberPattern(val, fmt) {
|
|
|
561
590
|
if (intFmt.includes(",")) {
|
|
562
591
|
formattedInt = commaify(intPart);
|
|
563
592
|
}
|
|
564
|
-
// Pad integer with leading zeros if needed
|
|
593
|
+
// Pad integer with leading zeros/spaces if needed
|
|
594
|
+
// '0' placeholder → pad with "0", '?' placeholder → pad with " "
|
|
565
595
|
const minIntDigits = (intFmt.match(/0/g) ?? []).length;
|
|
596
|
+
const totalIntSlots = (intFmt.match(/[0?]/g) ?? []).length;
|
|
566
597
|
if (formattedInt.length < minIntDigits) {
|
|
567
598
|
formattedInt = "0".repeat(minIntDigits - formattedInt.length) + formattedInt;
|
|
568
599
|
}
|
|
600
|
+
if (formattedInt.length < totalIntSlots) {
|
|
601
|
+
formattedInt = " ".repeat(totalIntSlots - formattedInt.length) + formattedInt;
|
|
602
|
+
}
|
|
603
|
+
// '#' integer placeholder: suppress "0" when there are no required '0' or '?' digits
|
|
604
|
+
// and the integer value is zero (e.g. "#" format with value 0 → empty)
|
|
605
|
+
if (formattedInt === "0" && minIntDigits === 0 && totalIntSlots === 0) {
|
|
606
|
+
formattedInt = "";
|
|
607
|
+
}
|
|
569
608
|
}
|
|
570
609
|
// Format decimal part
|
|
571
610
|
let formattedDec = "";
|
|
572
611
|
if (decimalPlaces > 0) {
|
|
573
|
-
|
|
612
|
+
const rawDec = (decPart + "0".repeat(decimalPlaces)).substring(0, decimalPlaces);
|
|
613
|
+
// Process each decimal digit position according to its placeholder:
|
|
614
|
+
// '0' → always show digit, '?' → show digit or space, '#' → show digit or nothing (trim trailing)
|
|
615
|
+
const decChars = rawDec.split("");
|
|
616
|
+
// Walk from the end: '#' trailing zeros are removed, '?' trailing zeros become spaces
|
|
617
|
+
for (let i = decFmt.length - 1; i >= 0; i--) {
|
|
618
|
+
if (i >= decChars.length) {
|
|
619
|
+
continue;
|
|
620
|
+
}
|
|
621
|
+
if (decFmt[i] === "#" && decChars[i] === "0") {
|
|
622
|
+
decChars[i] = "";
|
|
623
|
+
}
|
|
624
|
+
else if (decFmt[i] === "?" && decChars[i] === "0") {
|
|
625
|
+
decChars[i] = " ";
|
|
626
|
+
}
|
|
627
|
+
else {
|
|
628
|
+
break; // stop at first non-zero or '0' placeholder
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
const decStr = decChars.join("");
|
|
632
|
+
// Only emit decimal point if there is content after it
|
|
633
|
+
if (decStr.length > 0) {
|
|
634
|
+
formattedDec = "." + decStr;
|
|
635
|
+
}
|
|
574
636
|
}
|
|
575
637
|
return sign + formattedInt + formattedDec;
|
|
576
638
|
}
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*
|
|
13
13
|
* const workbook = new Workbook();
|
|
14
14
|
* // ... build workbook ...
|
|
15
|
-
* const pdf = excelToPdf(workbook);
|
|
15
|
+
* const pdf = await excelToPdf(workbook);
|
|
16
16
|
* ```
|
|
17
17
|
*/
|
|
18
18
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
@@ -30,12 +30,13 @@ const types_1 = require("./types");
|
|
|
30
30
|
*
|
|
31
31
|
* This is a convenience function that converts the Workbook to the PDF module's
|
|
32
32
|
* data model and then generates the PDF.
|
|
33
|
+
* Yields to the event loop between each output page during layout and rendering.
|
|
33
34
|
*
|
|
34
35
|
* @param workbook - An Excel Workbook instance
|
|
35
36
|
* @param options - PDF export options
|
|
36
|
-
* @returns PDF file as a Uint8Array
|
|
37
|
+
* @returns Promise of PDF file as a Uint8Array
|
|
37
38
|
*/
|
|
38
|
-
function excelToPdf(workbook, options) {
|
|
39
|
+
async function excelToPdf(workbook, options) {
|
|
39
40
|
const pdfWorkbook = excelWorkbookToPdf(workbook);
|
|
40
41
|
return (0, pdf_exporter_1.exportPdf)(pdfWorkbook, options);
|
|
41
42
|
}
|
|
@@ -84,12 +85,24 @@ function convertSheet(ws, workbook) {
|
|
|
84
85
|
continue;
|
|
85
86
|
}
|
|
86
87
|
const cells = new Map();
|
|
87
|
-
row.eachCell({ includeEmpty:
|
|
88
|
-
|
|
88
|
+
row.eachCell({ includeEmpty: true }, cell => {
|
|
89
|
+
const hasValue = cell.type !== enums_1.ValueType.Null && cell.type !== enums_1.ValueType.Merge;
|
|
90
|
+
const hasStyle = cell.style &&
|
|
91
|
+
((cell.style.border &&
|
|
92
|
+
(cell.style.border.top ||
|
|
93
|
+
cell.style.border.right ||
|
|
94
|
+
cell.style.border.bottom ||
|
|
95
|
+
cell.style.border.left)) ||
|
|
96
|
+
cell.style.fill ||
|
|
97
|
+
cell.style.font);
|
|
98
|
+
if (hasValue || hasStyle) {
|
|
99
|
+
cells.set(cell.col, convertCell(cell));
|
|
100
|
+
}
|
|
89
101
|
});
|
|
90
102
|
rows.set(r, {
|
|
91
103
|
hidden: row.hidden || undefined,
|
|
92
104
|
height: row.height ?? undefined,
|
|
105
|
+
customHeight: row.customHeight || undefined,
|
|
93
106
|
cells
|
|
94
107
|
});
|
|
95
108
|
}
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* ```typescript
|
|
9
9
|
* import { pdf } from "excelts/pdf";
|
|
10
10
|
*
|
|
11
|
-
* const bytes = pdf([
|
|
11
|
+
* const bytes = await pdf([
|
|
12
12
|
* ["Product", "Revenue"],
|
|
13
13
|
* ["Widget", 1000],
|
|
14
14
|
* ["Gadget", 2500]
|
|
@@ -23,14 +23,14 @@
|
|
|
23
23
|
* const workbook = new Workbook();
|
|
24
24
|
* const sheet = workbook.addWorksheet("Sales");
|
|
25
25
|
* sheet.addRow(["Product", "Revenue"]);
|
|
26
|
-
* const bytes = excelToPdf(workbook);
|
|
26
|
+
* const bytes = await excelToPdf(workbook);
|
|
27
27
|
* ```
|
|
28
28
|
*
|
|
29
29
|
* @example Read PDF — extract text, images, and metadata:
|
|
30
30
|
* ```typescript
|
|
31
31
|
* import { readPdf } from "excelts/pdf";
|
|
32
32
|
*
|
|
33
|
-
* const result = readPdf(pdfBytes);
|
|
33
|
+
* const result = await readPdf(pdfBytes);
|
|
34
34
|
* console.log(result.text); // All text
|
|
35
35
|
* console.log(result.pages[0].text); // Page 1 text
|
|
36
36
|
* console.log(result.pages[0].images); // Page 1 images
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* ```typescript
|
|
10
10
|
* import { pdf } from "@cj-tech-master/excelts/pdf";
|
|
11
11
|
*
|
|
12
|
-
* const bytes = pdf([
|
|
12
|
+
* const bytes = await pdf([
|
|
13
13
|
* ["Product", "Revenue"],
|
|
14
14
|
* ["Widget", 1000],
|
|
15
15
|
* ["Gadget", 2500]
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
*
|
|
19
19
|
* @example With options:
|
|
20
20
|
* ```typescript
|
|
21
|
-
* const bytes = pdf([
|
|
21
|
+
* const bytes = await pdf([
|
|
22
22
|
* ["Name", "Score"],
|
|
23
23
|
* ["Alice", 95],
|
|
24
24
|
* ["Bob", 87]
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
*
|
|
28
28
|
* @example Multiple sheets:
|
|
29
29
|
* ```typescript
|
|
30
|
-
* const bytes = pdf({
|
|
30
|
+
* const bytes = await pdf({
|
|
31
31
|
* sheets: [
|
|
32
32
|
* { name: "Sales", data: [["Product", "Revenue"], ["Widget", 1000]] },
|
|
33
33
|
* { name: "Costs", data: [["Item", "Amount"], ["Rent", 500]] }
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
*
|
|
38
38
|
* @example With column widths and styles:
|
|
39
39
|
* ```typescript
|
|
40
|
-
* const bytes = pdf({
|
|
40
|
+
* const bytes = await pdf({
|
|
41
41
|
* name: "Report",
|
|
42
42
|
* columns: [{ width: 25 }, { width: 15 }],
|
|
43
43
|
* data: [
|
|
@@ -58,12 +58,13 @@ const pdf_exporter_1 = require("./render/pdf-exporter");
|
|
|
58
58
|
* Generate a PDF.
|
|
59
59
|
*
|
|
60
60
|
* Accepts anything from a plain 2D array to a multi-sheet workbook.
|
|
61
|
+
* Yields to the event loop between each output page during layout and rendering.
|
|
61
62
|
*
|
|
62
63
|
* @param input - 2D array, sheet object, or workbook object
|
|
63
64
|
* @param options - PDF export options (page size, margins, etc.)
|
|
64
|
-
* @returns PDF file as Uint8Array
|
|
65
|
+
* @returns Promise of PDF file as Uint8Array
|
|
65
66
|
*/
|
|
66
|
-
function pdf(input, options) {
|
|
67
|
+
async function pdf(input, options) {
|
|
67
68
|
const workbook = normalizeInput(input);
|
|
68
69
|
return (0, pdf_exporter_1.exportPdf)(workbook, options);
|
|
69
70
|
}
|
|
@@ -17,18 +17,18 @@
|
|
|
17
17
|
* - Cross-reference tables and streams (PDF 1.5+)
|
|
18
18
|
* - Incremental updates and xref recovery
|
|
19
19
|
*
|
|
20
|
-
* @example
|
|
20
|
+
* @example Text extraction:
|
|
21
21
|
* ```typescript
|
|
22
22
|
* import { readPdf } from "excelts/pdf";
|
|
23
23
|
*
|
|
24
|
-
* const pdf = readPdf(pdfBytes);
|
|
24
|
+
* const pdf = await readPdf(pdfBytes);
|
|
25
25
|
* console.log(pdf.text); // All text from all pages
|
|
26
26
|
* console.log(pdf.pages[0].text); // Text from page 1
|
|
27
27
|
* ```
|
|
28
28
|
*
|
|
29
29
|
* @example Image extraction:
|
|
30
30
|
* ```typescript
|
|
31
|
-
* const pdf = readPdf(pdfBytes);
|
|
31
|
+
* const pdf = await readPdf(pdfBytes);
|
|
32
32
|
* for (const image of pdf.pages[0].images) {
|
|
33
33
|
* console.log(image.format, image.width, image.height);
|
|
34
34
|
* fs.writeFileSync(`image.${image.format}`, image.data);
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
*
|
|
38
38
|
* @example Metadata:
|
|
39
39
|
* ```typescript
|
|
40
|
-
* const pdf = readPdf(pdfBytes);
|
|
40
|
+
* const pdf = await readPdf(pdfBytes);
|
|
41
41
|
* console.log(pdf.metadata.title);
|
|
42
42
|
* console.log(pdf.metadata.author);
|
|
43
43
|
* console.log(pdf.metadata.pageCount);
|
|
@@ -45,7 +45,7 @@
|
|
|
45
45
|
*
|
|
46
46
|
* @example Encrypted PDF:
|
|
47
47
|
* ```typescript
|
|
48
|
-
* const pdf = readPdf(pdfBytes, { password: "secret" });
|
|
48
|
+
* const pdf = await readPdf(pdfBytes, { password: "secret" });
|
|
49
49
|
* ```
|
|
50
50
|
*/
|
|
51
51
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
@@ -59,19 +59,36 @@ const annotation_extractor_1 = require("./annotation-extractor");
|
|
|
59
59
|
const form_extractor_1 = require("./form-extractor");
|
|
60
60
|
const metadata_reader_1 = require("./metadata-reader");
|
|
61
61
|
const errors_1 = require("../errors");
|
|
62
|
+
const utils_base_1 = require("../../../utils/utils.base.js");
|
|
62
63
|
// =============================================================================
|
|
63
64
|
// Public API
|
|
64
65
|
// =============================================================================
|
|
65
66
|
/**
|
|
66
67
|
* Read a PDF file and extract text, images, and metadata.
|
|
68
|
+
* Yields to the event loop between pages to avoid blocking.
|
|
67
69
|
*
|
|
68
70
|
* @param data - Raw PDF file bytes
|
|
69
71
|
* @param options - Extraction options
|
|
70
|
-
* @returns
|
|
72
|
+
* @returns Promise of extracted content
|
|
71
73
|
* @throws {PdfStructureError} If the PDF structure is invalid
|
|
72
74
|
* @throws {PdfError} If decryption fails (wrong password)
|
|
73
75
|
*/
|
|
74
|
-
function readPdf(data, options) {
|
|
76
|
+
async function readPdf(data, options) {
|
|
77
|
+
const { doc, opts, metadata, pagesInfo, pageIndicesToProcess } = prepareRead(data, options);
|
|
78
|
+
const pages = [];
|
|
79
|
+
for (let i = 0; i < pageIndicesToProcess.length; i++) {
|
|
80
|
+
const pageIdx = pageIndicesToProcess[i];
|
|
81
|
+
pages.push(processPage(pagesInfo[pageIdx].dict, pageIdx, doc, opts));
|
|
82
|
+
if (i < pageIndicesToProcess.length - 1) {
|
|
83
|
+
await (0, utils_base_1.yieldToEventLoop)();
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
return finalizeRead(pages, pagesInfo.length, metadata, opts, doc);
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Shared setup: parse document, handle encryption, extract metadata, resolve pages.
|
|
90
|
+
*/
|
|
91
|
+
function prepareRead(data, options) {
|
|
75
92
|
const opts = {
|
|
76
93
|
password: options?.password ?? "",
|
|
77
94
|
pages: options?.pages,
|
|
@@ -81,86 +98,81 @@ function readPdf(data, options) {
|
|
|
81
98
|
extractAnnotations: options?.extractAnnotations ?? true,
|
|
82
99
|
extractFormFields: options?.extractFormFields ?? true
|
|
83
100
|
};
|
|
84
|
-
// Parse document structure
|
|
85
101
|
const doc = new pdf_document_1.PdfDocument(data);
|
|
86
|
-
// Handle encryption
|
|
87
102
|
if ((0, pdf_decrypt_1.isEncrypted)(doc)) {
|
|
88
103
|
const success = (0, pdf_decrypt_1.initDecryption)(doc, opts.password);
|
|
89
104
|
if (!success) {
|
|
90
105
|
throw new errors_1.PdfStructureError("Failed to decrypt PDF: incorrect password");
|
|
91
106
|
}
|
|
92
107
|
}
|
|
93
|
-
// Extract metadata
|
|
94
108
|
const metadata = opts.extractMetadata ? (0, metadata_reader_1.extractMetadata)(doc) : createEmptyMetadata();
|
|
95
|
-
// Get pages (with object identity for correct decryption)
|
|
96
109
|
const pagesInfo = doc.getPagesWithObjInfo();
|
|
97
110
|
const pageIndicesToProcess = opts.pages
|
|
98
111
|
? opts.pages.map(p => p - 1).filter(p => p >= 0 && p < pagesInfo.length)
|
|
99
112
|
: Array.from({ length: pagesInfo.length }, (_, i) => i);
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
catch (err) {
|
|
117
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
118
|
-
warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
|
|
119
|
-
}
|
|
113
|
+
return { doc, opts, metadata, pagesInfo, pageIndicesToProcess };
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Process a single page: extract text, images, annotations, and dimensions.
|
|
117
|
+
*/
|
|
118
|
+
function processPage(pageDict, pageIdx, doc, opts) {
|
|
119
|
+
const pageNumber = pageIdx + 1;
|
|
120
|
+
const warnings = [];
|
|
121
|
+
let text = "";
|
|
122
|
+
let textLines = [];
|
|
123
|
+
let textFragments = [];
|
|
124
|
+
if (opts.extractText) {
|
|
125
|
+
try {
|
|
126
|
+
textFragments = (0, content_interpreter_1.extractTextFromPage)(pageDict, doc);
|
|
127
|
+
text = (0, text_reconstruction_1.reconstructText)(textFragments);
|
|
128
|
+
textLines = (0, text_reconstruction_1.reconstructTextLines)(textFragments);
|
|
120
129
|
}
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
try {
|
|
125
|
-
images = (0, image_extractor_1.extractImagesFromPage)(pageDict, doc);
|
|
126
|
-
}
|
|
127
|
-
catch (err) {
|
|
128
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
129
|
-
warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
|
|
130
|
-
}
|
|
130
|
+
catch (err) {
|
|
131
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
132
|
+
warnings.push(`Text extraction failed on page ${pageNumber}: ${msg}`);
|
|
131
133
|
}
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
134
|
+
}
|
|
135
|
+
let images = [];
|
|
136
|
+
if (opts.extractImages) {
|
|
137
|
+
try {
|
|
138
|
+
images = (0, image_extractor_1.extractImagesFromPage)(pageDict, doc);
|
|
139
|
+
}
|
|
140
|
+
catch (err) {
|
|
141
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
142
|
+
warnings.push(`Image extraction failed on page ${pageNumber}: ${msg}`);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
let annotations = [];
|
|
146
|
+
if (opts.extractAnnotations) {
|
|
147
|
+
try {
|
|
148
|
+
annotations = (0, annotation_extractor_1.extractAnnotationsFromPage)(pageDict, doc);
|
|
149
|
+
}
|
|
150
|
+
catch (err) {
|
|
151
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
152
|
+
warnings.push(`Annotation extraction failed on page ${pageNumber}: ${msg}`);
|
|
142
153
|
}
|
|
143
|
-
// Get page dimensions
|
|
144
|
-
const { width, height } = getPageDimensions(pageDict, doc);
|
|
145
|
-
pages.push({
|
|
146
|
-
pageNumber,
|
|
147
|
-
text,
|
|
148
|
-
textLines,
|
|
149
|
-
textFragments,
|
|
150
|
-
images,
|
|
151
|
-
annotations,
|
|
152
|
-
width,
|
|
153
|
-
height,
|
|
154
|
-
warnings
|
|
155
|
-
});
|
|
156
154
|
}
|
|
157
|
-
|
|
155
|
+
const { width, height } = getPageDimensions(pageDict, doc);
|
|
156
|
+
return {
|
|
157
|
+
pageNumber,
|
|
158
|
+
text,
|
|
159
|
+
textLines,
|
|
160
|
+
textFragments,
|
|
161
|
+
images,
|
|
162
|
+
annotations,
|
|
163
|
+
width,
|
|
164
|
+
height,
|
|
165
|
+
warnings
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
/**
|
|
169
|
+
* Finalize: concatenate text, update metadata page count, extract form fields.
|
|
170
|
+
*/
|
|
171
|
+
function finalizeRead(pages, totalPageCount, metadata, opts, doc) {
|
|
158
172
|
const allText = pages.map(p => p.text).join("\n\n");
|
|
159
|
-
// Update page count in metadata
|
|
160
173
|
if (opts.extractMetadata) {
|
|
161
|
-
metadata.pageCount =
|
|
174
|
+
metadata.pageCount = totalPageCount;
|
|
162
175
|
}
|
|
163
|
-
// Extract form fields (document-level, not per-page)
|
|
164
176
|
let formFields = [];
|
|
165
177
|
if (opts.extractFormFields) {
|
|
166
178
|
try {
|
|
@@ -170,12 +182,7 @@ function readPdf(data, options) {
|
|
|
170
182
|
// Non-fatal — just return empty
|
|
171
183
|
}
|
|
172
184
|
}
|
|
173
|
-
return {
|
|
174
|
-
text: allText,
|
|
175
|
-
pages,
|
|
176
|
-
metadata,
|
|
177
|
-
formFields
|
|
178
|
-
};
|
|
185
|
+
return { text: allText, pages, metadata, formFields };
|
|
179
186
|
}
|
|
180
187
|
// =============================================================================
|
|
181
188
|
// Helpers
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Shared rendering constants used by both the layout engine and page renderer.
|
|
4
|
+
*
|
|
5
|
+
* Keeping these in one place ensures row-height computation and text rendering
|
|
6
|
+
* use exactly the same values, preventing clipped or overlapping content.
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.PX_TO_PT = exports.EXCEL_COLUMN_PADDING_PX = exports.MAX_DIGIT_WIDTH_PX = exports.INDENT_WIDTH = exports.LINE_HEIGHT_FACTOR = exports.CELL_PADDING_V = exports.CELL_PADDING_H = void 0;
|
|
10
|
+
/** Horizontal cell padding in points (left + right = 2 × CELL_PADDING_H). */
|
|
11
|
+
exports.CELL_PADDING_H = 3;
|
|
12
|
+
/** Vertical cell padding in points (top + bottom = 2 × CELL_PADDING_V). */
|
|
13
|
+
exports.CELL_PADDING_V = 2;
|
|
14
|
+
/**
|
|
15
|
+
* Line-height multiplier applied to the font size.
|
|
16
|
+
*
|
|
17
|
+
* Excel's default row height for an 11pt font is 15pt, which after removing
|
|
18
|
+
* vertical padding (2 × 2 = 4pt) leaves 11pt × 1.0 — but Excel also adds
|
|
19
|
+
* internal leading. A factor of 1.2 matches standard PDF/typographic practice
|
|
20
|
+
* and keeps text readable without inflating row heights.
|
|
21
|
+
*/
|
|
22
|
+
exports.LINE_HEIGHT_FACTOR = 1.2;
|
|
23
|
+
/** Width of one indent level in points (~3 characters at 11pt). */
|
|
24
|
+
exports.INDENT_WIDTH = 10;
|
|
25
|
+
/**
|
|
26
|
+
* Excel column widths are measured in characters of the default font's digit width.
|
|
27
|
+
* For Calibri 11pt (the default), maxDigitWidth ≈ 7 pixels at 96 DPI.
|
|
28
|
+
* Excel adds 5 pixels of padding per column (4px text margin + 1px gridline).
|
|
29
|
+
* To convert to PDF points: (charWidth × 7 + 5) × (72/96).
|
|
30
|
+
*/
|
|
31
|
+
exports.MAX_DIGIT_WIDTH_PX = 7;
|
|
32
|
+
exports.EXCEL_COLUMN_PADDING_PX = 5;
|
|
33
|
+
exports.PX_TO_PT = 72 / 96; // 0.75
|