@cj-tech-master/excelts 9.1.0 → 9.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -1
- package/dist/browser/modules/archive/compression/crc32.js +1 -1
- package/dist/browser/modules/archive/crypto/aes.d.ts +0 -8
- package/dist/browser/modules/archive/crypto/aes.js +1 -20
- package/dist/browser/modules/archive/crypto/index.d.ts +2 -1
- package/dist/browser/modules/archive/crypto/index.js +3 -1
- package/dist/browser/modules/csv/parse/row-processor.d.ts +1 -1
- package/dist/browser/modules/csv/worker/worker-script.generated.js +1 -1
- package/dist/browser/modules/excel/utils/cell-matrix.js +1 -0
- package/dist/browser/modules/excel/utils/encryptor.browser.d.ts +4 -5
- package/dist/browser/modules/excel/utils/encryptor.browser.js +7 -12
- package/dist/browser/modules/excel/utils/encryptor.d.ts +1 -1
- package/dist/browser/modules/excel/utils/encryptor.js +4 -7
- package/dist/browser/modules/pdf/builder/document-builder.d.ts +517 -0
- package/dist/browser/modules/pdf/builder/document-builder.js +1493 -0
- package/dist/browser/modules/pdf/builder/form-appearance.d.ts +56 -0
- package/dist/browser/modules/pdf/builder/form-appearance.js +140 -0
- package/dist/browser/modules/pdf/builder/image-utils.d.ts +39 -0
- package/dist/browser/modules/pdf/builder/image-utils.js +129 -0
- package/dist/browser/modules/pdf/builder/pdf-editor.d.ts +230 -0
- package/dist/browser/modules/pdf/builder/pdf-editor.js +1574 -0
- package/dist/browser/modules/pdf/builder/resource-merger.d.ts +41 -0
- package/dist/browser/modules/pdf/builder/resource-merger.js +258 -0
- package/dist/browser/modules/pdf/core/digital-signature.d.ts +109 -0
- package/dist/browser/modules/pdf/core/digital-signature.js +659 -0
- package/dist/browser/modules/pdf/core/encryption.js +8 -7
- package/dist/browser/modules/pdf/core/pdf-object.d.ts +11 -0
- package/dist/browser/modules/pdf/core/pdf-object.js +38 -0
- package/dist/browser/modules/pdf/core/pdf-stream.d.ts +32 -0
- package/dist/browser/modules/pdf/core/pdf-stream.js +66 -0
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +55 -1
- package/dist/browser/modules/pdf/core/pdf-writer.js +271 -6
- package/dist/browser/modules/pdf/core/pdfa.d.ts +62 -0
- package/dist/browser/modules/pdf/core/pdfa.js +261 -0
- package/dist/browser/modules/pdf/index.d.ts +11 -0
- package/dist/browser/modules/pdf/index.js +9 -0
- package/dist/browser/modules/pdf/reader/bookmark-extractor.d.ts +35 -0
- package/dist/browser/modules/pdf/reader/bookmark-extractor.js +324 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +6 -5
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +17 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +26 -2
- package/dist/browser/modules/pdf/reader/table-extractor.d.ts +69 -0
- package/dist/browser/modules/pdf/reader/table-extractor.js +365 -0
- package/dist/browser/modules/pdf/render/layout-engine.d.ts +21 -1
- package/dist/browser/modules/pdf/render/layout-engine.js +112 -5
- package/dist/browser/modules/pdf/render/page-renderer.d.ts +2 -9
- package/dist/browser/modules/pdf/render/page-renderer.js +62 -103
- package/dist/browser/modules/pdf/render/pdf-exporter.js +2 -61
- package/dist/browser/modules/pdf/render/style-converter.d.ts +4 -0
- package/dist/browser/modules/pdf/render/style-converter.js +1 -1
- package/dist/browser/modules/pdf/types.d.ts +14 -1
- package/dist/browser/modules/stream/browser/readable.js +8 -2
- package/dist/browser/utils/crypto.browser.d.ts +64 -0
- package/dist/browser/{modules/pdf/core/crypto.js → utils/crypto.browser.js} +91 -101
- package/dist/browser/utils/crypto.d.ts +97 -0
- package/dist/browser/utils/crypto.js +209 -0
- package/dist/cjs/modules/archive/compression/crc32.js +1 -1
- package/dist/cjs/modules/archive/crypto/aes.js +2 -23
- package/dist/cjs/modules/archive/crypto/index.js +3 -1
- package/dist/cjs/modules/csv/worker/worker-script.generated.js +1 -1
- package/dist/cjs/modules/excel/utils/cell-matrix.js +1 -0
- package/dist/cjs/modules/excel/utils/encryptor.browser.js +7 -12
- package/dist/cjs/modules/excel/utils/encryptor.js +4 -10
- package/dist/cjs/modules/pdf/builder/document-builder.js +1532 -0
- package/dist/cjs/modules/pdf/builder/form-appearance.js +145 -0
- package/dist/cjs/modules/pdf/builder/image-utils.js +135 -0
- package/dist/cjs/modules/pdf/builder/pdf-editor.js +1612 -0
- package/dist/cjs/modules/pdf/builder/resource-merger.js +263 -0
- package/dist/cjs/modules/pdf/core/digital-signature.js +667 -0
- package/dist/cjs/modules/pdf/core/encryption.js +8 -7
- package/dist/cjs/modules/pdf/core/pdf-object.js +38 -0
- package/dist/cjs/modules/pdf/core/pdf-stream.js +66 -0
- package/dist/cjs/modules/pdf/core/pdf-writer.js +272 -6
- package/dist/cjs/modules/pdf/core/pdfa.js +266 -0
- package/dist/cjs/modules/pdf/index.js +19 -1
- package/dist/cjs/modules/pdf/reader/bookmark-extractor.js +327 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +6 -5
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +26 -2
- package/dist/cjs/modules/pdf/reader/table-extractor.js +368 -0
- package/dist/cjs/modules/pdf/render/layout-engine.js +113 -4
- package/dist/cjs/modules/pdf/render/page-renderer.js +63 -105
- package/dist/cjs/modules/pdf/render/pdf-exporter.js +3 -62
- package/dist/cjs/modules/pdf/render/style-converter.js +1 -0
- package/dist/cjs/modules/stream/browser/readable.js +8 -2
- package/dist/cjs/{modules/pdf/core/crypto.js → utils/crypto.browser.js} +95 -102
- package/dist/cjs/utils/crypto.js +228 -0
- package/dist/esm/modules/archive/compression/crc32.js +1 -1
- package/dist/esm/modules/archive/crypto/aes.js +1 -20
- package/dist/esm/modules/archive/crypto/index.js +3 -1
- package/dist/esm/modules/csv/worker/worker-script.generated.js +1 -1
- package/dist/esm/modules/excel/utils/cell-matrix.js +1 -0
- package/dist/esm/modules/excel/utils/encryptor.browser.js +7 -12
- package/dist/esm/modules/excel/utils/encryptor.js +4 -7
- package/dist/esm/modules/pdf/builder/document-builder.js +1493 -0
- package/dist/esm/modules/pdf/builder/form-appearance.js +140 -0
- package/dist/esm/modules/pdf/builder/image-utils.js +129 -0
- package/dist/esm/modules/pdf/builder/pdf-editor.js +1574 -0
- package/dist/esm/modules/pdf/builder/resource-merger.js +258 -0
- package/dist/esm/modules/pdf/core/digital-signature.js +659 -0
- package/dist/esm/modules/pdf/core/encryption.js +8 -7
- package/dist/esm/modules/pdf/core/pdf-object.js +38 -0
- package/dist/esm/modules/pdf/core/pdf-stream.js +66 -0
- package/dist/esm/modules/pdf/core/pdf-writer.js +271 -6
- package/dist/esm/modules/pdf/core/pdfa.js +261 -0
- package/dist/esm/modules/pdf/index.js +9 -0
- package/dist/esm/modules/pdf/reader/bookmark-extractor.js +324 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +6 -5
- package/dist/esm/modules/pdf/reader/pdf-reader.js +26 -2
- package/dist/esm/modules/pdf/reader/table-extractor.js +365 -0
- package/dist/esm/modules/pdf/render/layout-engine.js +112 -5
- package/dist/esm/modules/pdf/render/page-renderer.js +62 -103
- package/dist/esm/modules/pdf/render/pdf-exporter.js +2 -61
- package/dist/esm/modules/pdf/render/style-converter.js +1 -1
- package/dist/esm/modules/stream/browser/readable.js +8 -2
- package/dist/esm/{modules/pdf/core/crypto.js → utils/crypto.browser.js} +91 -101
- package/dist/esm/utils/crypto.js +209 -0
- package/dist/iife/excelts.iife.js +1248 -1074
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +53 -54
- package/dist/types/modules/archive/crypto/aes.d.ts +0 -8
- package/dist/types/modules/archive/crypto/index.d.ts +2 -1
- package/dist/types/modules/csv/parse/row-processor.d.ts +1 -1
- package/dist/types/modules/excel/utils/encryptor.browser.d.ts +4 -5
- package/dist/types/modules/excel/utils/encryptor.d.ts +1 -1
- package/dist/types/modules/pdf/builder/document-builder.d.ts +517 -0
- package/dist/types/modules/pdf/builder/form-appearance.d.ts +56 -0
- package/dist/types/modules/pdf/builder/image-utils.d.ts +39 -0
- package/dist/types/modules/pdf/builder/pdf-editor.d.ts +230 -0
- package/dist/types/modules/pdf/builder/resource-merger.d.ts +41 -0
- package/dist/types/modules/pdf/core/digital-signature.d.ts +109 -0
- package/dist/types/modules/pdf/core/pdf-object.d.ts +11 -0
- package/dist/types/modules/pdf/core/pdf-stream.d.ts +32 -0
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +55 -1
- package/dist/types/modules/pdf/core/pdfa.d.ts +62 -0
- package/dist/types/modules/pdf/index.d.ts +11 -0
- package/dist/types/modules/pdf/reader/bookmark-extractor.d.ts +35 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +17 -0
- package/dist/types/modules/pdf/reader/table-extractor.d.ts +69 -0
- package/dist/types/modules/pdf/render/layout-engine.d.ts +21 -1
- package/dist/types/modules/pdf/render/page-renderer.d.ts +2 -9
- package/dist/types/modules/pdf/render/style-converter.d.ts +4 -0
- package/dist/types/modules/pdf/types.d.ts +14 -1
- package/dist/types/utils/crypto.browser.d.ts +64 -0
- package/dist/types/utils/crypto.d.ts +97 -0
- package/package.json +110 -111
- package/dist/browser/modules/pdf/core/crypto.d.ts +0 -65
- package/dist/types/modules/pdf/core/crypto.d.ts +0 -65
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF bookmark (outline) extractor.
|
|
3
|
+
*
|
|
4
|
+
* Extracts the document outline tree from a PDF's `/Outlines` dictionary.
|
|
5
|
+
* Each outline item has a title, a target page index, and optional children
|
|
6
|
+
* forming a hierarchical bookmark tree.
|
|
7
|
+
*
|
|
8
|
+
* Supports:
|
|
9
|
+
* - Direct destinations (`/Dest` as array or named destination)
|
|
10
|
+
* - Action-based destinations (`/A << /S /GoTo /D ... >>`)
|
|
11
|
+
* - Nested bookmarks (children via `/First`/`/Last` chains)
|
|
12
|
+
* - Circular reference protection
|
|
13
|
+
*
|
|
14
|
+
* @see PDF Reference 1.7, §12.3 - Document-Level Navigation
|
|
15
|
+
*/
|
|
16
|
+
import { isPdfArray, isPdfRef, dictGetName, decodePdfStringBytes } from "./pdf-parser.js";
|
|
17
|
+
import { getDictStringValue } from "./reader-utils.js";
|
|
18
|
+
// =============================================================================
|
|
19
|
+
// Constants
|
|
20
|
+
// =============================================================================
|
|
21
|
+
/** Maximum depth for recursive outline traversal to prevent stack overflow. */
|
|
22
|
+
const MAX_OUTLINE_DEPTH = 100;
|
|
23
|
+
/** Maximum number of siblings at any level to prevent infinite /Next chains. */
|
|
24
|
+
const MAX_SIBLINGS = 10000;
|
|
25
|
+
// =============================================================================
|
|
26
|
+
// Public API
|
|
27
|
+
// =============================================================================
|
|
28
|
+
/**
 * Read the document outline (bookmarks) of a PDF.
 *
 * Fetches `/Outlines` from the catalog, dereferences it to a dictionary and
 * walks the `/First` → `/Next` chains beneath it. A missing or unresolvable
 * `/Outlines` entry, or any exception raised during traversal, results in an
 * empty array rather than an error.
 *
 * @param doc - The PDF document
 * @returns Top-level bookmarks, each carrying its nested children
 */
export function extractBookmarks(doc) {
    let result = [];
    try {
        const outlinesRef = doc.getCatalog().get("Outlines");
        const outlineRoot = outlinesRef ? doc.derefDict(outlinesRef) : null;
        if (outlineRoot) {
            // Page object number → page index, used to resolve destinations.
            const pageIndexByObjNum = buildPageMap(doc);
            // Shared across the whole traversal so a reference is only followed once.
            const seenObjNums = new Set();
            result = collectSiblings(outlineRoot, doc, pageIndexByObjNum, seenObjNums, 0);
        }
    }
    catch {
        // Best-effort: a broken outline never fails the caller.
        result = [];
    }
    return result;
}
|
|
58
|
+
// =============================================================================
|
|
59
|
+
// Page Map
|
|
60
|
+
// =============================================================================
|
|
61
|
+
/**
 * Create a lookup from page object number to 0-based page index.
 *
 * The index is the position of the page in the list returned by
 * `doc.getPagesWithObjInfo()`. Entries whose `objNum` is 0 are left out
 * of the map.
 *
 * @param doc - The PDF document
 * @returns Map keyed by object number, valued by page index
 */
function buildPageMap(doc) {
    const pageIndexByObjNum = new Map();
    for (const [index, { objNum }] of doc.getPagesWithObjInfo().entries()) {
        if (objNum !== 0) {
            pageIndexByObjNum.set(objNum, index);
        }
    }
    return pageIndexByObjNum;
}
|
|
78
|
+
// =============================================================================
|
|
79
|
+
// Outline Tree Traversal
|
|
80
|
+
// =============================================================================
|
|
81
|
+
/**
 * Gather the outline items hanging off a parent node.
 *
 * Starts at the parent's `/First` entry and follows each item's `/Next`.
 * The walk ends when:
 * - `depth` exceeds MAX_OUTLINE_DEPTH (nothing is collected),
 * - MAX_SIBLINGS items have been examined,
 * - an indirect reference whose object number is already in `visited` is met,
 * - or an item cannot be dereferenced to a dictionary.
 *
 * Items for which `parseOutlineItem` returns null are skipped, but the chain
 * continues past them.
 *
 * @param parentDict - Outline root or outline item dictionary
 * @param doc - The PDF document
 * @param pageMap - Page object number → page index
 * @param visited - Object numbers already traversed (mutated)
 * @param depth - Current nesting depth
 * @returns Bookmarks found at this level, in chain order
 */
function collectSiblings(parentDict, doc, pageMap, visited, depth) {
    if (depth > MAX_OUTLINE_DEPTH) {
        return [];
    }
    const head = parentDict.get("First");
    if (!head) {
        return [];
    }
    const siblings = [];
    let node = head;
    for (let examined = 0; node != null && examined < MAX_SIBLINGS; examined++) {
        // Only indirect references carry an object number to track.
        if (isPdfRef(node)) {
            const id = node.objNum;
            if (visited.has(id)) {
                break;
            }
            visited.add(id);
        }
        const nodeDict = doc.derefDict(node);
        if (!nodeDict) {
            break;
        }
        const parsed = parseOutlineItem(nodeDict, doc, pageMap, visited, depth);
        if (parsed) {
            siblings.push(parsed);
        }
        node = nodeDict.get("Next");
    }
    return siblings;
}
|
|
117
|
+
/**
 * Convert one outline item dictionary into a bookmark object.
 *
 * An item without a (non-empty) title produces null; in that case neither
 * its destination nor its children are looked at.
 *
 * @param dict - Outline item dictionary
 * @param doc - The PDF document
 * @param pageMap - Page object number → page index
 * @param visited - Object numbers already traversed (mutated by child traversal)
 * @param depth - Nesting depth of this item
 * @returns `{ title, pageIndex, children }`, or null when there is no title
 */
function parseOutlineItem(dict, doc, pageMap, visited, depth) {
    const title = getOutlineTitle(dict, doc);
    if (title) {
        return {
            title,
            // Destination is resolved before descending into children.
            pageIndex: resolveDestination(dict, doc, pageMap),
            children: collectSiblings(dict, doc, pageMap, visited, depth + 1)
        };
    }
    return null;
}
|
|
132
|
+
// =============================================================================
|
|
133
|
+
// Title Extraction
|
|
134
|
+
// =============================================================================
|
|
135
|
+
/**
 * Read an outline item's `/Title`.
 *
 * Thin wrapper that delegates to `getDictStringValue` for the "Title" key.
 *
 * @param dict - Outline item dictionary
 * @param doc - The PDF document
 * @returns Whatever `getDictStringValue` yields for `/Title`
 */
function getOutlineTitle(dict, doc) {
    const title = getDictStringValue(dict, "Title", doc);
    return title;
}
|
|
142
|
+
// =============================================================================
|
|
143
|
+
// Destination Resolution
|
|
144
|
+
// =============================================================================
|
|
145
|
+
/**
 * Work out which page an outline item points at.
 *
 * `/Dest` is tried first; if it is absent or does not resolve to a
 * non-negative index, the `/A` action dictionary is consulted, and only a
 * `/S /GoTo` action with a `/D` entry is honoured.
 *
 * @param dict - Outline item dictionary
 * @param doc - The PDF document
 * @param pageMap - Page object number → page index
 * @returns Page index, or -1 when no destination could be resolved
 */
function resolveDestination(dict, doc, pageMap) {
    // Direct destination.
    const dest = dict.get("Dest");
    if (dest != null) {
        const direct = resolveDestValue(dest, doc, pageMap);
        if (direct >= 0) {
            return direct;
        }
    }
    // Action-based destination.
    const action = dict.get("A");
    if (action == null) {
        return -1;
    }
    const actionDict = doc.derefDict(action);
    if (!actionDict) {
        return -1;
    }
    if (dictGetName(actionDict, "S") !== "GoTo") {
        return -1;
    }
    const target = actionDict.get("D");
    return target != null ? resolveDestValue(target, doc, pageMap) : -1;
}
|
|
176
|
+
/**
 * Turn a destination value (taken from `/Dest` or an action's `/D`) into a
 * page index.
 *
 * Handled forms, checked in this order after dereferencing:
 * - a non-empty array — its first element is treated as the page
 * - a string — treated as a named destination
 * - a `Uint8Array` — decoded, then treated as a named destination
 *
 * @param destObj - Raw destination value (may be an indirect reference)
 * @param doc - The PDF document
 * @param pageMap - Page object number → page index
 * @returns Page index, or -1 for null or unsupported values
 */
function resolveDestValue(destObj, doc, pageMap) {
    const target = doc.deref(destObj);
    if (target == null) {
        return -1;
    }
    if (isPdfArray(target) && target.length >= 1) {
        const [pageEntry] = [target[0]];
        return resolvePageRef(pageEntry, doc, pageMap);
    }
    if (typeof target === "string") {
        return resolveNamedDest(target, doc, pageMap);
    }
    if (target instanceof Uint8Array) {
        return resolveNamedDest(decodePdfStringBytes(target), doc, pageMap);
    }
    return -1;
}
|
|
203
|
+
/**
 * Resolve the first element of a destination array to a page index.
 *
 * - An indirect reference is looked up in `pageMap` by object number.
 * - A non-negative integer is taken as a 0-based page index as-is.
 *
 * Negative integers are rejected so that -1 remains the only value that
 * signals "unresolved" to callers; previously a value such as -5 would have
 * been returned as if it were a page index.
 *
 * @param pageObj - First element of a destination array
 * @param doc - The PDF document (unused here; kept for a uniform signature)
 * @param pageMap - Page object number → page index
 * @returns Page index, or -1 when the page cannot be determined
 */
function resolvePageRef(pageObj, doc, pageMap) {
    if (isPdfRef(pageObj)) {
        const idx = pageMap.get(pageObj.objNum);
        return idx !== undefined ? idx : -1;
    }
    if (typeof pageObj === "number" && Number.isInteger(pageObj) && pageObj >= 0) {
        return pageObj;
    }
    return -1;
}
|
|
218
|
+
/**
 * Resolve a named destination to a page index.
 *
 * Two catalog locations are searched, in order:
 * 1. the `/Dests` dictionary, keyed directly by `name`
 * 2. the `/Names` dictionary's `/Dests` name tree
 *
 * The first location that yields a non-null entry decides the result.
 *
 * @param name - Destination name
 * @param doc - The PDF document
 * @param pageMap - Page object number → page index
 * @returns Page index, or -1 when the name is not found
 */
function resolveNamedDest(name, doc, pageMap) {
    const catalog = doc.getCatalog();
    // 1. Catalog-level /Dests dictionary.
    const legacyRef = catalog.get("Dests");
    const legacyDict = legacyRef != null ? doc.derefDict(legacyRef) : null;
    if (legacyDict) {
        const hit = legacyDict.get(name);
        if (hit != null) {
            return resolveDestEntry(hit, doc, pageMap);
        }
    }
    // 2. /Names → /Dests name tree.
    const namesRef = catalog.get("Names");
    const namesDict = namesRef != null ? doc.derefDict(namesRef) : null;
    const treeRoot = namesDict ? namesDict.get("Dests") : null;
    if (treeRoot != null) {
        const hit = lookupNameTree(treeRoot, name, doc);
        if (hit != null) {
            return resolveDestEntry(hit, doc, pageMap);
        }
    }
    return -1;
}
|
|
251
|
+
/**
 * Resolve the value stored under a named destination to a page index.
 *
 * The value is either a destination array, or a dictionary whose `/D` entry
 * is such an array. Only an array is accepted for `/D`: feeding a name or
 * string `/D` back into named-destination lookup would let a destination
 * that (directly or indirectly) names itself recurse without bound.
 *
 * @param entry - Value found in `/Dests` or the `/Names` destination tree
 * @param doc - The PDF document
 * @param pageMap - Page object number → page index
 * @returns Page index, or -1 when the entry does not describe a destination
 */
function resolveDestEntry(entry, doc, pageMap) {
    const resolved = doc.deref(entry);
    if (resolved == null) {
        return -1;
    }
    // Either the entry itself is the destination array, or it is a
    // dictionary that wraps the array in /D.
    let destArray = resolved;
    if (!isPdfArray(resolved) && resolved instanceof Map) {
        const d = resolved.get("D");
        destArray = d != null ? doc.deref(d) : null;
    }
    if (destArray != null && isPdfArray(destArray) && destArray.length >= 1) {
        return resolvePageRef(destArray[0], doc, pageMap);
    }
    return -1;
}
|
|
273
|
+
/**
 * Look up a key in a PDF name tree.
 *
 * A node may carry a `/Names` array of alternating
 * `[key, value, key, value, ...]` entries, a `/Kids` array of child nodes,
 * or both; `/Names` is searched before `/Kids`.
 *
 * Recursion is bounded by MAX_OUTLINE_DEPTH, and nodes reached through an
 * indirect reference are searched at most once per lookup. Without the
 * latter, a malformed tree whose `/Kids` point back at an ancestor would be
 * re-searched along every path up to the depth limit.
 *
 * @param treeObj - Name tree node (dictionary or reference to one)
 * @param name - Key to find
 * @param doc - The PDF document
 * @param depth - Current recursion depth (internal)
 * @param visited - Object numbers of nodes already searched (internal)
 * @returns The raw value paired with `name`, or null when not found
 *
 * @see PDF Reference 1.7, §7.9.6 - Name Trees
 */
function lookupNameTree(treeObj, name, doc, depth = 0, visited = new Set()) {
    if (depth > MAX_OUTLINE_DEPTH) {
        return null;
    }
    // A node that was already searched for this name cannot produce a new hit.
    if (isPdfRef(treeObj)) {
        if (visited.has(treeObj.objNum)) {
            return null;
        }
        visited.add(treeObj.objNum);
    }
    const treeDict = doc.derefDict(treeObj);
    if (!treeDict) {
        return null;
    }
    // Leaf entries: alternating key/value pairs.
    const namesArr = treeDict.get("Names");
    if (namesArr != null) {
        const pairs = doc.deref(namesArr);
        if (isPdfArray(pairs)) {
            for (let i = 0; i + 1 < pairs.length; i += 2) {
                const key = doc.deref(pairs[i]);
                let keyStr = null;
                if (typeof key === "string") {
                    keyStr = key;
                }
                else if (key instanceof Uint8Array) {
                    keyStr = decodePdfStringBytes(key);
                }
                if (keyStr === name) {
                    return pairs[i + 1];
                }
            }
        }
    }
    // Intermediate entries: recurse into each child node.
    const kidsArr = treeDict.get("Kids");
    if (kidsArr != null) {
        const kids = doc.deref(kidsArr);
        if (isPdfArray(kids)) {
            for (const kid of kids) {
                const result = lookupNameTree(kid, name, doc, depth + 1, visited);
                if (result != null) {
                    return result;
                }
            }
        }
    }
    return null;
}
|
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
* @see PDF Reference 1.7, §3.5 - Encryption
|
|
11
11
|
* @see PDF 2.0 (ISO 32000-2), §7.6 - Encryption
|
|
12
12
|
*/
|
|
13
|
-
import { rc4, md5, sha256, aesCbcDecrypt, aesCbcDecryptRaw
|
|
13
|
+
import { rc4, md5, sha256, aesCbcDecrypt, aesCbcDecryptRaw } from "../../../utils/crypto.browser.js";
|
|
14
|
+
import { concatUint8Arrays } from "../../../utils/binary.js";
|
|
14
15
|
import { dictGetNumber, dictGetName, dictGetBytes, dictGetArray, dictGetBool } from "./pdf-parser.js";
|
|
15
16
|
import { PdfStructureError } from "../errors.js";
|
|
16
17
|
// =============================================================================
|
|
@@ -164,13 +165,13 @@ function tryUserPasswordV5(passwordBytes, uValue, ueValue) {
|
|
|
164
165
|
const uValidationSalt = uValue.subarray(32, 40);
|
|
165
166
|
const uKeySalt = uValue.subarray(40, 48);
|
|
166
167
|
// Validate: SHA-256(password + validation salt) == first 32 bytes of U
|
|
167
|
-
const validateInput =
|
|
168
|
+
const validateInput = concatUint8Arrays([passwordBytes, uValidationSalt]);
|
|
168
169
|
const computedHash = sha256(validateInput);
|
|
169
170
|
if (!arraysEqual(computedHash, uHash)) {
|
|
170
171
|
return null;
|
|
171
172
|
}
|
|
172
173
|
// Derive key: SHA-256(password + key salt) => use as AES-256 key to decrypt UE
|
|
173
|
-
const keyInput =
|
|
174
|
+
const keyInput = concatUint8Arrays([passwordBytes, uKeySalt]);
|
|
174
175
|
const keyHash = sha256(keyInput);
|
|
175
176
|
// Decrypt UE with this key using AES-256-CBC with zero IV
|
|
176
177
|
const zeroIv = new Uint8Array(16);
|
|
@@ -188,13 +189,13 @@ function tryOwnerPasswordV5(passwordBytes, oValue, oeValue, uValue) {
|
|
|
188
189
|
const oKeySalt = oValue.subarray(40, 48);
|
|
189
190
|
const u48 = uValue.subarray(0, 48);
|
|
190
191
|
// Validate: SHA-256(password + validation salt + U(0..47)) == first 32 bytes of O
|
|
191
|
-
const validateInput =
|
|
192
|
+
const validateInput = concatUint8Arrays([passwordBytes, oValidationSalt, u48]);
|
|
192
193
|
const computedHash = sha256(validateInput);
|
|
193
194
|
if (!arraysEqual(computedHash, oHash)) {
|
|
194
195
|
return null;
|
|
195
196
|
}
|
|
196
197
|
// Derive key: SHA-256(password + key salt + U(0..47))
|
|
197
|
-
const keyInput =
|
|
198
|
+
const keyInput = concatUint8Arrays([passwordBytes, oKeySalt, u48]);
|
|
198
199
|
const keyHash = sha256(keyInput);
|
|
199
200
|
// Decrypt OE with this key using AES-256-CBC with zero IV
|
|
200
201
|
const zeroIv = new Uint8Array(16);
|
|
@@ -52,7 +52,9 @@ import type { TextFragment } from "./content-interpreter.js";
|
|
|
52
52
|
import type { ExtractedImage } from "./image-extractor.js";
|
|
53
53
|
import type { PdfAnnotation } from "./annotation-extractor.js";
|
|
54
54
|
import type { PdfFormField } from "./form-extractor.js";
|
|
55
|
+
import type { PdfBookmark } from "./bookmark-extractor.js";
|
|
55
56
|
import type { PdfMetadata } from "./metadata-reader.js";
|
|
57
|
+
import type { PdfTable } from "./table-extractor.js";
|
|
56
58
|
/**
|
|
57
59
|
* Options for reading a PDF.
|
|
58
60
|
*/
|
|
@@ -94,6 +96,17 @@ export interface ReadPdfOptions {
|
|
|
94
96
|
* @default true
|
|
95
97
|
*/
|
|
96
98
|
extractFormFields?: boolean;
|
|
99
|
+
/**
|
|
100
|
+
* Whether to extract bookmarks (document outline / table of contents).
|
|
101
|
+
* @default true
|
|
102
|
+
*/
|
|
103
|
+
extractBookmarks?: boolean;
|
|
104
|
+
/**
|
|
105
|
+
* Whether to extract tables from pages using text positioning heuristics.
|
|
106
|
+
* Opt-in since table detection is heavier than plain text extraction.
|
|
107
|
+
* @default false
|
|
108
|
+
*/
|
|
109
|
+
extractTables?: boolean;
|
|
97
110
|
}
|
|
98
111
|
/**
|
|
99
112
|
* A single page from a read PDF.
|
|
@@ -111,6 +124,8 @@ export interface ReadPdfPage {
|
|
|
111
124
|
images: ExtractedImage[];
|
|
112
125
|
/** Extracted annotations (links, comments, highlights, etc.) */
|
|
113
126
|
annotations: PdfAnnotation[];
|
|
127
|
+
/** Tables detected from text fragment positioning (opt-in via extractTables) */
|
|
128
|
+
tables: PdfTable[];
|
|
114
129
|
/** Page width in points */
|
|
115
130
|
width: number;
|
|
116
131
|
/** Page height in points */
|
|
@@ -130,6 +145,8 @@ export interface ReadPdfResult {
|
|
|
130
145
|
metadata: PdfMetadata;
|
|
131
146
|
/** Form fields extracted from AcroForm (document-level) */
|
|
132
147
|
formFields: PdfFormField[];
|
|
148
|
+
/** Bookmarks (document outline) extracted from the outline tree */
|
|
149
|
+
bookmarks: PdfBookmark[];
|
|
133
150
|
}
|
|
134
151
|
/**
|
|
135
152
|
* Read a PDF file and extract text, images, and metadata.
|
|
@@ -54,7 +54,9 @@ import { reconstructText, reconstructTextLines } from "./text-reconstruction.js"
|
|
|
54
54
|
import { extractImagesFromPage } from "./image-extractor.js";
|
|
55
55
|
import { extractAnnotationsFromPage } from "./annotation-extractor.js";
|
|
56
56
|
import { extractFormFields } from "./form-extractor.js";
|
|
57
|
+
import { extractBookmarks } from "./bookmark-extractor.js";
|
|
57
58
|
import { extractMetadata } from "./metadata-reader.js";
|
|
59
|
+
import { extractTables } from "./table-extractor.js";
|
|
58
60
|
import { PdfStructureError } from "../errors.js";
|
|
59
61
|
import { yieldToEventLoop } from "../../../utils/utils.base.js";
|
|
60
62
|
// =============================================================================
|
|
@@ -93,7 +95,9 @@ function prepareRead(data, options) {
|
|
|
93
95
|
extractImages: options?.extractImages ?? true,
|
|
94
96
|
extractMetadata: options?.extractMetadata ?? true,
|
|
95
97
|
extractAnnotations: options?.extractAnnotations ?? true,
|
|
96
|
-
extractFormFields: options?.extractFormFields ?? true
|
|
98
|
+
extractFormFields: options?.extractFormFields ?? true,
|
|
99
|
+
extractBookmarks: options?.extractBookmarks ?? true,
|
|
100
|
+
extractTables: options?.extractTables ?? false
|
|
97
101
|
};
|
|
98
102
|
const doc = new PdfDocument(data);
|
|
99
103
|
if (isEncrypted(doc)) {
|
|
@@ -150,6 +154,16 @@ function processPage(pageDict, pageIdx, doc, opts) {
|
|
|
150
154
|
}
|
|
151
155
|
}
|
|
152
156
|
const { width, height } = getPageDimensions(pageDict, doc);
|
|
157
|
+
let tables = [];
|
|
158
|
+
if (opts.extractTables) {
|
|
159
|
+
try {
|
|
160
|
+
tables = extractTables(textFragments, width, height);
|
|
161
|
+
}
|
|
162
|
+
catch (err) {
|
|
163
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
164
|
+
warnings.push(`Table extraction failed on page ${pageNumber}: ${msg}`);
|
|
165
|
+
}
|
|
166
|
+
}
|
|
153
167
|
return {
|
|
154
168
|
pageNumber,
|
|
155
169
|
text,
|
|
@@ -157,6 +171,7 @@ function processPage(pageDict, pageIdx, doc, opts) {
|
|
|
157
171
|
textFragments,
|
|
158
172
|
images,
|
|
159
173
|
annotations,
|
|
174
|
+
tables,
|
|
160
175
|
width,
|
|
161
176
|
height,
|
|
162
177
|
warnings
|
|
@@ -179,7 +194,16 @@ function finalizeRead(pages, totalPageCount, metadata, opts, doc) {
|
|
|
179
194
|
// Non-fatal — just return empty
|
|
180
195
|
}
|
|
181
196
|
}
|
|
182
|
-
|
|
197
|
+
let bookmarks = [];
|
|
198
|
+
if (opts.extractBookmarks) {
|
|
199
|
+
try {
|
|
200
|
+
bookmarks = extractBookmarks(doc);
|
|
201
|
+
}
|
|
202
|
+
catch {
|
|
203
|
+
// Non-fatal — just return empty
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
return { text: allText, pages, metadata, formFields, bookmarks };
|
|
183
207
|
}
|
|
184
208
|
// =============================================================================
|
|
185
209
|
// Helpers
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Table extraction from PDF pages using text fragment positioning.
|
|
3
|
+
*
|
|
4
|
+
* Detects tabular structures by analyzing the spatial layout of text fragments.
|
|
5
|
+
* Since PDF content streams typically render tables as positioned text (with or
|
|
6
|
+
* without drawn grid lines), this module uses a text-only heuristic:
|
|
7
|
+
*
|
|
8
|
+
* 1. Group fragments into lines by Y proximity
|
|
9
|
+
* 2. Detect column boundaries from consistent X-position clusters
|
|
10
|
+
* 3. Identify contiguous blocks of multi-column lines as tables
|
|
11
|
+
* 4. Map fragments to cells based on column/line membership
|
|
12
|
+
*
|
|
13
|
+
* @see content-interpreter.ts for TextFragment extraction
|
|
14
|
+
* @see text-reconstruction.ts for line grouping logic
|
|
15
|
+
*/
|
|
16
|
+
import type { TextFragment } from "./content-interpreter.js";
|
|
17
|
+
/**
 * One cell of a table detected on a PDF page.
 */
export interface PdfTableCell {
    /** The cell's text content. */
    text: string;
    /** Horizontal position, in page coordinates (points). */
    x: number;
    /** Vertical position, in page coordinates (points). */
    y: number;
    /** Cell width, in points. */
    width: number;
    /** Cell height, in points. */
    height: number;
    /** How many rows the cell spans (default 1). */
    rowSpan?: number;
    /** How many columns the cell spans (default 1). */
    colSpan?: number;
}
|
|
36
|
+
/**
 * One row of a table detected on a PDF page.
 */
export interface PdfTableRow {
    /** The row's cells, in left-to-right order. */
    cells: PdfTableCell[];
}
|
|
43
|
+
/**
 * A table detected on a PDF page.
 */
export interface PdfTable {
    /** The table's rows, in top-to-bottom order. */
    rows: PdfTableRow[];
    /** Left edge of the table, in page coordinates. */
    x: number;
    /** Top edge of the table, in page coordinates. */
    y: number;
    /** Table width, in points. */
    width: number;
    /** Table height, in points. */
    height: number;
}
|
|
58
|
+
/**
|
|
59
|
+
* Extract tables from a page's text fragments.
|
|
60
|
+
*
|
|
61
|
+
* Uses text positioning heuristics to detect tabular structures without
|
|
62
|
+
* relying on drawn lines or grid paths.
|
|
63
|
+
*
|
|
64
|
+
* @param fragments - Text fragments from `extractTextFromPage`
|
|
65
|
+
* @param pageWidth - Page width in points
|
|
66
|
+
* @param pageHeight - Page height in points
|
|
67
|
+
* @returns Array of detected tables
|
|
68
|
+
*/
|
|
69
|
+
export declare function extractTables(fragments: TextFragment[], pageWidth: number, pageHeight: number): PdfTable[];
|