@cj-tech-master/excelts 9.1.0 → 9.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. package/README.md +16 -1
  2. package/dist/browser/modules/archive/compression/crc32.js +1 -1
  3. package/dist/browser/modules/archive/crypto/aes.d.ts +0 -8
  4. package/dist/browser/modules/archive/crypto/aes.js +1 -20
  5. package/dist/browser/modules/archive/crypto/index.d.ts +2 -1
  6. package/dist/browser/modules/archive/crypto/index.js +3 -1
  7. package/dist/browser/modules/csv/parse/row-processor.d.ts +1 -1
  8. package/dist/browser/modules/csv/worker/worker-script.generated.js +1 -1
  9. package/dist/browser/modules/excel/utils/cell-matrix.js +1 -0
  10. package/dist/browser/modules/excel/utils/encryptor.browser.d.ts +4 -5
  11. package/dist/browser/modules/excel/utils/encryptor.browser.js +7 -12
  12. package/dist/browser/modules/excel/utils/encryptor.d.ts +1 -1
  13. package/dist/browser/modules/excel/utils/encryptor.js +4 -7
  14. package/dist/browser/modules/pdf/builder/document-builder.d.ts +517 -0
  15. package/dist/browser/modules/pdf/builder/document-builder.js +1493 -0
  16. package/dist/browser/modules/pdf/builder/form-appearance.d.ts +56 -0
  17. package/dist/browser/modules/pdf/builder/form-appearance.js +140 -0
  18. package/dist/browser/modules/pdf/builder/image-utils.d.ts +39 -0
  19. package/dist/browser/modules/pdf/builder/image-utils.js +129 -0
  20. package/dist/browser/modules/pdf/builder/pdf-editor.d.ts +230 -0
  21. package/dist/browser/modules/pdf/builder/pdf-editor.js +1574 -0
  22. package/dist/browser/modules/pdf/builder/resource-merger.d.ts +41 -0
  23. package/dist/browser/modules/pdf/builder/resource-merger.js +258 -0
  24. package/dist/browser/modules/pdf/core/digital-signature.d.ts +109 -0
  25. package/dist/browser/modules/pdf/core/digital-signature.js +659 -0
  26. package/dist/browser/modules/pdf/core/encryption.js +8 -7
  27. package/dist/browser/modules/pdf/core/pdf-object.d.ts +11 -0
  28. package/dist/browser/modules/pdf/core/pdf-object.js +38 -0
  29. package/dist/browser/modules/pdf/core/pdf-stream.d.ts +32 -0
  30. package/dist/browser/modules/pdf/core/pdf-stream.js +66 -0
  31. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +55 -1
  32. package/dist/browser/modules/pdf/core/pdf-writer.js +271 -6
  33. package/dist/browser/modules/pdf/core/pdfa.d.ts +62 -0
  34. package/dist/browser/modules/pdf/core/pdfa.js +261 -0
  35. package/dist/browser/modules/pdf/index.d.ts +11 -0
  36. package/dist/browser/modules/pdf/index.js +9 -0
  37. package/dist/browser/modules/pdf/reader/bookmark-extractor.d.ts +35 -0
  38. package/dist/browser/modules/pdf/reader/bookmark-extractor.js +324 -0
  39. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +6 -5
  40. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +17 -0
  41. package/dist/browser/modules/pdf/reader/pdf-reader.js +26 -2
  42. package/dist/browser/modules/pdf/reader/table-extractor.d.ts +69 -0
  43. package/dist/browser/modules/pdf/reader/table-extractor.js +365 -0
  44. package/dist/browser/modules/pdf/render/layout-engine.d.ts +21 -1
  45. package/dist/browser/modules/pdf/render/layout-engine.js +112 -5
  46. package/dist/browser/modules/pdf/render/page-renderer.d.ts +2 -9
  47. package/dist/browser/modules/pdf/render/page-renderer.js +62 -103
  48. package/dist/browser/modules/pdf/render/pdf-exporter.js +2 -61
  49. package/dist/browser/modules/pdf/render/style-converter.d.ts +4 -0
  50. package/dist/browser/modules/pdf/render/style-converter.js +1 -1
  51. package/dist/browser/modules/pdf/types.d.ts +14 -1
  52. package/dist/browser/modules/stream/browser/readable.js +8 -2
  53. package/dist/browser/utils/crypto.browser.d.ts +64 -0
  54. package/dist/browser/{modules/pdf/core/crypto.js → utils/crypto.browser.js} +91 -101
  55. package/dist/browser/utils/crypto.d.ts +97 -0
  56. package/dist/browser/utils/crypto.js +209 -0
  57. package/dist/cjs/modules/archive/compression/crc32.js +1 -1
  58. package/dist/cjs/modules/archive/crypto/aes.js +2 -23
  59. package/dist/cjs/modules/archive/crypto/index.js +3 -1
  60. package/dist/cjs/modules/csv/worker/worker-script.generated.js +1 -1
  61. package/dist/cjs/modules/excel/utils/cell-matrix.js +1 -0
  62. package/dist/cjs/modules/excel/utils/encryptor.browser.js +7 -12
  63. package/dist/cjs/modules/excel/utils/encryptor.js +4 -10
  64. package/dist/cjs/modules/pdf/builder/document-builder.js +1532 -0
  65. package/dist/cjs/modules/pdf/builder/form-appearance.js +145 -0
  66. package/dist/cjs/modules/pdf/builder/image-utils.js +135 -0
  67. package/dist/cjs/modules/pdf/builder/pdf-editor.js +1612 -0
  68. package/dist/cjs/modules/pdf/builder/resource-merger.js +263 -0
  69. package/dist/cjs/modules/pdf/core/digital-signature.js +667 -0
  70. package/dist/cjs/modules/pdf/core/encryption.js +8 -7
  71. package/dist/cjs/modules/pdf/core/pdf-object.js +38 -0
  72. package/dist/cjs/modules/pdf/core/pdf-stream.js +66 -0
  73. package/dist/cjs/modules/pdf/core/pdf-writer.js +272 -6
  74. package/dist/cjs/modules/pdf/core/pdfa.js +266 -0
  75. package/dist/cjs/modules/pdf/index.js +19 -1
  76. package/dist/cjs/modules/pdf/reader/bookmark-extractor.js +327 -0
  77. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +6 -5
  78. package/dist/cjs/modules/pdf/reader/pdf-reader.js +26 -2
  79. package/dist/cjs/modules/pdf/reader/table-extractor.js +368 -0
  80. package/dist/cjs/modules/pdf/render/layout-engine.js +113 -4
  81. package/dist/cjs/modules/pdf/render/page-renderer.js +63 -105
  82. package/dist/cjs/modules/pdf/render/pdf-exporter.js +3 -62
  83. package/dist/cjs/modules/pdf/render/style-converter.js +1 -0
  84. package/dist/cjs/modules/stream/browser/readable.js +8 -2
  85. package/dist/cjs/{modules/pdf/core/crypto.js → utils/crypto.browser.js} +95 -102
  86. package/dist/cjs/utils/crypto.js +228 -0
  87. package/dist/esm/modules/archive/compression/crc32.js +1 -1
  88. package/dist/esm/modules/archive/crypto/aes.js +1 -20
  89. package/dist/esm/modules/archive/crypto/index.js +3 -1
  90. package/dist/esm/modules/csv/worker/worker-script.generated.js +1 -1
  91. package/dist/esm/modules/excel/utils/cell-matrix.js +1 -0
  92. package/dist/esm/modules/excel/utils/encryptor.browser.js +7 -12
  93. package/dist/esm/modules/excel/utils/encryptor.js +4 -7
  94. package/dist/esm/modules/pdf/builder/document-builder.js +1493 -0
  95. package/dist/esm/modules/pdf/builder/form-appearance.js +140 -0
  96. package/dist/esm/modules/pdf/builder/image-utils.js +129 -0
  97. package/dist/esm/modules/pdf/builder/pdf-editor.js +1574 -0
  98. package/dist/esm/modules/pdf/builder/resource-merger.js +258 -0
  99. package/dist/esm/modules/pdf/core/digital-signature.js +659 -0
  100. package/dist/esm/modules/pdf/core/encryption.js +8 -7
  101. package/dist/esm/modules/pdf/core/pdf-object.js +38 -0
  102. package/dist/esm/modules/pdf/core/pdf-stream.js +66 -0
  103. package/dist/esm/modules/pdf/core/pdf-writer.js +271 -6
  104. package/dist/esm/modules/pdf/core/pdfa.js +261 -0
  105. package/dist/esm/modules/pdf/index.js +9 -0
  106. package/dist/esm/modules/pdf/reader/bookmark-extractor.js +324 -0
  107. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +6 -5
  108. package/dist/esm/modules/pdf/reader/pdf-reader.js +26 -2
  109. package/dist/esm/modules/pdf/reader/table-extractor.js +365 -0
  110. package/dist/esm/modules/pdf/render/layout-engine.js +112 -5
  111. package/dist/esm/modules/pdf/render/page-renderer.js +62 -103
  112. package/dist/esm/modules/pdf/render/pdf-exporter.js +2 -61
  113. package/dist/esm/modules/pdf/render/style-converter.js +1 -1
  114. package/dist/esm/modules/stream/browser/readable.js +8 -2
  115. package/dist/esm/{modules/pdf/core/crypto.js → utils/crypto.browser.js} +91 -101
  116. package/dist/esm/utils/crypto.js +209 -0
  117. package/dist/iife/excelts.iife.js +1248 -1074
  118. package/dist/iife/excelts.iife.js.map +1 -1
  119. package/dist/iife/excelts.iife.min.js +53 -54
  120. package/dist/types/modules/archive/crypto/aes.d.ts +0 -8
  121. package/dist/types/modules/archive/crypto/index.d.ts +2 -1
  122. package/dist/types/modules/csv/parse/row-processor.d.ts +1 -1
  123. package/dist/types/modules/excel/utils/encryptor.browser.d.ts +4 -5
  124. package/dist/types/modules/excel/utils/encryptor.d.ts +1 -1
  125. package/dist/types/modules/pdf/builder/document-builder.d.ts +517 -0
  126. package/dist/types/modules/pdf/builder/form-appearance.d.ts +56 -0
  127. package/dist/types/modules/pdf/builder/image-utils.d.ts +39 -0
  128. package/dist/types/modules/pdf/builder/pdf-editor.d.ts +230 -0
  129. package/dist/types/modules/pdf/builder/resource-merger.d.ts +41 -0
  130. package/dist/types/modules/pdf/core/digital-signature.d.ts +109 -0
  131. package/dist/types/modules/pdf/core/pdf-object.d.ts +11 -0
  132. package/dist/types/modules/pdf/core/pdf-stream.d.ts +32 -0
  133. package/dist/types/modules/pdf/core/pdf-writer.d.ts +55 -1
  134. package/dist/types/modules/pdf/core/pdfa.d.ts +62 -0
  135. package/dist/types/modules/pdf/index.d.ts +11 -0
  136. package/dist/types/modules/pdf/reader/bookmark-extractor.d.ts +35 -0
  137. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +17 -0
  138. package/dist/types/modules/pdf/reader/table-extractor.d.ts +69 -0
  139. package/dist/types/modules/pdf/render/layout-engine.d.ts +21 -1
  140. package/dist/types/modules/pdf/render/page-renderer.d.ts +2 -9
  141. package/dist/types/modules/pdf/render/style-converter.d.ts +4 -0
  142. package/dist/types/modules/pdf/types.d.ts +14 -1
  143. package/dist/types/utils/crypto.browser.d.ts +64 -0
  144. package/dist/types/utils/crypto.d.ts +97 -0
  145. package/package.json +110 -111
  146. package/dist/browser/modules/pdf/core/crypto.d.ts +0 -65
  147. package/dist/types/modules/pdf/core/crypto.d.ts +0 -65
@@ -0,0 +1,324 @@
1
+ /**
2
+ * PDF bookmark (outline) extractor.
3
+ *
4
+ * Extracts the document outline tree from a PDF's `/Outlines` dictionary.
5
+ * Each outline item has a title, a target page index, and optional children
6
+ * forming a hierarchical bookmark tree.
7
+ *
8
+ * Supports:
9
+ * - Direct destinations (`/Dest` as array or named destination)
10
+ * - Action-based destinations (`/A << /S /GoTo /D ... >>`)
11
+ * - Nested bookmarks (children via `/First`/`/Last` chains)
12
+ * - Circular reference protection
13
+ *
14
+ * @see PDF Reference 1.7, §12.3 - Document-Level Navigation
15
+ */
16
+ import { isPdfArray, isPdfRef, dictGetName, decodePdfStringBytes } from "./pdf-parser.js";
17
+ import { getDictStringValue } from "./reader-utils.js";
18
+ // =============================================================================
19
+ // Constants
20
+ // =============================================================================
/** Deepest outline nesting level that is traversed; the name-tree lookup reuses the same cap for its recursion. */
const MAX_OUTLINE_DEPTH = 100;
/** Most items read from a single sibling chain, so a runaway /Next list still terminates. */
const MAX_SIBLINGS = 10000;
25
+ // =============================================================================
26
+ // Public API
27
+ // =============================================================================
/**
 * Read the document outline (bookmarks) of a PDF.
 *
 * Looks up `/Outlines` in the catalog and walks the tree starting at its
 * `/First` entry. This is best-effort: a missing or unresolvable outline
 * dictionary, or any exception thrown while walking, produces an empty
 * array instead of an error.
 *
 * @param doc - The PDF document
 * @returns Top-level bookmarks, each carrying its nested children
 */
export function extractBookmarks(doc) {
    try {
        const outlinesEntry = doc.getCatalog().get("Outlines");
        const outlineRoot = outlinesEntry ? doc.derefDict(outlinesEntry) : null;
        if (!outlineRoot) {
            return [];
        }
        // Page object number → page index, needed to turn destinations into page indices.
        const pageMap = buildPageMap(doc);
        // A single set of seen object numbers is shared by the whole traversal
        // so that an item reachable twice (a cycle) is only walked once.
        return collectSiblings(outlineRoot, doc, pageMap, new Set(), 0);
    }
    catch {
        return [];
    }
}
58
+ // =============================================================================
59
+ // Page Map
60
+ // =============================================================================
/**
 * Index the document's pages by object number.
 *
 * Keys are page object numbers, values are 0-based positions in the list
 * returned by `doc.getPagesWithObjInfo()`. Pages reporting object number 0
 * are left out of the map.
 *
 * @param doc - The PDF document
 * @returns Map from page object number to 0-based page index
 */
function buildPageMap(doc) {
    const indexByObjNum = new Map();
    const pageInfos = doc.getPagesWithObjInfo();
    for (let pageIndex = 0; pageIndex < pageInfos.length; pageIndex++) {
        const objNum = pageInfos[pageIndex].objNum;
        if (objNum === 0) {
            continue;
        }
        indexByObjNum.set(objNum, pageIndex);
    }
    return indexByObjNum;
}
78
+ // =============================================================================
79
+ // Outline Tree Traversal
80
+ // =============================================================================
/**
 * Gather the outline items hanging off a parent node.
 *
 * Starts at the parent's `/First` entry and keeps following `/Next`. The walk
 * stops when the chain ends, when an item cannot be resolved to a dictionary,
 * when a referenced item has already been seen (cycle guard), or after
 * MAX_SIBLINGS items. Items that fail to parse are skipped, not fatal.
 *
 * @param parentDict - Outline root or outline item whose children are wanted
 * @param doc - The PDF document
 * @param pageMap - Page object number → 0-based page index
 * @param visited - Object numbers already walked; mutated by this call
 * @param depth - Current nesting level; levels beyond MAX_OUTLINE_DEPTH yield []
 * @returns Bookmarks found at this level, in chain order
 */
function collectSiblings(parentDict, doc, pageMap, visited, depth) {
    if (depth > MAX_OUTLINE_DEPTH) {
        return [];
    }
    const items = [];
    let node = parentDict.get("First");
    if (!node) {
        return items;
    }
    for (let seen = 0; node != null && seen < MAX_SIBLINGS; seen++) {
        // Only indirect references carry an object number that can be tracked.
        if (isPdfRef(node)) {
            const { objNum } = node;
            if (visited.has(objNum)) {
                break;
            }
            visited.add(objNum);
        }
        const itemDict = doc.derefDict(node);
        if (!itemDict) {
            break;
        }
        const parsed = parseOutlineItem(itemDict, doc, pageMap, visited, depth);
        if (parsed) {
            items.push(parsed);
        }
        node = itemDict.get("Next");
    }
    return items;
}
/**
 * Turn one outline item dictionary into a bookmark object.
 *
 * An item without a usable title is rejected (returns null), which also
 * means its children are not visited.
 *
 * @returns `{ title, pageIndex, children }`, or null when the title is missing/empty
 */
function parseOutlineItem(dict, doc, pageMap, visited, depth) {
    const title = getOutlineTitle(dict, doc);
    if (!title) {
        return null;
    }
    return {
        title,
        // -1 when the destination cannot be resolved
        pageIndex: resolveDestination(dict, doc, pageMap),
        // nested bookmarks, one level deeper
        children: collectSiblings(dict, doc, pageMap, visited, depth + 1)
    };
}
132
+ // =============================================================================
133
+ // Title Extraction
134
+ // =============================================================================
/**
 * Read the `/Title` entry of an outline item.
 * Decoding is delegated entirely to `getDictStringValue`.
 */
function getOutlineTitle(dict, doc) {
    const title = getDictStringValue(dict, "Title", doc);
    return title;
}
142
+ // =============================================================================
143
+ // Destination Resolution
144
+ // =============================================================================
/**
 * Work out which page an outline item points at.
 *
 * `/Dest` is tried first. If it is absent or does not resolve, the item's
 * `/A` action is consulted, but only when its `/S` is `GoTo` and it has a `/D`.
 *
 * @returns 0-based page index, or -1 when no destination could be resolved
 */
function resolveDestination(dict, doc, pageMap) {
    // 1. Direct destination
    const directDest = dict.get("Dest");
    if (directDest != null) {
        const viaDest = resolveDestValue(directDest, doc, pageMap);
        if (viaDest >= 0) {
            return viaDest;
        }
    }
    // 2. GoTo action
    const action = dict.get("A");
    const actionDict = action != null ? doc.derefDict(action) : null;
    if (!actionDict || dictGetName(actionDict, "S") !== "GoTo") {
        return -1;
    }
    const target = actionDict.get("D");
    return target != null ? resolveDestValue(target, doc, pageMap) : -1;
}
/**
 * Convert a destination value (taken from `/Dest` or from an action's `/D`)
 * into a page index.
 *
 * Handled shapes (PDF Reference 1.7, §12.3.2):
 * - a non-empty array, whose first element identifies the page
 * - a string or byte string, treated as the name of a named destination
 *
 * @returns 0-based page index, or -1 for anything else
 */
function resolveDestValue(destObj, doc, pageMap) {
    const dest = doc.deref(destObj);
    if (dest == null) {
        return -1;
    }
    // Explicit destination: [page, /XYZ ...], [page, /Fit], ...
    if (isPdfArray(dest) && dest.length >= 1) {
        return resolvePageRef(dest[0], doc, pageMap);
    }
    // Named destination, either already a string or still raw bytes
    const isString = typeof dest === "string";
    if (!isString && !(dest instanceof Uint8Array)) {
        return -1;
    }
    const destName = isString ? dest : decodePdfStringBytes(dest);
    return resolveNamedDest(destName, doc, pageMap);
}
203
+ /**
204
+ * Resolve a page reference (from the first element of a dest array) to a page index.
205
+ */
206
+ function resolvePageRef(pageObj, doc, pageMap) {
207
+ // If it's a direct reference, use the object number
208
+ if (isPdfRef(pageObj)) {
209
+ const idx = pageMap.get(pageObj.objNum);
210
+ return idx !== undefined ? idx : -1;
211
+ }
212
+ // If it's a page number (integer), use it directly as 0-based index
213
+ if (typeof pageObj === "number" && Number.isInteger(pageObj)) {
214
+ return pageObj;
215
+ }
216
+ return -1;
217
+ }
/**
 * Resolve a named destination to a page index.
 *
 * Two places are searched, in order:
 * 1. the catalog's `/Dests` dictionary (older PDFs)
 * 2. the `/Dests` name tree under the catalog's `/Names` (PDF 1.2+)
 *
 * The first place that has an entry for `name` decides the result, even if
 * that entry itself turns out to be unresolvable.
 *
 * @returns 0-based page index, or -1 when the name is unknown or unresolvable
 */
function resolveNamedDest(name, doc, pageMap) {
    const catalog = doc.getCatalog();
    // 1. /Dests dictionary
    const legacyDests = catalog.get("Dests");
    const legacyDict = legacyDests != null ? doc.derefDict(legacyDests) : null;
    if (legacyDict) {
        const entry = legacyDict.get(name);
        if (entry != null) {
            return resolveDestEntry(entry, doc, pageMap);
        }
    }
    // 2. /Names → /Dests name tree
    const names = catalog.get("Names");
    const namesDict = names != null ? doc.derefDict(names) : null;
    const destsTree = namesDict ? namesDict.get("Dests") : null;
    if (destsTree != null) {
        const found = lookupNameTree(destsTree, name, doc);
        if (found != null) {
            return resolveDestEntry(found, doc, pageMap);
        }
    }
    return -1;
}
/**
 * Resolve the value stored for a named destination.
 *
 * The value is either a destination array (its first element identifies the
 * page) or a dictionary whose `/D` entry holds the destination.
 *
 * NOTE(review): a `/D` that is itself a name goes back through
 * `resolveDestValue` → `resolveNamedDest` → here, and nothing in this
 * function bounds that chain — confirm whether self-referencing names need a guard.
 *
 * @returns 0-based page index, or -1 when the entry has neither shape
 */
function resolveDestEntry(entry, doc, pageMap) {
    const target = doc.deref(entry);
    if (target == null) {
        return -1;
    }
    // Plain destination array
    if (isPdfArray(target) && target.length >= 1) {
        return resolvePageRef(target[0], doc, pageMap);
    }
    // Destination dictionary: << /D ... >>
    if (!(target instanceof Map)) {
        return -1;
    }
    const inner = target.get("D");
    return inner != null ? resolveDestValue(inner, doc, pageMap) : -1;
}
273
+ /**
274
+ * Look up a key in a PDF name tree.
275
+ *
276
+ * Name trees use either /Names (leaf) or /Kids (intermediate) arrays.
277
+ * /Names is an array of alternating [key, value, key, value, ...] pairs.
278
+ *
279
+ * @see PDF Reference 1.7, §7.9.6 - Name Trees
280
+ */
281
+ function lookupNameTree(treeObj, name, doc, depth = 0) {
282
+ if (depth > MAX_OUTLINE_DEPTH) {
283
+ return null;
284
+ }
285
+ const treeDict = doc.derefDict(treeObj);
286
+ if (!treeDict) {
287
+ return null;
288
+ }
289
+ // Check leaf /Names array
290
+ const namesArr = treeDict.get("Names");
291
+ if (namesArr != null) {
292
+ const resolved = doc.deref(namesArr);
293
+ if (isPdfArray(resolved)) {
294
+ // Alternating [key, value, key, value, ...]
295
+ for (let i = 0; i + 1 < resolved.length; i += 2) {
296
+ const key = doc.deref(resolved[i]);
297
+ let keyStr = null;
298
+ if (typeof key === "string") {
299
+ keyStr = key;
300
+ }
301
+ else if (key instanceof Uint8Array) {
302
+ keyStr = decodePdfStringBytes(key);
303
+ }
304
+ if (keyStr === name) {
305
+ return resolved[i + 1];
306
+ }
307
+ }
308
+ }
309
+ }
310
+ // Check intermediate /Kids array
311
+ const kidsArr = treeDict.get("Kids");
312
+ if (kidsArr != null) {
313
+ const resolved = doc.deref(kidsArr);
314
+ if (isPdfArray(resolved)) {
315
+ for (const kid of resolved) {
316
+ const result = lookupNameTree(kid, name, doc, depth + 1);
317
+ if (result != null) {
318
+ return result;
319
+ }
320
+ }
321
+ }
322
+ }
323
+ return null;
324
+ }
@@ -10,7 +10,8 @@
10
10
  * @see PDF Reference 1.7, §3.5 - Encryption
11
11
  * @see PDF 2.0 (ISO 32000-2), §7.6 - Encryption
12
12
  */
13
- import { rc4, md5, sha256, aesCbcDecrypt, aesCbcDecryptRaw, concatArrays } from "../core/crypto.js";
13
+ import { rc4, md5, sha256, aesCbcDecrypt, aesCbcDecryptRaw } from "../../../utils/crypto.browser.js";
14
+ import { concatUint8Arrays } from "../../../utils/binary.js";
14
15
  import { dictGetNumber, dictGetName, dictGetBytes, dictGetArray, dictGetBool } from "./pdf-parser.js";
15
16
  import { PdfStructureError } from "../errors.js";
16
17
  // =============================================================================
@@ -164,13 +165,13 @@ function tryUserPasswordV5(passwordBytes, uValue, ueValue) {
164
165
  const uValidationSalt = uValue.subarray(32, 40);
165
166
  const uKeySalt = uValue.subarray(40, 48);
166
167
  // Validate: SHA-256(password + validation salt) == first 32 bytes of U
167
- const validateInput = concatArrays(passwordBytes, uValidationSalt);
168
+ const validateInput = concatUint8Arrays([passwordBytes, uValidationSalt]);
168
169
  const computedHash = sha256(validateInput);
169
170
  if (!arraysEqual(computedHash, uHash)) {
170
171
  return null;
171
172
  }
172
173
  // Derive key: SHA-256(password + key salt) => use as AES-256 key to decrypt UE
173
- const keyInput = concatArrays(passwordBytes, uKeySalt);
174
+ const keyInput = concatUint8Arrays([passwordBytes, uKeySalt]);
174
175
  const keyHash = sha256(keyInput);
175
176
  // Decrypt UE with this key using AES-256-CBC with zero IV
176
177
  const zeroIv = new Uint8Array(16);
@@ -188,13 +189,13 @@ function tryOwnerPasswordV5(passwordBytes, oValue, oeValue, uValue) {
188
189
  const oKeySalt = oValue.subarray(40, 48);
189
190
  const u48 = uValue.subarray(0, 48);
190
191
  // Validate: SHA-256(password + validation salt + U(0..47)) == first 32 bytes of O
191
- const validateInput = concatArrays(passwordBytes, oValidationSalt, u48);
192
+ const validateInput = concatUint8Arrays([passwordBytes, oValidationSalt, u48]);
192
193
  const computedHash = sha256(validateInput);
193
194
  if (!arraysEqual(computedHash, oHash)) {
194
195
  return null;
195
196
  }
196
197
  // Derive key: SHA-256(password + key salt + U(0..47))
197
- const keyInput = concatArrays(passwordBytes, oKeySalt, u48);
198
+ const keyInput = concatUint8Arrays([passwordBytes, oKeySalt, u48]);
198
199
  const keyHash = sha256(keyInput);
199
200
  // Decrypt OE with this key using AES-256-CBC with zero IV
200
201
  const zeroIv = new Uint8Array(16);
@@ -52,7 +52,9 @@ import type { TextFragment } from "./content-interpreter.js";
52
52
  import type { ExtractedImage } from "./image-extractor.js";
53
53
  import type { PdfAnnotation } from "./annotation-extractor.js";
54
54
  import type { PdfFormField } from "./form-extractor.js";
55
+ import type { PdfBookmark } from "./bookmark-extractor.js";
55
56
  import type { PdfMetadata } from "./metadata-reader.js";
57
+ import type { PdfTable } from "./table-extractor.js";
56
58
  /**
57
59
  * Options for reading a PDF.
58
60
  */
@@ -94,6 +96,17 @@ export interface ReadPdfOptions {
94
96
  * @default true
95
97
  */
96
98
  extractFormFields?: boolean;
99
+ /**
100
+ * Whether to extract bookmarks (document outline / table of contents).
101
+ * @default true
102
+ */
103
+ extractBookmarks?: boolean;
104
+ /**
105
+ * Whether to extract tables from pages using text positioning heuristics.
106
+ * Opt-in since table detection is heavier than plain text extraction.
107
+ * @default false
108
+ */
109
+ extractTables?: boolean;
97
110
  }
98
111
  /**
99
112
  * A single page from a read PDF.
@@ -111,6 +124,8 @@ export interface ReadPdfPage {
111
124
  images: ExtractedImage[];
112
125
  /** Extracted annotations (links, comments, highlights, etc.) */
113
126
  annotations: PdfAnnotation[];
127
+ /** Tables detected from text fragment positioning (opt-in via extractTables) */
128
+ tables: PdfTable[];
114
129
  /** Page width in points */
115
130
  width: number;
116
131
  /** Page height in points */
@@ -130,6 +145,8 @@ export interface ReadPdfResult {
130
145
  metadata: PdfMetadata;
131
146
  /** Form fields extracted from AcroForm (document-level) */
132
147
  formFields: PdfFormField[];
148
+ /** Bookmarks (document outline) extracted from the outline tree */
149
+ bookmarks: PdfBookmark[];
133
150
  }
134
151
  /**
135
152
  * Read a PDF file and extract text, images, and metadata.
@@ -54,7 +54,9 @@ import { reconstructText, reconstructTextLines } from "./text-reconstruction.js"
54
54
  import { extractImagesFromPage } from "./image-extractor.js";
55
55
  import { extractAnnotationsFromPage } from "./annotation-extractor.js";
56
56
  import { extractFormFields } from "./form-extractor.js";
57
+ import { extractBookmarks } from "./bookmark-extractor.js";
57
58
  import { extractMetadata } from "./metadata-reader.js";
59
+ import { extractTables } from "./table-extractor.js";
58
60
  import { PdfStructureError } from "../errors.js";
59
61
  import { yieldToEventLoop } from "../../../utils/utils.base.js";
60
62
  // =============================================================================
@@ -93,7 +95,9 @@ function prepareRead(data, options) {
93
95
  extractImages: options?.extractImages ?? true,
94
96
  extractMetadata: options?.extractMetadata ?? true,
95
97
  extractAnnotations: options?.extractAnnotations ?? true,
96
- extractFormFields: options?.extractFormFields ?? true
98
+ extractFormFields: options?.extractFormFields ?? true,
99
+ extractBookmarks: options?.extractBookmarks ?? true,
100
+ extractTables: options?.extractTables ?? false
97
101
  };
98
102
  const doc = new PdfDocument(data);
99
103
  if (isEncrypted(doc)) {
@@ -150,6 +154,16 @@ function processPage(pageDict, pageIdx, doc, opts) {
150
154
  }
151
155
  }
152
156
  const { width, height } = getPageDimensions(pageDict, doc);
157
+ let tables = [];
158
+ if (opts.extractTables) {
159
+ try {
160
+ tables = extractTables(textFragments, width, height);
161
+ }
162
+ catch (err) {
163
+ const msg = err instanceof Error ? err.message : String(err);
164
+ warnings.push(`Table extraction failed on page ${pageNumber}: ${msg}`);
165
+ }
166
+ }
153
167
  return {
154
168
  pageNumber,
155
169
  text,
@@ -157,6 +171,7 @@ function processPage(pageDict, pageIdx, doc, opts) {
157
171
  textFragments,
158
172
  images,
159
173
  annotations,
174
+ tables,
160
175
  width,
161
176
  height,
162
177
  warnings
@@ -179,7 +194,16 @@ function finalizeRead(pages, totalPageCount, metadata, opts, doc) {
179
194
  // Non-fatal — just return empty
180
195
  }
181
196
  }
182
- return { text: allText, pages, metadata, formFields };
197
+ let bookmarks = [];
198
+ if (opts.extractBookmarks) {
199
+ try {
200
+ bookmarks = extractBookmarks(doc);
201
+ }
202
+ catch {
203
+ // Non-fatal — just return empty
204
+ }
205
+ }
206
+ return { text: allText, pages, metadata, formFields, bookmarks };
183
207
  }
184
208
  // =============================================================================
185
209
  // Helpers
@@ -0,0 +1,69 @@
1
+ /**
2
+ * Table extraction from PDF pages using text fragment positioning.
3
+ *
4
+ * Detects tabular structures by analyzing the spatial layout of text fragments.
5
+ * Since PDF content streams typically render tables as positioned text (with or
6
+ * without drawn grid lines), this module uses a text-only heuristic:
7
+ *
8
+ * 1. Group fragments into lines by Y proximity
9
+ * 2. Detect column boundaries from consistent X-position clusters
10
+ * 3. Identify contiguous blocks of multi-column lines as tables
11
+ * 4. Map fragments to cells based on column/line membership
12
+ *
13
+ * @see content-interpreter.ts for TextFragment extraction
14
+ * @see text-reconstruction.ts for line grouping logic
15
+ */
16
+ import type { TextFragment } from "./content-interpreter.js";
17
+ /**
18
+ * A single cell in a PDF table.
19
+ */
20
+ export interface PdfTableCell {
21
+ /** Text content of the cell */
22
+ text: string;
23
+ /** X position in page coordinates (points) */
24
+ x: number;
25
+ /** Y position in page coordinates (points) */
26
+ y: number;
27
+ /** Width of the cell in points */
28
+ width: number;
29
+ /** Height of the cell in points */
30
+ height: number;
31
+ /** Number of rows this cell spans (default 1) */
32
+ rowSpan?: number;
33
+ /** Number of columns this cell spans (default 1) */
34
+ colSpan?: number;
35
+ }
36
+ /**
37
+ * A single row in a PDF table.
38
+ */
39
+ export interface PdfTableRow {
40
+ /** Cells in this row, ordered left-to-right */
41
+ cells: PdfTableCell[];
42
+ }
43
+ /**
44
+ * A table extracted from a PDF page.
45
+ */
46
+ export interface PdfTable {
47
+ /** Rows in this table, ordered top-to-bottom */
48
+ rows: PdfTableRow[];
49
+ /** X position of the table (left edge) in page coordinates */
50
+ x: number;
51
+ /** Y position of the table (top edge) in page coordinates */
52
+ y: number;
53
+ /** Width of the table in points */
54
+ width: number;
55
+ /** Height of the table in points */
56
+ height: number;
57
+ }
58
+ /**
59
+ * Extract tables from a page's text fragments.
60
+ *
61
+ * Uses text positioning heuristics to detect tabular structures without
62
+ * relying on drawn lines or grid paths.
63
+ *
64
+ * @param fragments - Text fragments from `extractTextFromPage`
65
+ * @param pageWidth - Page width in points
66
+ * @param pageHeight - Page height in points
67
+ * @returns Array of detected tables
68
+ */
69
+ export declare function extractTables(fragments: TextFragment[], pageWidth: number, pageHeight: number): PdfTable[];