@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF stream filter decoder chain.
|
|
3
|
+
*
|
|
4
|
+
* Decodes PDF stream data by applying the appropriate filter(s)
|
|
5
|
+
* specified in the stream dictionary's /Filter entry.
|
|
6
|
+
*
|
|
7
|
+
* Supported filters:
|
|
8
|
+
* - /FlateDecode (zlib/deflate compression)
|
|
9
|
+
* - /ASCII85Decode (ASCII base-85 encoding)
|
|
10
|
+
* - /ASCIIHexDecode (ASCII hexadecimal encoding)
|
|
11
|
+
* - /LZWDecode (LZW compression)
|
|
12
|
+
* - /RunLengthDecode (run-length encoding)
|
|
13
|
+
*
|
|
14
|
+
* @see PDF Reference 1.7, §3.3 - Filters
|
|
15
|
+
*/
|
|
16
|
+
import type { PdfDictValue } from "./pdf-parser.js";
|
|
17
|
+
/**
|
|
18
|
+
* Decode stream data by applying the filter chain from the stream dictionary.
|
|
19
|
+
*/
|
|
20
|
+
export declare function decodeStreamFilters(data: Uint8Array, dict: PdfDictValue): Uint8Array;
|
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF stream filter decoder chain.
|
|
3
|
+
*
|
|
4
|
+
* Decodes PDF stream data by applying the appropriate filter(s)
|
|
5
|
+
* specified in the stream dictionary's /Filter entry.
|
|
6
|
+
*
|
|
7
|
+
* Supported filters:
|
|
8
|
+
* - /FlateDecode (zlib/deflate compression)
|
|
9
|
+
* - /ASCII85Decode (ASCII base-85 encoding)
|
|
10
|
+
* - /ASCIIHexDecode (ASCII hexadecimal encoding)
|
|
11
|
+
* - /LZWDecode (LZW compression)
|
|
12
|
+
* - /RunLengthDecode (run-length encoding)
|
|
13
|
+
*
|
|
14
|
+
* @see PDF Reference 1.7, §3.3 - Filters
|
|
15
|
+
*/
|
|
16
|
+
import { dictGetNumber, isPdfDict, isPdfArray } from "./pdf-parser.js";
|
|
17
|
+
import { unzlibSync } from "../../archive/compression/compress.browser.js";
|
|
18
|
+
import { inflateRaw } from "../../archive/compression/deflate-fallback.js";
|
|
19
|
+
// =============================================================================
|
|
20
|
+
// Public API
|
|
21
|
+
// =============================================================================
|
|
22
|
+
/**
 * Run a stream's bytes through the decoder(s) named in its dictionary.
 *
 * Looks up `Filter` and `DecodeParms` (falling back to the `DP` key) on
 * `dict`. A single filter name is applied once, with the parameters only
 * when they form a dictionary. An array of names is applied front to back,
 * each paired with the same-index entry of the parameter array.
 *
 * @param data - Encoded stream bytes.
 * @param dict - Stream dictionary; only its `get(key)` accessor is used here.
 * @returns The decoded bytes, or `data` itself when there is no `Filter`
 *          entry or the entry is neither a name nor an array.
 */
export function decodeStreamFilters(data, dict) {
    const filter = dict.get("Filter");
    if (filter == null) {
        return data;
    }
    const decodeParms = dict.get("DecodeParms") ?? dict.get("DP");
    // Single filter name.
    if (typeof filter === "string") {
        return applyFilter(data, filter, isPdfDict(decodeParms) ? decodeParms : undefined);
    }
    // Anything that is neither a name nor an array is ignored.
    if (!isPdfArray(filter)) {
        return data;
    }
    // Filter chain: stage N receives the output of stage N - 1.
    const parmsList = isPdfArray(decodeParms) ? decodeParms : [];
    let decoded = data;
    for (let stage = 0; stage < filter.length; stage += 1) {
        const stageParms = parmsList[stage];
        decoded = applyFilter(decoded, filter[stage], isPdfDict(stageParms) ? stageParms : undefined);
    }
    return decoded;
}
|
|
50
|
+
// =============================================================================
|
|
51
|
+
// Filter Application
|
|
52
|
+
// =============================================================================
|
|
53
|
+
/**
 * Apply one named filter to `data`.
 *
 * Both the full filter names and their abbreviated forms are recognised.
 * Filters that this module does not decode are returned unchanged:
 * DCTDecode/DCT, JPXDecode, CCITTFaxDecode/CCF, JBIG2Decode, Crypt, and any
 * unknown name.
 *
 * @param data - Bytes to decode.
 * @param filterName - Filter name taken from the stream dictionary.
 * @param parms - Decode parameters for this filter, or `undefined`.
 * @returns Decoded bytes, or `data` itself for pass-through filters.
 */
function applyFilter(data, filterName, parms) {
    if (filterName === "FlateDecode" || filterName === "Fl") {
        return decodeFlateDecode(data, parms);
    }
    if (filterName === "ASCII85Decode" || filterName === "A85") {
        return decodeAscii85(data);
    }
    if (filterName === "ASCIIHexDecode" || filterName === "AHx") {
        return decodeAsciiHex(data);
    }
    if (filterName === "LZWDecode" || filterName === "LZW") {
        return decodeLzw(data, parms);
    }
    if (filterName === "RunLengthDecode" || filterName === "RL") {
        return decodeRunLength(data);
    }
    // Image codecs (JPEG, JPEG 2000, CCITT fax, JBIG2), the Crypt filter
    // (left to the decryption layer) and unrecognised names all fall through.
    return data;
}
|
|
92
|
+
// =============================================================================
|
|
93
|
+
// FlateDecode
|
|
94
|
+
// =============================================================================
|
|
95
|
+
/**
 * Decode a FlateDecode stream and undo its predictor, if one is declared.
 *
 * Decompression is attempted as a zlib stream first and as raw deflate
 * second. If both attempts throw, the input is returned unchanged.
 *
 * @param data - Compressed bytes.
 * @param parms - Decode parameters, or `undefined`.
 * @returns The inflated (and un-predicted) bytes; `data` itself when it is
 *          empty or cannot be inflated.
 */
function decodeFlateDecode(data, parms) {
    if (data.length === 0) {
        return data;
    }
    let inflated;
    let inflatedOk = true;
    try {
        // zlib container (RFC 1950, 2-byte header).
        inflated = unzlibSync(data);
    }
    catch {
        try {
            // Headerless deflate.
            inflated = inflateRaw(data);
        }
        catch {
            inflatedOk = false;
        }
    }
    if (!inflatedOk) {
        // Best effort: hand the bytes back untouched.
        return data;
    }
    const wantsPredictor = parms ? (dictGetNumber(parms, "Predictor") ?? 1) > 1 : false;
    return wantsPredictor ? undoPredictor(inflated, parms) : inflated;
}
|
|
123
|
+
/**
 * Reverse the predictor declared in a FlateDecode/LZWDecode parameter
 * dictionary.
 *
 * Reads `Predictor` (default 1), `Columns` (default 1), `Colors` (default 1)
 * and `BitsPerComponent` (default 8) from `parms`. Predictor 2 is routed to
 * the TIFF routine and 10 through 15 to the PNG routine. Every other value,
 * including 1, leaves `data` untouched.
 *
 * @param data - Decompressed bytes that still carry prediction.
 * @param parms - Decode parameter dictionary.
 * @returns The un-predicted bytes, or `data` itself when nothing applies.
 * @see PDF Reference 1.7, Table 3.8
 */
function undoPredictor(data, parms) {
    const kind = dictGetNumber(parms, "Predictor") ?? 1;
    const width = dictGetNumber(parms, "Columns") ?? 1;
    const channels = dictGetNumber(parms, "Colors") ?? 1;
    const depth = dictGetNumber(parms, "BitsPerComponent") ?? 8;
    if (kind === 2) {
        // TIFF horizontal differencing.
        return undoTiffPredictor(data, width, channels, depth);
    }
    const isPngKind = kind >= 10 && kind <= 15;
    // PNG row filters for 10..15; no prediction for anything else.
    return isPngKind ? undoPngPredictor(data, width, channels, depth) : data;
}
|
|
146
|
+
/**
 * Undo TIFF Predictor 2 (horizontal differencing).
 *
 * Each sample was stored as its difference from the same colour component
 * of the pixel to its left, so decoding adds that left neighbour back,
 * wrapping at the sample width. Sample widths of 1, 2, 4, 8 and 16 bits are
 * handled; 16-bit samples are treated as big-endian so the carry travels
 * between the two bytes. A truncated final row is decoded as far as its
 * bytes reach.
 *
 * @param data - Predicted bytes, rows packed back to back.
 * @param columns - Samples per row for each colour component.
 * @param colors - Colour components per pixel.
 * @param bitsPerComponent - Bits per sample (1, 2, 4, 8 or 16).
 * @returns A new array of `data.length` decoded bytes. `data` itself is
 *          returned when the geometry is invalid (non-positive or
 *          non-integer `columns`/`colors`), the sample width is unsupported,
 *          or the input is empty.
 */
function undoTiffPredictor(data, columns, colors, bitsPerComponent) {
    const geometryOk = Number.isInteger(columns) && columns > 0 &&
        Number.isInteger(colors) && colors > 0;
    const depthOk = bitsPerComponent === 1 || bitsPerComponent === 2 ||
        bitsPerComponent === 4 || bitsPerComponent === 8 || bitsPerComponent === 16;
    // A zero row length would otherwise make the row loop spin forever.
    if (!geometryOk || !depthOk || data.length === 0) {
        return data;
    }
    const rowBytes = Math.ceil((columns * colors * bitsPerComponent) / 8);
    const result = new Uint8Array(data.length);
    for (let rowStart = 0; rowStart < data.length; rowStart += rowBytes) {
        const rowEnd = Math.min(rowStart + rowBytes, data.length);
        if (bitsPerComponent === 8) {
            // One byte per sample: the left neighbour is `colors` bytes back.
            for (let i = rowStart; i < rowEnd; i++) {
                const left = i - rowStart >= colors ? result[i - colors] : 0;
                result[i] = (data[i] + left) & 0xff;
            }
        }
        else if (bitsPerComponent === 16) {
            // Two bytes per sample, added as one 16-bit value.
            const stride = colors * 2;
            let i = rowStart;
            for (; i + 1 < rowEnd; i += 2) {
                let sample = (data[i] << 8) | data[i + 1];
                if (i - rowStart >= stride) {
                    sample += (result[i - stride] << 8) | result[i - stride + 1];
                }
                result[i] = (sample >> 8) & 0xff;
                result[i + 1] = sample & 0xff;
            }
            if (i < rowEnd) {
                // Dangling half sample at the end of a truncated row.
                result[i] = data[i];
            }
        }
        else {
            // Several samples per byte: work on bit fields.
            const mask = (1 << bitsPerComponent) - 1;
            const samplesPerByte = 8 / bitsPerComponent;
            for (let i = rowStart; i < rowEnd; i++) {
                let packed = 0;
                for (let k = 0; k < samplesPerByte; k++) {
                    const shift = 8 - bitsPerComponent * (k + 1);
                    const sampleIndex = (i - rowStart) * samplesPerByte + k;
                    let left = 0;
                    if (sampleIndex >= colors) {
                        const leftIndex = sampleIndex - colors;
                        const leftByteIndex = rowStart + Math.floor(leftIndex / samplesPerByte);
                        const leftShift = 8 - bitsPerComponent * ((leftIndex % samplesPerByte) + 1);
                        // The neighbour may sit in the byte still being assembled.
                        const leftByte = leftByteIndex === i ? packed : result[leftByteIndex];
                        left = (leftByte >> leftShift) & mask;
                    }
                    const sample = (((data[i] >> shift) & mask) + left) & mask;
                    packed |= sample << shift;
                }
                result[i] = packed;
            }
        }
    }
    return result;
}
|
|
167
|
+
/**
 * Undo PNG row filters.
 *
 * Every row is stored as one filter-type byte followed by the filtered row
 * bytes. Types 0 to 4 (None, Sub, Up, Average, Paeth) are reversed; any
 * other type byte is treated like None. A truncated final row is decoded as
 * far as its bytes reach rather than being dropped.
 *
 * @param data - Filtered bytes, rows packed back to back.
 * @param columns - Samples per row for each colour component.
 * @param colors - Colour components per pixel.
 * @param bitsPerComponent - Bits per sample.
 * @returns A new array holding the decoded rows without their filter bytes.
 *          `data` itself is returned when the geometry is invalid (a
 *          non-positive or non-finite row length).
 */
function undoPngPredictor(data, columns, colors, bitsPerComponent) {
    const rowBytes = Math.ceil((columns * colors * bitsPerComponent) / 8);
    const geometryOk = columns > 0 && colors > 0 && bitsPerComponent > 0 && Number.isFinite(rowBytes);
    // Invalid geometry would otherwise throw a RangeError or yield an empty result.
    if (!geometryOk) {
        return data;
    }
    const bytesPerPixel = Math.max(1, Math.ceil((colors * bitsPerComponent) / 8));
    const rowWithFilter = rowBytes + 1; // 1 byte for filter type
    const fullRows = Math.floor(data.length / rowWithFilter);
    const leftover = data.length - fullRows * rowWithFilter;
    // A lone trailing filter byte carries no pixel data.
    const tailBytes = leftover > 1 ? leftover - 1 : 0;
    const totalRows = fullRows + (tailBytes > 0 ? 1 : 0);
    const result = new Uint8Array(fullRows * rowBytes + tailBytes);
    for (let row = 0; row < totalRows; row++) {
        const srcRow = row * rowWithFilter;
        const dstRow = row * rowBytes;
        const filterType = data[srcRow];
        const rowLength = row < fullRows ? rowBytes : tailBytes;
        for (let i = 0; i < rowLength; i++) {
            const raw = data[srcRow + 1 + i];
            const a = i >= bytesPerPixel ? result[dstRow + i - bytesPerPixel] : 0; // left
            const b = row > 0 ? result[dstRow - rowBytes + i] : 0; // above
            const c = row > 0 && i >= bytesPerPixel ? result[dstRow - rowBytes + i - bytesPerPixel] : 0; // upper-left
            let value;
            switch (filterType) {
                case 1: // Sub
                    value = raw + a;
                    break;
                case 2: // Up
                    value = raw + b;
                    break;
                case 3: // Average
                    value = raw + ((a + b) >> 1);
                    break;
                case 4: // Paeth
                    value = raw + paethPredictor(a, b, c);
                    break;
                default: // None (0) and unknown types
                    value = raw;
                    break;
            }
            result[dstRow + i] = value & 0xff;
        }
    }
    return result;
}
|
|
210
|
+
/**
 * Paeth predictor: choose whichever of the three neighbours lies closest
 * to the linear estimate `a + b - c`.
 *
 * Ties are resolved in the order a, b, c.
 *
 * @param a - Left neighbour.
 * @param b - Neighbour above.
 * @param c - Upper-left neighbour.
 * @returns The selected neighbour value.
 */
function paethPredictor(a, b, c) {
    const estimate = a + b - c;
    const distA = Math.abs(estimate - a);
    const distB = Math.abs(estimate - b);
    const distC = Math.abs(estimate - c);
    const leftWins = distA <= distB && distA <= distC;
    return leftWins ? a : (distB <= distC ? b : c);
}
|
|
223
|
+
// =============================================================================
|
|
224
|
+
// ASCII85Decode
|
|
225
|
+
// =============================================================================
|
|
226
|
+
/**
 * Decode ASCII base-85 data.
 *
 * Whitespace is skipped everywhere, `~` ends the data, and a `z` at the
 * start of a group expands to four zero bytes. Five digits in the range
 * `!`..`u` form four bytes; any other byte inside a group is ignored. A
 * short final group of n digits is padded with `u` and yields n - 1 bytes.
 *
 * @param data - ASCII85-encoded bytes.
 * @returns The decoded bytes.
 */
function decodeAscii85(data) {
    const decoded = [];
    let digits = [];
    // True from the first non-`z` byte of a group until the group completes.
    let inGroup = false;
    // Convert the collected digits (padded to five) into n - 1 output bytes.
    const flush = () => {
        const keep = digits.length - 1;
        while (digits.length < 5) {
            digits.push(84);
        }
        const word = digits.reduce((acc, digit) => acc * 85 + digit, 0);
        const quad = [(word >>> 24) & 0xff, (word >>> 16) & 0xff, (word >>> 8) & 0xff, word & 0xff];
        for (let k = 0; k < keep; k++) {
            decoded.push(quad[k]);
        }
        digits = [];
        inGroup = false;
    };
    for (let pos = 0; pos < data.length; pos++) {
        const ch = data[pos];
        if (ch === 0x7e) {
            break; // EOD marker ~>
        }
        if (ch === 0x20 || ch === 0x09 || ch === 0x0a || ch === 0x0d || ch === 0x0c) {
            continue;
        }
        if (!inGroup) {
            if (ch === 0x7a) {
                // 'z' shorthand is only honoured at a group boundary.
                decoded.push(0, 0, 0, 0);
                continue;
            }
            inGroup = true;
        }
        if (ch < 0x21 || ch > 0x75) {
            continue; // not a base-85 digit
        }
        digits.push(ch - 0x21);
        if (digits.length === 5) {
            flush();
        }
    }
    if (digits.length > 0) {
        flush();
    }
    return new Uint8Array(decoded);
}
|
|
289
|
+
// =============================================================================
|
|
290
|
+
// ASCIIHexDecode
|
|
291
|
+
// =============================================================================
|
|
292
|
+
/**
 * Decode ASCII hexadecimal data.
 *
 * Pairs of hex digits (either case) become one byte each. `>` ends the
 * data. Whitespace and any other non-hex byte are ignored. A trailing
 * unpaired digit is treated as the high nibble of a final byte whose low
 * nibble is zero.
 *
 * @param data - Hex-encoded bytes.
 * @returns The decoded bytes.
 */
function decodeAsciiHex(data) {
    const decoded = [];
    let pending = -1; // high nibble waiting for its partner, or -1
    for (let pos = 0; pos < data.length; pos++) {
        const ch = data[pos];
        if (ch === 0x3e) {
            break; // EOD marker >
        }
        let nibble;
        if (ch >= 0x30 && ch <= 0x39) {
            nibble = ch - 0x30;
        }
        else {
            // Fold upper case onto lower case; only A-F / a-f land in a..f.
            const folded = ch | 0x20;
            if (folded < 0x61 || folded > 0x66) {
                continue; // whitespace or junk
            }
            nibble = folded - 0x57;
        }
        if (pending < 0) {
            pending = nibble;
            continue;
        }
        decoded.push((pending << 4) | nibble);
        pending = -1;
    }
    if (pending >= 0) {
        decoded.push(pending << 4);
    }
    return new Uint8Array(decoded);
}
|
|
332
|
+
// =============================================================================
|
|
333
|
+
// LZWDecode
|
|
334
|
+
// =============================================================================
|
|
335
|
+
/**
 * Decode an LZWDecode stream and undo its predictor, if one is declared.
 *
 * Codes are read MSB-first, starting at 9 bits and growing to at most 12.
 * Code 256 resets the table and code 257 ends the data. With `EarlyChange`
 * non-zero (the default is 1) the code width grows one code earlier than
 * the table size alone would require, so the next code is read wider as
 * soon as `nextCode + 1` reaches the current width's limit. With
 * `EarlyChange` 0 the switch happens when `nextCode` itself reaches it.
 *
 * Decoding stops at EOD, at an invalid code, or when fewer bits than one
 * full code remain, so byte-alignment padding is never read as a code.
 *
 * @param data - LZW-compressed bytes.
 * @param parms - Decode parameters (`EarlyChange`, `Predictor`, ...), or
 *                `undefined`.
 * @returns The decompressed (and un-predicted) bytes.
 */
function decodeLzw(data, parms) {
    const earlyChange = parms ? (dictGetNumber(parms, "EarlyChange") ?? 1) : 1;
    const CLEAR_TABLE = 256;
    const EOD = 257;
    const MAX_CODE_SIZE = 12;
    const MAX_TABLE_SIZE = 1 << MAX_CODE_SIZE;
    const totalBits = data.length * 8;
    const output = [];
    let bitPos = 0;
    let codeSize = 9;
    let nextCode = 258;
    let table = [];
    // Read `n` bits MSB-first. The caller guarantees they are available.
    function readBits(n) {
        let value = 0;
        for (let i = 0; i < n; i++, bitPos++) {
            value = (value << 1) | ((data[bitPos >> 3] >> (7 - (bitPos & 7))) & 1);
        }
        return value;
    }
    // Restore the initial 258-entry table and the 9-bit code width.
    function resetTable() {
        table = [];
        for (let i = 0; i < 256; i++) {
            table[i] = new Uint8Array([i]);
        }
        table[CLEAR_TABLE] = new Uint8Array(0);
        table[EOD] = new Uint8Array(0);
        nextCode = 258;
        codeSize = 9;
    }
    resetTable();
    let prevEntry = null;
    // Require a whole code; leftover padding bits are not a code.
    while (bitPos + codeSize <= totalBits) {
        const code = readBits(codeSize);
        if (code === EOD) {
            break;
        }
        if (code === CLEAR_TABLE) {
            resetTable();
            prevEntry = null;
            continue;
        }
        let entry;
        if (code < nextCode && table[code]) {
            entry = table[code];
        }
        else if (code === nextCode && prevEntry) {
            // Code refers to the entry about to be created: previous
            // sequence followed by its own first byte.
            entry = new Uint8Array(prevEntry.length + 1);
            entry.set(prevEntry);
            entry[prevEntry.length] = prevEntry[0];
        }
        else {
            // Invalid code: keep what was decoded so far.
            break;
        }
        for (let i = 0; i < entry.length; i++) {
            output.push(entry[i]);
        }
        // Add "previous sequence + first byte of this one". Codes are at
        // most 12 bits wide, so entries past 4095 could never be addressed.
        if (prevEntry !== null && nextCode < MAX_TABLE_SIZE) {
            const newEntry = new Uint8Array(prevEntry.length + 1);
            newEntry.set(prevEntry);
            newEntry[prevEntry.length] = entry[0];
            table[nextCode] = newEntry;
            nextCode++;
            // Early change widens the code one entry sooner.
            const threshold = earlyChange ? nextCode + 1 : nextCode;
            if (threshold >= (1 << codeSize) && codeSize < MAX_CODE_SIZE) {
                codeSize++;
            }
        }
        prevEntry = entry;
    }
    let result = new Uint8Array(output);
    // Apply predictor if specified.
    if (parms) {
        const predictor = dictGetNumber(parms, "Predictor") ?? 1;
        if (predictor > 1) {
            result = undoPredictor(result, parms);
        }
    }
    return result;
}
|
|
422
|
+
// =============================================================================
|
|
423
|
+
// RunLengthDecode
|
|
424
|
+
// =============================================================================
|
|
425
|
+
/**
 * Decode run-length encoded data.
 *
 * Each run starts with a length byte L:
 * - L below 128: the next L + 1 bytes are copied literally.
 * - L above 128: the next byte is repeated 257 - L times.
 * - L equal to 128: end of data.
 * Runs that extend past the end of the input are cut short.
 *
 * @param data - Run-length encoded bytes.
 * @returns The decoded bytes.
 */
function decodeRunLength(data) {
    const decoded = [];
    let pos = 0;
    while (pos < data.length) {
        const marker = data[pos++];
        if (marker === 128) {
            break; // EOD
        }
        if (marker > 128) {
            // Repeated run; skipped when the value byte is missing.
            if (pos < data.length) {
                const value = data[pos++];
                for (let remaining = 257 - marker; remaining > 0; remaining--) {
                    decoded.push(value);
                }
            }
            continue;
        }
        // Literal run, clamped to the bytes actually present.
        const stop = Math.min(pos + marker + 1, data.length);
        while (pos < stop) {
            decoded.push(data[pos++]);
        }
    }
    return new Uint8Array(decoded);
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text reconstruction from positioned text fragments.
|
|
3
|
+
*
|
|
4
|
+
* Assembles raw text fragments extracted from PDF content streams into
|
|
5
|
+
* coherent, human-readable text with proper reading order, line breaks,
|
|
6
|
+
* and paragraph detection.
|
|
7
|
+
*
|
|
8
|
+
* Challenges addressed:
|
|
9
|
+
* - PDF text has no semantic structure (only "draw char at (x,y)")
|
|
10
|
+
* - Text fragments may be out of order
|
|
11
|
+
* - Word and line boundaries must be inferred from positions
|
|
12
|
+
* - Columns and tables need proper handling
|
|
13
|
+
* - Different fonts/sizes affect spacing thresholds
|
|
14
|
+
* - Multi-column layouts need column detection
|
|
15
|
+
* - RTL (Arabic, Hebrew) text needs right-to-left sorting
|
|
16
|
+
* - Vertical CJK text needs column-based grouping
|
|
17
|
+
*
|
|
18
|
+
* @see PDF Reference 1.7, Chapter 5 - Text
|
|
19
|
+
*/
|
|
20
|
+
import type { TextFragment } from "./content-interpreter.js";
|
|
21
|
+
/**
|
|
22
|
+
* Reconstruct readable text from positioned text fragments.
|
|
23
|
+
*
|
|
24
|
+
* @param fragments - Raw text fragments with positions from content stream
|
|
25
|
+
* @returns Reconstructed text with proper line breaks and spacing
|
|
26
|
+
*/
|
|
27
|
+
export declare function reconstructText(fragments: TextFragment[]): string;
|
|
28
|
+
/**
|
|
29
|
+
* Detailed text extraction result preserving position information.
|
|
30
|
+
*/
|
|
31
|
+
export interface TextLine {
|
|
32
|
+
/** The text content of this line */
|
|
33
|
+
text: string;
|
|
34
|
+
/** Y position (PDF coordinate, origin = bottom-left) */
|
|
35
|
+
y: number;
|
|
36
|
+
/** X position of the start of the line */
|
|
37
|
+
x: number;
|
|
38
|
+
/** Font size of the first fragment */
|
|
39
|
+
fontSize: number;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Extract text as structured lines.
|
|
43
|
+
*/
|
|
44
|
+
export declare function reconstructTextLines(fragments: TextFragment[]): TextLine[];
|