@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF content stream interpreter for text extraction.
|
|
3
|
+
*
|
|
4
|
+
* Implements a full PDF graphics state machine that processes content stream
|
|
5
|
+
* operators to extract positioned text fragments. These fragments are then
|
|
6
|
+
* assembled into readable text by the text reconstruction module.
|
|
7
|
+
*
|
|
8
|
+
* Supported operator categories:
|
|
9
|
+
* - Text state: Tf, Tc, Tw, Tz, TL, Ts, Tr
|
|
10
|
+
* - Text positioning: Td, TD, Tm, T*
|
|
11
|
+
* - Text showing: Tj, TJ, ', "
|
|
12
|
+
* - Text objects: BT, ET
|
|
13
|
+
* - Graphics state: q, Q, cm, gs, i, M, ri, W, W*
|
|
14
|
+
* - Color: CS, cs, SC, sc, SCN, scn
|
|
15
|
+
* - Marked content: BDC, BMC, EMC, MP, DP
|
|
16
|
+
* - Type3 glyph: d0, d1
|
|
17
|
+
* - Shading: sh
|
|
18
|
+
* - Inline images: BI/ID/EI
|
|
19
|
+
* - XObject invocation: Do (for form XObjects containing text)
|
|
20
|
+
*
|
|
21
|
+
* @see PDF Reference 1.7, Chapter 5 - Text
|
|
22
|
+
* @see PDF Reference 1.7, Chapter 4 - Graphics
|
|
23
|
+
*/
|
|
24
|
+
import { PdfTokenizer, TokenType } from "./pdf-tokenizer.js";
|
|
25
|
+
import { resolveFont, decodeText, getCharWidth } from "./font-decoder.js";
|
|
26
|
+
import { isPdfRef, isPdfArray, dictGetName, dictGetArray } from "./pdf-parser.js";
|
|
27
|
+
// =============================================================================
|
|
28
|
+
// Constants
|
|
29
|
+
// =============================================================================
|
|
30
|
+
/**
 * Upper bound on how many Form XObject invocations may be nested inside one
 * another. Once this depth is reached further `Do` operators are ignored,
 * which stops self-referencing forms from recursing without end.
 */
const MAX_FORM_DEPTH = 10;
/** Shared TextEncoder, created once and reused for every string operand that needs encoding. */
const _textEncoder = new TextEncoder();
|
|
34
|
+
// =============================================================================
|
|
35
|
+
// RTL Detection
|
|
36
|
+
// =============================================================================
|
|
37
|
+
/**
 * Check if a character code point belongs to a right-to-left script area.
 *
 * The ranges follow the areas that Unicode assigns a default bidirectional
 * class of R or AL. Compared with a per-script list this also covers
 * Samaritan, Mandaic, Syriac Supplement and Arabic Extended-B (0800–089F) as
 * well as the supplementary-plane RTL scripts (for example Phoenician, Adlam
 * and the Arabic Mathematical Alphabetic Symbols).
 *
 * @param {number} codePoint - Unicode code point (not a UTF-16 code unit).
 * @returns {boolean} true when the code point lies in one of the RTL areas.
 */
function isRtlChar(codePoint) {
    return (
    // Hebrew, Arabic, Syriac, Arabic Supplement, Thaana, NKo, Samaritan,
    // Mandaic, Syriac Supplement, Arabic Extended-B/A (0590–08FF)
    (codePoint >= 0x0590 && codePoint <= 0x08ff) ||
        // Hebrew Presentation Forms + Arabic Presentation Forms-A (FB1D–FDFF)
        (codePoint >= 0xfb1d && codePoint <= 0xfdff) ||
        // Arabic Presentation Forms-B (FE70–FEFF)
        (codePoint >= 0xfe70 && codePoint <= 0xfeff) ||
        // Supplementary-plane RTL scripts, e.g. Cypriot, Phoenician, Kharoshthi (10800–10FFF)
        (codePoint >= 0x10800 && codePoint <= 0x10fff) ||
        // Mende Kikakui, Adlam, Arabic Mathematical Alphabetic Symbols (1E800–1EFFF)
        (codePoint >= 0x1e800 && codePoint <= 0x1efff));
}
|
|
63
|
+
/**
 * Report whether a string opens with a right-to-left character.
 *
 * Only the first code point is inspected; an empty string yields false.
 */
function detectRtl(text) {
    const lead = text.length === 0 ? undefined : text.codePointAt(0);
    if (lead === undefined) {
        return false;
    }
    return isRtlChar(lead);
}
|
|
73
|
+
// =============================================================================
|
|
74
|
+
// Content Stream Interpreter
|
|
75
|
+
// =============================================================================
|
|
76
|
+
/**
 * Collect the text fragments produced by a page's content stream(s).
 *
 * The page's resources and fonts are resolved first. Every content stream is
 * then fed, in order, through a single interpreter so that graphics and text
 * state carry over from one stream to the next.
 *
 * @returns an array of fragments; empty when the page has no content streams.
 */
export function extractTextFromPage(pageDict, doc) {
    const pageResources = doc.resolvePageResources(pageDict);
    const pageFonts = resolveFontResources(pageResources, doc);
    const streams = getContentStreams(pageDict, doc);
    const collected = [];
    if (streams.length !== 0) {
        const interpreter = new ContentInterpreter(pageFonts, doc, pageResources);
        for (const data of streams) {
            interpreter.process(data, collected);
        }
    }
    return collected;
}
|
|
95
|
+
/**
 * Build a map from font resource name to resolved font for the "Font" entry
 * of a resource dictionary.
 *
 * Entries that cannot be dereferenced to a dictionary, or whose resolution
 * throws, are left out of the result.
 */
function resolveFontResources(resources, doc) {
    const resolved = new Map();
    const fontEntry = resources.get("Font");
    const fontTable = fontEntry ? doc.derefDict(fontEntry) : null;
    if (!fontTable) {
        return resolved;
    }
    for (const [fontKey, fontRef] of fontTable) {
        const fontDict = doc.derefDict(fontRef);
        if (!fontDict) {
            continue;
        }
        try {
            resolved.set(fontKey, resolveFont(fontDict, doc));
        }
        catch {
            // Best effort: an unusable font is skipped rather than failing the page
        }
    }
    return resolved;
}
|
|
121
|
+
/**
 * Return the decoded content stream data of a page as an array.
 *
 * "Contents" may be a reference to one stream, a reference to an array of
 * streams, or an inline array of streams. Anything else gives an empty array.
 */
function getContentStreams(pageDict, doc) {
    const contents = pageDict.get("Contents");
    if (!contents) {
        return [];
    }
    if (!isPdfRef(contents)) {
        // A direct stream object is deliberately not handled: without an indirect
        // reference there is no objNum/gen to pass on for decryption.
        return isPdfArray(contents) ? resolveStreamArray(contents, doc) : [];
    }
    const single = doc.derefStreamWithObjNum(contents);
    if (single) {
        return [doc.getStreamData(single.stream, single.objNum, single.gen)];
    }
    // The reference did not point at a stream; it may point at an array of streams
    const target = doc.deref(contents);
    return isPdfArray(target) ? resolveStreamArray(target, doc) : [];
}
|
|
149
|
+
/**
 * Decode every entry of an array that resolves to a stream, preserving order.
 * Entries that do not resolve to a stream are skipped.
 */
function resolveStreamArray(arr, doc) {
    const decoded = [];
    for (const entry of arr) {
        const located = doc.derefStreamWithObjNum(entry);
        if (!located) {
            continue;
        }
        decoded.push(doc.getStreamData(located.stream, located.objNum, located.gen));
    }
    return decoded;
}
|
|
159
|
+
// =============================================================================
|
|
160
|
+
// Content Interpreter
|
|
161
|
+
// =============================================================================
|
|
162
|
+
/**
 * Stateful interpreter that executes PDF content stream operators and
 * collects positioned text fragments.
 *
 * It tracks the current transformation matrix (CTM), the text state
 * parameters and the text/line matrices. Path, colour, clipping and
 * marked-content operators are recognised but have no effect.
 */
class ContentInterpreter {
    /**
     * @param fonts - Map from font resource name to the font resolved for it.
     * @param doc - Document object used to dereference objects and decode streams.
     * @param resources - Resource dictionary in which `Do` names are looked up.
     */
    constructor(fonts, doc, resources) {
        // Graphics state
        this.stateStack = [];
        this.ctm = [1, 0, 0, 1, 0, 0];
        // Text state
        this.textState = {
            charSpacing: 0,
            wordSpacing: 0,
            horizontalScaling: 100,
            leading: 0,
            font: null,
            fontSize: 0,
            renderMode: 0,
            rise: 0
        };
        // Text object state
        this.textMatrix = [1, 0, 0, 1, 0, 0];
        this.lineMatrix = [1, 0, 0, 1, 0, 0];
        this.inTextObject = false;
        // Form XObject recursion depth
        this.formDepth = 0;
        this.fonts = fonts;
        this.doc = doc;
        this.resources = resources;
    }
    /**
     * Tokenize one content stream and execute its operators.
     *
     * Operands are accumulated until a keyword token arrives; the keyword is then
     * executed with those operands and the operand list is cleared.
     *
     * @param streamData - Decoded bytes of the content stream.
     * @param fragments - Output array that text fragments are appended to.
     */
    process(streamData, fragments) {
        const tokenizer = new PdfTokenizer(streamData);
        const operands = [];
        while (true) {
            const token = tokenizer.next();
            if (token.type === TokenType.EOF) {
                break;
            }
            if (token.type === TokenType.Keyword) {
                const keyword = token.strValue;
                // Handle inline image: BI ... ID <data> EI
                if (keyword === "BI") {
                    this.skipInlineImage(tokenizer);
                    operands.length = 0;
                }
                else {
                    this.executeOperator(keyword, operands, fragments);
                    operands.length = 0;
                }
            }
            else if (token.type === TokenType.ArrayBegin) {
                // Parse array inline (for TJ operator)
                operands.push(this.parseInlineArray(tokenizer));
            }
            else {
                operands.push(tokenToOperand(token));
            }
        }
    }
    /**
     * Skip an inline image in the content stream.
     *
     * Inline images have the form: BI <key-value pairs> ID <image data> EI
     * The key-value pairs are consumed through the tokenizer; the binary data is
     * then skipped by scanning the raw bytes for the EI marker. On return the
     * tokenizer is positioned just past EI, or at the end of the data when no
     * EI marker is found.
     */
    skipInlineImage(tokenizer) {
        // Phase 1: Read key-value pairs until we encounter the ID keyword
        while (true) {
            const tok = tokenizer.next();
            if (tok.type === TokenType.EOF) {
                return;
            }
            if (tok.type === TokenType.Keyword && tok.strValue === "ID") {
                break;
            }
            // Just consume the token (key-value pairs) — we don't need them
        }
        // Phase 2: Skip one byte after ID (per the PDF spec it is a single whitespace byte)
        const data = tokenizer.bytes;
        let pos = tokenizer.position;
        if (pos < data.length) {
            pos++;
        }
        // Phase 3: Scan forward for EI (0x45 0x49). To tell the marker apart from
        // image data that happens to contain "EI", it must be preceded by
        // whitespace and followed by whitespace, a delimiter, or the end of data.
        while (pos + 1 < data.length) {
            if (data[pos] === 0x45 &&
                data[pos + 1] === 0x49 &&
                pos > 0 &&
                isWhitespaceByte(data[pos - 1]) &&
                (pos + 2 >= data.length ||
                    isWhitespaceByte(data[pos + 2]) ||
                    isDelimiterByte(data[pos + 2]))) {
                // Found EI — advance past it
                tokenizer.position = pos + 2;
                return;
            }
            pos++;
        }
        // If we didn't find EI, just set position to end
        tokenizer.position = data.length;
    }
    /**
     * Read tokens up to the matching array end (or EOF) and return them as an
     * operand array. Called after an ArrayBegin token has been consumed.
     */
    parseInlineArray(tokenizer) {
        const arr = [];
        while (true) {
            const tok = tokenizer.next();
            if (tok.type === TokenType.ArrayEnd || tok.type === TokenType.EOF) {
                break;
            }
            arr.push(tokenToOperand(tok));
        }
        return arr;
    }
    /**
     * Execute a single content stream operator.
     *
     * @param op - Operator keyword, e.g. "Tj" or "cm".
     * @param operands - Operands collected before the operator.
     * @param fragments - Output array that text fragments are appended to.
     */
    executeOperator(op, operands, fragments) {
        switch (op) {
            // ---- Graphics State ----
            case "q":
                this.saveState();
                break;
            case "Q":
                this.restoreState();
                break;
            case "cm":
                if (operands.length >= 6) {
                    this.concatMatrix(nums(operands, 6));
                }
                break;
            // ---- Graphics State (no-op for text extraction) ----
            case "gs": // ExtGState
            case "i": // Flatness
            case "M": // Miter limit
            case "ri": // Rendering intent
            case "sh": // Shading
                // Consume operands, no action needed for text extraction
                break;
            // ---- Clipping (no-op) ----
            case "W": // Clipping (non-zero winding)
            case "W*": // Clipping (even-odd)
                break;
            // ---- Color Operators (no-op for text extraction) ----
            case "CS": // Set color space (stroking)
            case "cs": // Set color space (non-stroking)
            case "SC": // Set color (stroking)
            case "sc": // Set color (non-stroking)
            case "SCN": // Set color (stroking, extended)
            case "scn": // Set color (non-stroking, extended)
            case "G": // Set gray (stroking)
            case "g": // Set gray (non-stroking)
            case "RG": // Set RGB (stroking)
            case "rg": // Set RGB (non-stroking)
            case "K": // Set CMYK (stroking)
            case "k": // Set CMYK (non-stroking)
                // Consume operands, no action needed
                break;
            // ---- Marked Content (no-op for text extraction) ----
            case "BDC": // Begin marked content with properties
            case "BMC": // Begin marked content
            case "EMC": // End marked content
            case "MP": // Marked content point
            case "DP": // Marked content point with properties
                break;
            // ---- Type3 Font Glyph Operators (no-op) ----
            case "d0": // Set glyph width
            case "d1": // Set glyph width and bounding box
                break;
            // ---- Path Construction/Painting (no-op for text extraction) ----
            case "m": // moveto
            case "l": // lineto
            case "c": // curveto (cubic Bézier)
            case "v": // curveto (initial point replicated)
            case "y": // curveto (final point replicated)
            case "h": // closepath
            case "re": // rectangle
            case "S": // stroke
            case "s": // close and stroke
            case "f": // fill (non-zero winding)
            case "F": // fill (non-zero winding, obsolete)
            case "f*": // fill (even-odd)
            case "B": // fill and stroke (non-zero)
            case "B*": // fill and stroke (even-odd)
            case "b": // close, fill and stroke (non-zero)
            case "b*": // close, fill and stroke (even-odd)
            case "n": // end path without fill/stroke
            case "j": // line join style
            case "J": // line cap style
            case "d": // dash pattern
            case "w": // line width
                break;
            // ---- Text State ----
            case "Tc":
                this.textState.charSpacing = num(operands, 0);
                break;
            case "Tw":
                this.textState.wordSpacing = num(operands, 0);
                break;
            case "Tz":
                this.textState.horizontalScaling = num(operands, 0);
                break;
            case "TL":
                this.textState.leading = num(operands, 0);
                break;
            case "Tf":
                this.setFont(operands);
                break;
            case "Tr":
                this.textState.renderMode = num(operands, 0);
                break;
            case "Ts":
                this.textState.rise = num(operands, 0);
                break;
            // ---- Text Objects ----
            case "BT":
                this.beginText();
                break;
            case "ET":
                this.inTextObject = false;
                break;
            // ---- Text Positioning ----
            case "Td":
                this.moveText(num(operands, 0), num(operands, 1));
                break;
            case "TD":
                // TD also sets the leading to the negated vertical offset
                this.textState.leading = -num(operands, 1);
                this.moveText(num(operands, 0), num(operands, 1));
                break;
            case "Tm":
                if (operands.length >= 6) {
                    this.setTextMatrix(nums(operands, 6));
                }
                break;
            case "T*":
                this.moveText(0, -this.textState.leading);
                break;
            // ---- Text Showing ----
            case "Tj":
                this.showText(operands[0], fragments);
                break;
            case "TJ":
                this.showTextArray(operands[0], fragments);
                break;
            case "'":
                // Move to the next line, then show the string
                this.moveText(0, -this.textState.leading);
                this.showText(operands[0], fragments);
                break;
            case '"':
                // Operand order is: word spacing, character spacing, string
                this.textState.wordSpacing = num(operands, 0);
                this.textState.charSpacing = num(operands, 1);
                this.moveText(0, -this.textState.leading);
                this.showText(operands[2], fragments);
                break;
            // ---- XObject ----
            case "Do":
                this.doXObject(operands, fragments);
                break;
        }
    }
    // ===========================================================================
    // Graphics State
    // ===========================================================================
    /** Push copies of the CTM and the text state onto the state stack (operator q). */
    saveState() {
        this.stateStack.push({
            ctm: [...this.ctm],
            textState: { ...this.textState }
        });
    }
    /** Pop the most recently saved state (operator Q); does nothing when the stack is empty. */
    restoreState() {
        const state = this.stateStack.pop();
        if (state) {
            this.ctm = state.ctm;
            this.textState = state.textState;
        }
    }
    /** Pre-multiply the CTM by `m` (operator cm). */
    concatMatrix(m) {
        this.ctm = multiplyMatrices(m, this.ctm);
    }
    // ===========================================================================
    // Text State
    // ===========================================================================
    /**
     * Select the current font and size (operator Tf).
     * An unknown font name clears the current font, which suppresses text output
     * until another font is selected. A non-numeric size is treated as 0.
     */
    setFont(operands) {
        if (operands.length < 2) {
            return;
        }
        const fontName = typeof operands[0] === "string" ? operands[0] : String(operands[0]);
        const fontSize = typeof operands[1] === "number" ? operands[1] : 0;
        this.textState.font = this.fonts.get(fontName) ?? null;
        this.textState.fontSize = fontSize;
    }
    /** Start a text object (operator BT): both text matrices are reset to identity. */
    beginText() {
        this.inTextObject = true;
        this.textMatrix = [1, 0, 0, 1, 0, 0];
        this.lineMatrix = [1, 0, 0, 1, 0, 0];
    }
    /** Translate the line matrix by (tx, ty) and restart the text matrix from it. */
    moveText(tx, ty) {
        const m = [1, 0, 0, 1, tx, ty];
        this.lineMatrix = multiplyMatrices(m, this.lineMatrix);
        this.textMatrix = [...this.lineMatrix];
    }
    /** Replace both the text matrix and the line matrix with copies of `m` (operator Tm). */
    setTextMatrix(m) {
        this.textMatrix = [...m];
        this.lineMatrix = [...m];
    }
    // ===========================================================================
    // Text Showing
    // ===========================================================================
    /**
     * Decode a string operand with the current font, append a fragment for it
     * and advance the text matrix by the string's width.
     *
     * Nothing happens when no font is selected, the operand is neither bytes
     * nor a string, or the decoded text is empty.
     */
    showText(operand, fragments) {
        if (operand === undefined || !this.textState.font) {
            return;
        }
        let bytes;
        if (operand instanceof Uint8Array) {
            bytes = operand;
        }
        else if (typeof operand === "string") {
            bytes = _textEncoder.encode(operand);
        }
        else {
            return;
        }
        const font = this.textState.font;
        const text = decodeText(bytes, font);
        if (text.length === 0) {
            return;
        }
        // Calculate position using text matrix and CTM
        const tm = multiplyMatrices(this.textMatrix, this.ctm);
        const x = tm[4];
        const y = tm[5];
        // Scale the nominal size by the length of the transformed x basis vector
        const fontSize = this.textState.fontSize * Math.sqrt(tm[0] * tm[0] + tm[1] * tm[1]);
        // Calculate text width
        const width = this.calculateTextWidth(bytes, font);
        // Determine vertical text: check if font has WMode=1
        const isVertical = font.wmode === 1;
        // Determine RTL: check the first character of the decoded text
        const isRtl = detectRtl(text);
        fragments.push({
            text,
            x,
            y,
            fontSize: Math.abs(fontSize),
            fontName: font.baseFontName,
            width,
            charSpacing: this.textState.charSpacing,
            wordSpacing: this.textState.wordSpacing,
            horizontalScaling: this.textState.horizontalScaling,
            isVertical,
            isRtl
        });
        // Advance text matrix, reusing the width computed above
        this.advanceTextPosition(bytes, font, width);
    }
    /**
     * Handle the TJ operator: strings are shown, numbers shift the text position.
     */
    showTextArray(operand, fragments) {
        if (operand === undefined || !Array.isArray(operand)) {
            return;
        }
        for (const item of operand) {
            if (typeof item === "number") {
                // Negative number = move right, positive = move left (in thousandths of text space unit)
                const displacement = (-item / 1000) * this.textState.fontSize * (this.textState.horizontalScaling / 100);
                this.textMatrix[4] += displacement * this.textMatrix[0];
                this.textMatrix[5] += displacement * this.textMatrix[1];
            }
            else {
                this.showText(item, fragments);
            }
        }
    }
    /**
     * Sum the horizontal advance of every character code in `bytes`.
     *
     * Each glyph contributes (w0 * fontSize + charSpacing + wordSpacing) * Th,
     * where w0 is the glyph width in thousandths, Th is horizontalScaling / 100
     * and wordSpacing is added only for the single-byte code 32. A value of 32
     * that comes from a multi-byte code does not receive word spacing.
     *
     * @returns the width in unscaled text space units.
     */
    calculateTextWidth(bytes, font) {
        const { fontSize, charSpacing, wordSpacing, horizontalScaling } = this.textState;
        const hScale = horizontalScaling / 100;
        const isComposite = font.subtype === "Type0" || font.bytesPerCode === 2;
        let width = 0;
        let i = 0;
        while (i < bytes.length) {
            let codeLen = 1;
            if (isComposite) {
                // Use the CMap codespace ranges when available; otherwise assume
                // two-byte codes. A lone trailing byte is read as a one-byte code.
                let declared = 0;
                if (font.toUnicode?.hasCodeSpaceRanges) {
                    declared = font.toUnicode.getCodeLength(bytes[i]);
                }
                if (declared !== 1) {
                    codeLen = i + 1 < bytes.length ? 2 : 1;
                }
            }
            const code = codeLen === 2 ? (bytes[i] << 8) | bytes[i + 1] : bytes[i];
            i += codeLen;
            let advance = (getCharWidth(code, font) / 1000) * fontSize + charSpacing;
            if (codeLen === 1 && code === 0x20) {
                advance += wordSpacing;
            }
            width += advance * hScale;
        }
        return width;
    }
    /**
     * Advance the text matrix along its x axis by the width of the shown text.
     *
     * @param bytes - Character codes that were shown.
     * @param font - Font the codes were shown with.
     * @param width - Optional precomputed width; computed from `bytes` when omitted.
     */
    advanceTextPosition(bytes, font, width = this.calculateTextWidth(bytes, font)) {
        this.textMatrix[4] += width * this.textMatrix[0];
        this.textMatrix[5] += width * this.textMatrix[1];
    }
    // ===========================================================================
    // XObject Handling (Form XObjects may contain text)
    // ===========================================================================
    /**
     * Handle the Do operator. Only XObjects whose Subtype is "Form" are
     * processed; anything else is ignored.
     *
     * The form runs with its own resources and fonts (falling back to the
     * current ones) and inside an implicit save/restore of the graphics state,
     * so nothing it changes leaks back into the invoking stream.
     */
    doXObject(operands, fragments) {
        if (operands.length < 1) {
            return;
        }
        // Guard against infinite recursion from self-referencing Form XObjects
        if (this.formDepth >= MAX_FORM_DEPTH) {
            return;
        }
        const name = typeof operands[0] === "string" ? operands[0] : String(operands[0]);
        // Look up XObject in the resources that are current for this stream
        const xobjects = this.resources.get("XObject");
        if (!xobjects) {
            return;
        }
        const xobjDict = this.doc.derefDict(xobjects);
        if (!xobjDict) {
            return;
        }
        const xobj = xobjDict.get(name);
        if (!xobj) {
            return;
        }
        const streamResult = this.doc.derefStreamWithObjNum(xobj);
        if (!streamResult) {
            return;
        }
        const stream = streamResult.stream;
        const streamDict = stream.dict;
        if (dictGetName(streamDict, "Subtype") !== "Form") {
            return;
        }
        // The form has its own resources; fall back to the current ones if absent
        const formResources = streamDict.get("Resources");
        const resolvedResources = formResources
            ? (this.doc.derefDict(formResources) ?? this.resources)
            : this.resources;
        // Form fonts take precedence over the fonts already in scope
        const mergedFonts = new Map(this.fonts);
        for (const [k, v] of resolveFontResources(resolvedResources, this.doc)) {
            mergedFonts.set(k, v);
        }
        const matrix = dictGetArray(streamDict, "Matrix");
        const formData = this.doc.getStreamData(stream, streamResult.objNum, streamResult.gen);
        // Snapshot everything the form may change
        const savedFonts = this.fonts;
        const savedResources = this.resources;
        const savedStack = this.stateStack;
        const savedCtm = this.ctm;
        const savedTextState = this.textState;
        this.fonts = mergedFonts;
        // Nested Do operators must resolve names in the form's resources
        this.resources = resolvedResources;
        // A private stack keeps an unbalanced Q inside the form from popping the caller's entries
        this.stateStack = [];
        // Operators mutate textState in place, so the form works on a copy
        this.textState = { ...savedTextState };
        this.formDepth++;
        try {
            // Apply form matrix if present
            if (matrix && matrix.length === 6) {
                this.concatMatrix(matrix);
            }
            this.process(formData, fragments);
        }
        finally {
            this.formDepth--;
            this.ctm = savedCtm;
            this.textState = savedTextState;
            this.stateStack = savedStack;
            this.resources = savedResources;
            this.fonts = savedFonts;
        }
    }
}
|
|
647
|
+
// =============================================================================
|
|
648
|
+
// Inline Image Helpers
|
|
649
|
+
// =============================================================================
|
|
650
|
+
/** Tell whether a byte value is one of the six PDF whitespace characters. */
function isWhitespaceByte(b) {
    switch (b) {
        case 0x00: // NUL
        case 0x09: // tab
        case 0x0a: // line feed
        case 0x0c: // form feed
        case 0x0d: // carriage return
        case 0x20: // space
            return true;
        default:
            return false;
    }
}
|
|
654
|
+
/** Tell whether a byte value is one of the PDF delimiter characters: ( ) < > [ ] { } / % */
function isDelimiterByte(b) {
    switch (b) {
        case 0x28: // (
        case 0x29: // )
        case 0x3c: // <
        case 0x3e: // >
        case 0x5b: // [
        case 0x5d: // ]
        case 0x7b: // {
        case 0x7d: // }
        case 0x2f: // /
        case 0x25: // %
            return true;
        default:
            return false;
    }
}
|
|
668
|
+
/**
 * Convert a tokenizer token into the plain value used as an operator operand.
 *
 * Numbers, names, strings (as raw bytes) and booleans map to their value with
 * a type-appropriate default when the token carries none; null maps to null.
 * Any other token type yields its string value, or null when it has none.
 */
function tokenToOperand(token) {
    const kind = token.type;
    if (kind === TokenType.Number) {
        return token.numValue ?? 0;
    }
    if (kind === TokenType.Name) {
        return token.strValue ?? "";
    }
    if (kind === TokenType.LiteralString || kind === TokenType.HexString) {
        return token.rawBytes ?? new Uint8Array(0);
    }
    if (kind === TokenType.Boolean) {
        return token.boolValue ?? false;
    }
    if (kind === TokenType.Null) {
        return null;
    }
    if (kind === TokenType.ArrayBegin) {
        // Not expected here: arrays are parsed by the caller before this point
        return [];
    }
    return token.strValue ?? null;
}
|
|
688
|
+
/** Return the operand at `index` when it is a number; otherwise (or when missing) return 0. */
function num(operands, index) {
    const candidate = operands[index];
    if (typeof candidate !== "number") {
        return 0;
    }
    return candidate;
}
|
|
692
|
+
/** Return the first `count` operands as numbers, substituting 0 for anything that is not a number. */
function nums(operands, count) {
    const values = [];
    let index = 0;
    while (index < count) {
        values.push(num(operands, index));
        index++;
    }
    return values;
}
|
|
699
|
+
// =============================================================================
|
|
700
|
+
// Matrix Operations
|
|
701
|
+
// =============================================================================
|
|
702
|
+
/**
 * Compute the product m1 × m2 of two affine transformation matrices.
 *
 * Each matrix is the six-element form [a, b, c, d, e, f] of the 3x3 matrix
 * [a b 0; c d 0; e f 1]. The result uses the same six-element form.
 */
function multiplyMatrices(m1, m2) {
    const a1 = m1[0];
    const b1 = m1[1];
    const c1 = m1[2];
    const d1 = m1[3];
    const e1 = m1[4];
    const f1 = m1[5];
    const a2 = m2[0];
    const b2 = m2[1];
    const c2 = m2[2];
    const d2 = m2[3];
    const e2 = m2[4];
    const f2 = m2[5];
    const product = [];
    // Linear part
    product.push(a1 * a2 + b1 * c2);
    product.push(a1 * b2 + b1 * d2);
    product.push(c1 * a2 + d1 * c2);
    product.push(c1 * b2 + d1 * d2);
    // Translation part
    product.push(e1 * a2 + f1 * c2 + e2);
    product.push(e1 * b2 + f1 * d2 + f2);
    return product;
}
|