@llamaindex/liteparse 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/core/parser.d.ts.map +1 -1
- package/dist/src/core/parser.js +2 -2
- package/dist/src/core/parser.js.map +1 -1
- package/dist/src/engines/pdf/interface.d.ts +7 -2
- package/dist/src/engines/pdf/interface.d.ts.map +1 -1
- package/dist/src/engines/pdf/pdfjs.d.ts +3 -3
- package/dist/src/engines/pdf/pdfjs.d.ts.map +1 -1
- package/dist/src/engines/pdf/pdfjs.js +432 -220
- package/dist/src/engines/pdf/pdfjs.js.map +1 -1
- package/dist/src/processing/gridProjection.d.ts.map +1 -1
- package/dist/src/processing/gridProjection.js +18 -3
- package/dist/src/processing/gridProjection.js.map +1 -1
- package/dist/src/processing/gridProjection.test.js +30 -0
- package/dist/src/processing/gridProjection.test.js.map +1 -1
- package/dist/src/vendor/pdfjs/pdf.worker.mjs +2 -1
- package/package.json +1 -1
- package/src/vendor/pdfjs/pdf.worker.mjs +2 -1
|
@@ -36,198 +36,423 @@ function applyTransformation(point, transform) {
|
|
|
36
36
|
};
|
|
37
37
|
}
|
|
38
38
|
// Pre-compiled regex patterns for string decoding
|
|
39
|
-
const BUGGY_FONT_MARKER_REGEX = /:->\|>_(\d+)_\d+_<\|<-:/g;
|
|
40
39
|
const BUGGY_FONT_MARKER_CHECK = ":->|>";
|
|
41
40
|
const PIPE_PATTERN_REGEX = /\s*\|([^|])\|\s*/g;
|
|
42
41
|
/**
|
|
43
|
-
*
|
|
44
|
-
* Many fonts with "Differences" arrays use similar patterns for tabular digits.
|
|
45
|
-
* These mappings are derived from common font encoding conventions.
|
|
42
|
+
* Adobe Glyph List subset: maps standard PostScript glyph names to Unicode characters.
|
|
46
43
|
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
44
|
+
* When PDF.js detects a "buggy" font (one whose ToUnicode/encoding maps glyphs to
|
|
45
|
+
* control characters or PUA code points), it emits markers containing the glyph's
|
|
46
|
+
* original char code AND the glyph name from the font's /Differences or /Encoding
|
|
47
|
+
* dictionary. This map resolves those glyph names to correct Unicode characters.
|
|
49
48
|
*
|
|
50
|
-
*
|
|
51
|
-
*
|
|
52
|
-
*
|
|
49
|
+
* This is a ~200-entry subset of the full Adobe Glyph List (~4,300 entries).
|
|
50
|
+
* The full canonical source is: https://github.com/adobe-type-tools/agl-aglfn
|
|
51
|
+
* (see glyphlist.txt). Our subset covers basic Latin, digits, ligatures, punctuation,
|
|
52
|
+
* typographic characters, Greek, math symbols, and common accented Latin. Glyph names
|
|
53
|
+
* not in this subset fall through to the uniXXXX convention and ASCII-range fallbacks
|
|
54
|
+
* in resolveGlyphName(). Add entries here if a PDF's buggy font uses a standard glyph
|
|
55
|
+
* name that isn't covered and doesn't match those fallbacks.
|
|
53
56
|
*/
|
|
54
|
-
const
|
|
55
|
-
//
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
57
|
+
const ADOBE_GLYPH_MAP = {
|
|
58
|
+
// Basic Latin letters
|
|
59
|
+
A: "A",
|
|
60
|
+
B: "B",
|
|
61
|
+
C: "C",
|
|
62
|
+
D: "D",
|
|
63
|
+
E: "E",
|
|
64
|
+
F: "F",
|
|
65
|
+
G: "G",
|
|
66
|
+
H: "H",
|
|
67
|
+
I: "I",
|
|
68
|
+
J: "J",
|
|
69
|
+
K: "K",
|
|
70
|
+
L: "L",
|
|
71
|
+
M: "M",
|
|
72
|
+
N: "N",
|
|
73
|
+
O: "O",
|
|
74
|
+
P: "P",
|
|
75
|
+
Q: "Q",
|
|
76
|
+
R: "R",
|
|
77
|
+
S: "S",
|
|
78
|
+
T: "T",
|
|
79
|
+
U: "U",
|
|
80
|
+
V: "V",
|
|
81
|
+
W: "W",
|
|
82
|
+
X: "X",
|
|
83
|
+
Y: "Y",
|
|
84
|
+
Z: "Z",
|
|
85
|
+
a: "a",
|
|
86
|
+
b: "b",
|
|
87
|
+
c: "c",
|
|
88
|
+
d: "d",
|
|
89
|
+
e: "e",
|
|
90
|
+
f: "f",
|
|
91
|
+
g: "g",
|
|
92
|
+
h: "h",
|
|
93
|
+
i: "i",
|
|
94
|
+
j: "j",
|
|
95
|
+
k: "k",
|
|
96
|
+
l: "l",
|
|
97
|
+
m: "m",
|
|
98
|
+
n: "n",
|
|
99
|
+
o: "o",
|
|
100
|
+
p: "p",
|
|
101
|
+
q: "q",
|
|
102
|
+
r: "r",
|
|
103
|
+
s: "s",
|
|
104
|
+
t: "t",
|
|
105
|
+
u: "u",
|
|
106
|
+
v: "v",
|
|
107
|
+
w: "w",
|
|
108
|
+
x: "x",
|
|
109
|
+
y: "y",
|
|
110
|
+
z: "z",
|
|
111
|
+
// Digits
|
|
112
|
+
zero: "0",
|
|
113
|
+
one: "1",
|
|
114
|
+
two: "2",
|
|
115
|
+
three: "3",
|
|
116
|
+
four: "4",
|
|
117
|
+
five: "5",
|
|
118
|
+
six: "6",
|
|
119
|
+
seven: "7",
|
|
120
|
+
eight: "8",
|
|
121
|
+
nine: "9",
|
|
122
|
+
// Ligatures (Unicode presentation forms — decomposed later by stripControlChars)
|
|
123
|
+
fi: "\uFB01",
|
|
124
|
+
fl: "\uFB02",
|
|
125
|
+
ff: "\uFB00",
|
|
126
|
+
ffi: "\uFB03",
|
|
127
|
+
ffl: "\uFB04",
|
|
128
|
+
// Punctuation and symbols
|
|
129
|
+
space: " ",
|
|
130
|
+
period: ".",
|
|
131
|
+
comma: ",",
|
|
132
|
+
colon: ":",
|
|
133
|
+
semicolon: ";",
|
|
134
|
+
hyphen: "-",
|
|
135
|
+
minus: "\u2212",
|
|
136
|
+
slash: "/",
|
|
137
|
+
question: "?",
|
|
138
|
+
dollar: "$",
|
|
139
|
+
parenleft: "(",
|
|
140
|
+
parenright: ")",
|
|
141
|
+
asterisk: "*",
|
|
142
|
+
plus: "+",
|
|
143
|
+
equal: "=",
|
|
144
|
+
numbersign: "#",
|
|
145
|
+
percent: "%",
|
|
146
|
+
ampersand: "&",
|
|
147
|
+
at: "@",
|
|
148
|
+
exclam: "!",
|
|
149
|
+
bracketleft: "[",
|
|
150
|
+
bracketright: "]",
|
|
151
|
+
braceleft: "{",
|
|
152
|
+
braceright: "}",
|
|
153
|
+
underscore: "_",
|
|
154
|
+
quotedbl: '"',
|
|
155
|
+
quotesingle: "'",
|
|
156
|
+
backslash: "\\",
|
|
157
|
+
bar: "|",
|
|
158
|
+
asciitilde: "~",
|
|
159
|
+
asciicircum: "^",
|
|
160
|
+
grave: "`",
|
|
161
|
+
less: "<",
|
|
162
|
+
greater: ">",
|
|
163
|
+
// Typographic
|
|
164
|
+
quoteright: "\u2019",
|
|
165
|
+
quoteleft: "\u2018",
|
|
166
|
+
quotedblleft: "\u201C",
|
|
167
|
+
quotedblright: "\u201D",
|
|
168
|
+
quotesinglbase: "\u201A",
|
|
169
|
+
quotedblbase: "\u201E",
|
|
170
|
+
endash: "\u2013",
|
|
171
|
+
emdash: "\u2014",
|
|
172
|
+
bullet: "\u2022",
|
|
173
|
+
ellipsis: "\u2026",
|
|
174
|
+
dagger: "\u2020",
|
|
175
|
+
daggerdbl: "\u2021",
|
|
176
|
+
guilsinglleft: "\u2039",
|
|
177
|
+
guilsinglright: "\u203A",
|
|
178
|
+
guillemotleft: "\u00AB",
|
|
179
|
+
guillemotright: "\u00BB",
|
|
180
|
+
trademark: "\u2122",
|
|
181
|
+
registered: "\u00AE",
|
|
182
|
+
copyright: "\u00A9",
|
|
183
|
+
// Greek
|
|
184
|
+
Alpha: "\u0391",
|
|
185
|
+
Beta: "\u0392",
|
|
186
|
+
Gamma: "\u0393",
|
|
187
|
+
Delta: "\u2206",
|
|
188
|
+
Epsilon: "\u0395",
|
|
189
|
+
Zeta: "\u0396",
|
|
190
|
+
Eta: "\u0397",
|
|
191
|
+
Theta: "\u0398",
|
|
192
|
+
Iota: "\u0399",
|
|
193
|
+
Kappa: "\u039A",
|
|
194
|
+
Lambda: "\u039B",
|
|
195
|
+
Mu: "\u039C",
|
|
196
|
+
Nu: "\u039D",
|
|
197
|
+
Xi: "\u039E",
|
|
198
|
+
Omicron: "\u039F",
|
|
199
|
+
Pi: "\u03A0",
|
|
200
|
+
Rho: "\u03A1",
|
|
201
|
+
Sigma: "\u03A3",
|
|
202
|
+
Tau: "\u03A4",
|
|
203
|
+
Upsilon: "\u03A5",
|
|
204
|
+
Phi: "\u03A6",
|
|
205
|
+
Chi: "\u03A7",
|
|
206
|
+
Psi: "\u03A8",
|
|
207
|
+
Omega: "\u2126",
|
|
208
|
+
alpha: "\u03B1",
|
|
209
|
+
beta: "\u03B2",
|
|
210
|
+
gamma: "\u03B3",
|
|
211
|
+
delta: "\u03B4",
|
|
212
|
+
epsilon: "\u03B5",
|
|
213
|
+
zeta: "\u03B6",
|
|
214
|
+
eta: "\u03B7",
|
|
215
|
+
theta: "\u03B8",
|
|
216
|
+
iota: "\u03B9",
|
|
217
|
+
kappa: "\u03BA",
|
|
218
|
+
lambda: "\u03BB",
|
|
219
|
+
mu: "\u00B5",
|
|
220
|
+
nu: "\u03BD",
|
|
221
|
+
xi: "\u03BE",
|
|
222
|
+
omicron: "\u03BF",
|
|
223
|
+
pi: "\u03C0",
|
|
224
|
+
rho: "\u03C1",
|
|
225
|
+
sigma: "\u03C3",
|
|
226
|
+
tau: "\u03C4",
|
|
227
|
+
upsilon: "\u03C5",
|
|
228
|
+
phi: "\u03C6",
|
|
229
|
+
chi: "\u03C7",
|
|
230
|
+
psi: "\u03C8",
|
|
231
|
+
omega: "\u03C9",
|
|
232
|
+
// Math symbols
|
|
233
|
+
greaterequal: "\u2265",
|
|
234
|
+
lessequal: "\u2264",
|
|
235
|
+
notequal: "\u2260",
|
|
236
|
+
plusminus: "\u00B1",
|
|
237
|
+
multiply: "\u00D7",
|
|
238
|
+
divide: "\u00F7",
|
|
239
|
+
infinity: "\u221E",
|
|
240
|
+
summation: "\u2211",
|
|
241
|
+
integral: "\u222B",
|
|
242
|
+
partialdiff: "\u2202",
|
|
243
|
+
radical: "\u221A",
|
|
244
|
+
approxequal: "\u2248",
|
|
245
|
+
degree: "\u00B0",
|
|
246
|
+
// Accented Latin (common)
|
|
247
|
+
Aacute: "\u00C1",
|
|
248
|
+
Agrave: "\u00C0",
|
|
249
|
+
Acircumflex: "\u00C2",
|
|
250
|
+
Atilde: "\u00C3",
|
|
251
|
+
Adieresis: "\u00C4",
|
|
252
|
+
Aring: "\u00C5",
|
|
253
|
+
Eacute: "\u00C9",
|
|
254
|
+
Egrave: "\u00C8",
|
|
255
|
+
Ecircumflex: "\u00CA",
|
|
256
|
+
Edieresis: "\u00CB",
|
|
257
|
+
Iacute: "\u00CD",
|
|
258
|
+
Igrave: "\u00CC",
|
|
259
|
+
Icircumflex: "\u00CE",
|
|
260
|
+
Idieresis: "\u00CF",
|
|
261
|
+
Oacute: "\u00D3",
|
|
262
|
+
Ograve: "\u00D2",
|
|
263
|
+
Ocircumflex: "\u00D4",
|
|
264
|
+
Otilde: "\u00D5",
|
|
265
|
+
Odieresis: "\u00D6",
|
|
266
|
+
Uacute: "\u00DA",
|
|
267
|
+
Ugrave: "\u00D9",
|
|
268
|
+
Ucircumflex: "\u00DB",
|
|
269
|
+
Udieresis: "\u00DC",
|
|
270
|
+
Ntilde: "\u00D1",
|
|
271
|
+
Ccedilla: "\u00C7",
|
|
272
|
+
Scaron: "\u0160",
|
|
273
|
+
Zcaron: "\u017D",
|
|
274
|
+
aacute: "\u00E1",
|
|
275
|
+
agrave: "\u00E0",
|
|
276
|
+
acircumflex: "\u00E2",
|
|
277
|
+
atilde: "\u00E3",
|
|
278
|
+
adieresis: "\u00E4",
|
|
279
|
+
aring: "\u00E5",
|
|
280
|
+
eacute: "\u00E9",
|
|
281
|
+
egrave: "\u00E8",
|
|
282
|
+
ecircumflex: "\u00EA",
|
|
283
|
+
edieresis: "\u00EB",
|
|
284
|
+
iacute: "\u00ED",
|
|
285
|
+
igrave: "\u00EC",
|
|
286
|
+
icircumflex: "\u00EE",
|
|
287
|
+
idieresis: "\u00EF",
|
|
288
|
+
oacute: "\u00F3",
|
|
289
|
+
ograve: "\u00F2",
|
|
290
|
+
ocircumflex: "\u00F4",
|
|
291
|
+
otilde: "\u00F5",
|
|
292
|
+
odieresis: "\u00F6",
|
|
293
|
+
uacute: "\u00FA",
|
|
294
|
+
ugrave: "\u00F9",
|
|
295
|
+
ucircumflex: "\u00FB",
|
|
296
|
+
udieresis: "\u00FC",
|
|
297
|
+
ntilde: "\u00F1",
|
|
298
|
+
ccedilla: "\u00E7",
|
|
299
|
+
scaron: "\u0161",
|
|
300
|
+
zcaron: "\u017E",
|
|
301
|
+
ydieresis: "\u00FF",
|
|
302
|
+
// Miscellaneous
|
|
303
|
+
AE: "\u00C6",
|
|
304
|
+
ae: "\u00E6",
|
|
305
|
+
OE: "\u0152",
|
|
306
|
+
oe: "\u0153",
|
|
307
|
+
Eth: "\u00D0",
|
|
308
|
+
eth: "\u00F0",
|
|
309
|
+
Thorn: "\u00DE",
|
|
310
|
+
thorn: "\u00FE",
|
|
311
|
+
germandbls: "\u00DF",
|
|
312
|
+
dotlessi: "\u0131",
|
|
313
|
+
section: "\u00A7",
|
|
314
|
+
paragraph: "\u00B6",
|
|
315
|
+
currency: "\u00A4",
|
|
316
|
+
cent: "\u00A2",
|
|
317
|
+
sterling: "\u00A3",
|
|
318
|
+
yen: "\u00A5",
|
|
319
|
+
Euro: "\u20AC",
|
|
320
|
+
logicalnot: "\u00AC",
|
|
321
|
+
nbspace: "\u00A0",
|
|
322
|
+
};
|
|
98
323
|
/**
 * Resolve a PostScript glyph name to its Unicode text using ADOBE_GLYPH_MAP.
 *
 * Resolution order:
 *   1. Exact match on an own key of ADOBE_GLYPH_MAP (e.g. "fi" → U+FB01).
 *   2. The "uniXXXX" convention with exactly four hex digits
 *      (e.g. "uni00A0" → U+00A0). A zero code point is rejected.
 *   3. Underscore-separated composite names, where every component is
 *      resolved recursively and the results are concatenated
 *      (e.g. "f_i" → "f" + "i" = "fi"). If any component fails, the whole
 *      name is unresolved.
 *
 * @param {string} glyphName - Glyph name taken from the font's encoding data.
 * @returns {string|null} The resolved text, or null when the name is unknown.
 */
function resolveGlyphName(glyphName) {
    // Own-property lookup only: the `in` operator also matches names inherited
    // from Object.prototype ("constructor", "toString", ...), which would hand
    // a function back to the caller instead of text.
    if (Object.prototype.hasOwnProperty.call(ADOBE_GLYPH_MAP, glyphName)) {
        return ADOBE_GLYPH_MAP[glyphName];
    }
    // "uniXXXX": validate all four digits up front. parseInt() alone stops at
    // the first non-hex character, so "uni12GZ" would silently become U+0012.
    const uniMatch = /^uni([0-9A-Fa-f]{4})$/.exec(glyphName);
    if (uniMatch) {
        const code = Number.parseInt(uniMatch[1], 16);
        if (code > 0) {
            return String.fromCharCode(code);
        }
    }
    // Composite names such as "f_i" or "f_f_i": some fonts use this convention
    // instead of the standard ligature names.
    if (glyphName.includes("_")) {
        let text = "";
        for (const part of glyphName.split("_")) {
            const piece = resolveGlyphName(part);
            if (piece === null) {
                return null;
            }
            text += piece;
        }
        return text;
    }
    return null;
}
|
|
112
348
|
/**
 * Matches one buggy-font marker:
 *   :->|>_<glyphId>_<fontCharCode>@<glyphName>@<|<-:
 * Capture 1 is the glyphId digits, capture 2 the (possibly empty) glyph name.
 * The glyph name is delimited by "@" instead of "_" because some fonts use
 * glyph names that themselves contain underscores (e.g. "f_i" for "fi").
 */
const BUGGY_FONT_MARKER_RE = /:->\|>_(\d+)_\d+@([^@]*)@<\|<-:/g;
/**
 * Decode buggy font markers emitted by patched PDF.js.
 *
 * Resolution strategy, applied to every marker in the string:
 *   1. Resolve the glyph name through resolveGlyphName().
 *   2. Fall back to glyphId when it is in the printable ASCII range (32-126).
 *   3. Drop the marker if neither works (better than guessing).
 *
 * @param {string} str - Text that may contain buggy-font markers.
 * @returns {string} The text with every marker replaced by its decoded
 *   character(s), or removed when it cannot be decoded.
 */
function decodeBuggyFontMarkers(str) {
    return str.replace(BUGGY_FONT_MARKER_RE, (_match, glyphIdStr, glyphName) => {
        // Priority 1: resolve via the glyph name from font metadata.
        if (glyphName) {
            const resolved = resolveGlyphName(glyphName);
            // Accept only real, non-empty text. Anything else returned from a
            // replace() callback would be stringified into the output.
            if (typeof resolved === "string" && resolved !== "") {
                return resolved;
            }
        }
        // Priority 2: a glyphId in printable ASCII is used as the char code.
        const glyphId = Number.parseInt(glyphIdStr, 10);
        if (glyphId >= 32 && glyphId <= 126) {
            return String.fromCharCode(glyphId);
        }
        // Priority 3: drop unresolvable characters.
        return "";
    });
}
|
|
157
378
|
/**
|
|
158
|
-
*
|
|
159
|
-
* Returns the decoded string if a mapping produces valid-looking text,
|
|
160
|
-
* otherwise returns null to fall back to charCode decoding.
|
|
379
|
+
* Windows-1252 to Unicode mapping for the C1 control range (0x80-0x9F).
|
|
161
380
|
*
|
|
162
|
-
*
|
|
163
|
-
*
|
|
164
|
-
*
|
|
165
|
-
*
|
|
166
|
-
*
|
|
381
|
+
* Many PDFs encode smart quotes, em-dashes, and other typographic characters
|
|
382
|
+
* using Windows-1252 byte values. When PDF.js decodes these without a proper
|
|
383
|
+
* ToUnicode map, the raw byte values end up in the 0x80-0x9F range — which is
|
|
384
|
+
* technically the C1 control character block in Unicode. Rather than stripping
|
|
385
|
+
* them (which loses apostrophes, quotes, dashes, etc.), we map them to their
|
|
386
|
+
* correct Unicode equivalents.
|
|
167
387
|
*/
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
// Only return if we got a reasonable score (at least some digits, proper format)
|
|
214
|
-
if (bestResult && bestScore > 0) {
|
|
215
|
-
return bestResult;
|
|
216
|
-
}
|
|
217
|
-
return null;
|
|
218
|
-
}
|
|
388
|
+
const WINDOWS_1252_TO_UNICODE = {
|
|
389
|
+
0x80: "\u20AC", // €
|
|
390
|
+
0x82: "\u201A", // ‚
|
|
391
|
+
0x83: "\u0192", // ƒ
|
|
392
|
+
0x84: "\u201E", // „
|
|
393
|
+
0x85: "\u2026", // …
|
|
394
|
+
0x86: "\u2020", // †
|
|
395
|
+
0x87: "\u2021", // ‡
|
|
396
|
+
0x88: "\u02C6", // ˆ
|
|
397
|
+
0x89: "\u2030", // ‰
|
|
398
|
+
0x8a: "\u0160", // Š
|
|
399
|
+
0x8b: "\u2039", // ‹
|
|
400
|
+
0x8c: "\u0152", // Œ
|
|
401
|
+
0x8e: "\u017D", // Ž
|
|
402
|
+
0x91: "\u2018", // ‘ (left single quotation mark)
|
|
403
|
+
0x92: "\u2019", // ’ (right single quotation mark / apostrophe)
|
|
404
|
+
0x93: "\u201C", // “ (left double quotation mark)
|
|
405
|
+
0x94: "\u201D", // ” (right double quotation mark)
|
|
406
|
+
0x95: "\u2022", // •
|
|
407
|
+
0x96: "\u2013", // –
|
|
408
|
+
0x97: "\u2014", // —
|
|
409
|
+
0x98: "\u02DC", // ˜
|
|
410
|
+
0x99: "\u2122", // ™
|
|
411
|
+
0x9a: "\u0161", // š
|
|
412
|
+
0x9b: "\u203A", // ›
|
|
413
|
+
0x9c: "\u0153", // œ
|
|
414
|
+
0x9e: "\u017E", // ž
|
|
415
|
+
0x9f: "\u0178", // Ÿ
|
|
416
|
+
};
|
|
417
|
+
/**
|
|
418
|
+
* Unicode ligature decomposition map.
|
|
419
|
+
* PDF fonts often use ligature glyphs; decomposing them to plain ASCII
|
|
420
|
+
* ensures the text is searchable and NLP-friendly.
|
|
421
|
+
*/
|
|
422
|
+
const LIGATURE_MAP = {
|
|
423
|
+
"\uFB00": "ff",
|
|
424
|
+
"\uFB01": "fi",
|
|
425
|
+
"\uFB02": "fl",
|
|
426
|
+
"\uFB03": "ffi",
|
|
427
|
+
"\uFB04": "ffl",
|
|
428
|
+
"\uFB05": "st",
|
|
429
|
+
"\uFB06": "st",
|
|
430
|
+
};
|
|
219
431
|
/**
|
|
220
|
-
* Strip C0
|
|
221
|
-
*
|
|
222
|
-
*
|
|
432
|
+
* Strip C0 control characters from text (except common whitespace),
|
|
433
|
+
* map C1 control range (0x80-0x9F) to proper Unicode via Windows-1252,
|
|
434
|
+
* and decompose Unicode ligatures to plain text.
|
|
223
435
|
*/
|
|
224
436
|
function stripControlChars(str) {
|
|
225
437
|
let result = "";
|
|
226
438
|
for (const char of str) {
|
|
227
439
|
const code = char.charCodeAt(0);
|
|
228
|
-
//
|
|
229
|
-
if (
|
|
230
|
-
|
|
440
|
+
// Decompose Unicode ligatures (fi, fl, ff, ffi, ffl, st)
|
|
441
|
+
if (LIGATURE_MAP[char]) {
|
|
442
|
+
result += LIGATURE_MAP[char];
|
|
443
|
+
continue;
|
|
444
|
+
}
|
|
445
|
+
// Map Windows-1252 C1 range to proper Unicode (smart quotes, em-dashes, etc.)
|
|
446
|
+
if (code >= 0x80 && code <= 0x9f) {
|
|
447
|
+
const mapped = WINDOWS_1252_TO_UNICODE[code];
|
|
448
|
+
if (mapped) {
|
|
449
|
+
result += mapped;
|
|
450
|
+
}
|
|
451
|
+
// Undefined Windows-1252 positions (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no table entry and are dropped
|
|
452
|
+
continue;
|
|
453
|
+
}
|
|
454
|
+
// Skip C0 controls (except tab, newline, carriage return)
|
|
455
|
+
if (code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) {
|
|
231
456
|
continue;
|
|
232
457
|
}
|
|
233
458
|
result += char;
|
|
@@ -260,11 +485,19 @@ function isGarbledFontOutput(str) {
|
|
|
260
485
|
for (const char of str) {
|
|
261
486
|
const code = char.charCodeAt(0);
|
|
262
487
|
// C0 control characters (0x00-0x1F) except common whitespace (tab, newline, carriage return)
|
|
263
|
-
|
|
264
|
-
if ((code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) ||
|
|
265
|
-
(code >= 0x80 && code <= 0x9f)) {
|
|
488
|
+
if (code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) {
|
|
266
489
|
controlCharCount++;
|
|
267
490
|
}
|
|
491
|
+
// C1 range (0x80-0x9F): only count as control chars if NOT a valid Windows-1252 character.
|
|
492
|
+
// Many PDFs use Windows-1252 encoding for smart quotes, em-dashes, etc.
|
|
493
|
+
else if (code >= 0x80 && code <= 0x9f) {
|
|
494
|
+
if (WINDOWS_1252_TO_UNICODE[code]) {
|
|
495
|
+
normalCharCount++; // Valid Windows-1252 char (smart quote, dash, etc.)
|
|
496
|
+
}
|
|
497
|
+
else {
|
|
498
|
+
controlCharCount++; // Undefined C1 position — likely garbled
|
|
499
|
+
}
|
|
500
|
+
}
|
|
268
501
|
// Private Use Area (U+E000-U+F8FF) - almost always garbled
|
|
269
502
|
else if (code >= 0xe000 && code <= 0xf8ff) {
|
|
270
503
|
privateUseCount++;
|
|
@@ -385,7 +618,7 @@ export class PdfJsEngine {
|
|
|
385
618
|
_pdfDocument: pdfDocument,
|
|
386
619
|
};
|
|
387
620
|
}
|
|
388
|
-
async extractPage(doc, pageNum) {
|
|
621
|
+
async extractPage(doc, pageNum, options) {
|
|
389
622
|
const pdfDocument = doc._pdfDocument;
|
|
390
623
|
const page = await pdfDocument.getPage(pageNum);
|
|
391
624
|
// Get viewport
|
|
@@ -406,18 +639,9 @@ export class PdfJsEngine {
|
|
|
406
639
|
const cm = multiplyMatrices(viewportTransform, item.transform);
|
|
407
640
|
// Get lower-left corner (text space origin)
|
|
408
641
|
const ll = applyTransformation({ x: 0, y: 0 }, cm);
|
|
409
|
-
// Extract scale factors directly from matrix components (not SVD).
|
|
410
|
-
// For matrix [a, b, c, d, tx, ty]:
|
|
411
|
-
// - Horizontal scale = sqrt(a² + b²)
|
|
412
|
-
// - Vertical scale = sqrt(c² + d²)
|
|
413
|
-
// This correctly preserves axis association unlike SVD which returns
|
|
414
|
-
// singular values sorted by magnitude (causing x/y swap for some fonts).
|
|
415
642
|
const scaleX = Math.sqrt(item.transform[0] ** 2 + item.transform[1] ** 2);
|
|
416
643
|
const scaleY = Math.sqrt(item.transform[2] ** 2 + item.transform[3] ** 2);
|
|
417
|
-
// Get upper-right corner by first converting width/height to text space
|
|
418
|
-
// (dividing by the scale factors), then transforming to viewport space
|
|
419
644
|
const ur = applyTransformation({ x: item.width / scaleX, y: item.height / scaleY }, cm);
|
|
420
|
-
// Calculate final bounding box in viewport space
|
|
421
645
|
const left = Math.min(ll.x, ur.x);
|
|
422
646
|
const right = Math.max(ll.x, ur.x);
|
|
423
647
|
const top = Math.min(ll.y, ur.y);
|
|
@@ -427,44 +651,30 @@ export class PdfJsEngine {
|
|
|
427
651
|
continue;
|
|
428
652
|
const width = right - left;
|
|
429
653
|
const height = bottom - top;
|
|
430
|
-
//
|
|
654
|
+
// Get rotation angle from the transformation matrix
|
|
431
655
|
let rotation = getRotation(cm);
|
|
432
|
-
|
|
433
|
-
if (rotation < 0) {
|
|
656
|
+
if (rotation < 0)
|
|
434
657
|
rotation += 360;
|
|
435
|
-
|
|
436
|
-
// Decode buggy font markers from PDF.js (only if marker is present)
|
|
437
|
-
// Format: :->|>_<charCode>_<fontChar>_<|<-:
|
|
658
|
+
// Decode buggy font markers using glyph names from font metadata
|
|
438
659
|
let decodedStr = item.str;
|
|
439
660
|
if (decodedStr.includes(BUGGY_FONT_MARKER_CHECK)) {
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
if (tabularDecoded) {
|
|
443
|
-
decodedStr = tabularDecoded;
|
|
444
|
-
}
|
|
445
|
-
else {
|
|
446
|
-
// Fall back to original approach: use glyph ID as character code
|
|
447
|
-
BUGGY_FONT_MARKER_REGEX.lastIndex = 0; // Reset regex state
|
|
448
|
-
decodedStr = decodedStr.replace(BUGGY_FONT_MARKER_REGEX, (_, charCode) => String.fromCharCode(parseInt(charCode)));
|
|
449
|
-
}
|
|
661
|
+
BUGGY_FONT_MARKER_RE.lastIndex = 0;
|
|
662
|
+
decodedStr = decodeBuggyFontMarkers(decodedStr);
|
|
450
663
|
}
|
|
451
664
|
// Handle pipe-separated characters: " |a| |r| |X| " -> "arX"
|
|
452
|
-
// Some PDFs encode text with characters separated by pipes and spaces
|
|
453
665
|
if (decodedStr.includes("|")) {
|
|
454
|
-
PIPE_PATTERN_REGEX.lastIndex = 0;
|
|
666
|
+
PIPE_PATTERN_REGEX.lastIndex = 0;
|
|
455
667
|
const matches = [...decodedStr.matchAll(PIPE_PATTERN_REGEX)];
|
|
456
668
|
if (matches.length > 0) {
|
|
457
669
|
decodedStr = matches.map((m) => m[1]).join("");
|
|
458
670
|
}
|
|
459
671
|
}
|
|
460
672
|
// Skip garbled text from fonts with corrupted ToUnicode mappings
|
|
461
|
-
// Save the bounding box so OCR can fill in these specific regions
|
|
462
673
|
if (isGarbledFontOutput(decodedStr)) {
|
|
463
674
|
garbledTextRegions.push({ x: left, y: top, width, height });
|
|
464
675
|
continue;
|
|
465
676
|
}
|
|
466
|
-
// Strip
|
|
467
|
-
// (e.g., form feed chars that sneak into ligatures like "fi")
|
|
677
|
+
// Strip remaining control characters, map Windows-1252, decompose ligatures
|
|
468
678
|
decodedStr = stripControlChars(decodedStr);
|
|
469
679
|
textItems.push({
|
|
470
680
|
str: decodedStr,
|
|
@@ -481,22 +691,24 @@ export class PdfJsEngine {
|
|
|
481
691
|
});
|
|
482
692
|
}
|
|
483
693
|
let images = [];
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
this.pdfiumRenderer
|
|
488
|
-
|
|
694
|
+
if (options?.extractImages !== false) {
|
|
695
|
+
try {
|
|
696
|
+
const pdfInput = this.currentPdfPath || this.currentPdfData || doc.data;
|
|
697
|
+
if (!this.pdfiumRenderer) {
|
|
698
|
+
this.pdfiumRenderer = new PdfiumRenderer();
|
|
699
|
+
await this.pdfiumRenderer.loadDocument(pdfInput);
|
|
700
|
+
}
|
|
701
|
+
const imageBounds = await this.pdfiumRenderer.extractImageBounds(pdfInput, pageNum);
|
|
702
|
+
images = imageBounds.map((bounds) => ({
|
|
703
|
+
x: bounds.x,
|
|
704
|
+
y: bounds.y,
|
|
705
|
+
width: bounds.width,
|
|
706
|
+
height: bounds.height,
|
|
707
|
+
}));
|
|
708
|
+
}
|
|
709
|
+
catch {
|
|
710
|
+
// Image extraction is best-effort
|
|
489
711
|
}
|
|
490
|
-
const imageBounds = await this.pdfiumRenderer.extractImageBounds(pdfInput, pageNum);
|
|
491
|
-
images = imageBounds.map((bounds) => ({
|
|
492
|
-
x: bounds.x,
|
|
493
|
-
y: bounds.y,
|
|
494
|
-
width: bounds.width,
|
|
495
|
-
height: bounds.height,
|
|
496
|
-
}));
|
|
497
|
-
}
|
|
498
|
-
catch {
|
|
499
|
-
// Image extraction is best-effort
|
|
500
712
|
}
|
|
501
713
|
// Skip annotation extraction - not currently used in processing pipeline
|
|
502
714
|
// Can be re-enabled if needed for link extraction, etc.
|
|
@@ -512,7 +724,7 @@ export class PdfJsEngine {
|
|
|
512
724
|
garbledTextRegions: garbledTextRegions.length > 0 ? garbledTextRegions : undefined,
|
|
513
725
|
};
|
|
514
726
|
}
|
|
515
|
-
async extractAllPages(doc, maxPages, targetPages) {
|
|
727
|
+
async extractAllPages(doc, maxPages, targetPages, options) {
|
|
516
728
|
const numPages = Math.min(doc.numPages, maxPages || doc.numPages);
|
|
517
729
|
const pages = [];
|
|
518
730
|
// Parse target pages if specified
|
|
@@ -527,7 +739,7 @@ export class PdfJsEngine {
|
|
|
527
739
|
if (maxPages && pages.length >= maxPages) {
|
|
528
740
|
break;
|
|
529
741
|
}
|
|
530
|
-
const pageData = await this.extractPage(doc, pageNum);
|
|
742
|
+
const pageData = await this.extractPage(doc, pageNum, options);
|
|
531
743
|
pages.push(pageData);
|
|
532
744
|
}
|
|
533
745
|
return pages;
|