@elizaos/plugin-pdf 2.0.0-alpha.11 → 2.0.0-alpha.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/index.browser.js +5 -27
- package/dist/browser/index.browser.js.map +4 -5
- package/dist/cjs/index.node.cjs +112 -1628
- package/dist/cjs/index.node.js.map +4 -21
- package/dist/node/index.node.js +83 -1631
- package/dist/node/index.node.js.map +4 -21
- package/dist/services/pdf.d.ts +3 -2
- package/dist/services/pdf.d.ts.map +1 -1
- package/package.json +8 -10
- package/dist/browser/index.d.ts +0 -2
- package/dist/cjs/index.d.ts +0 -2
- package/dist/node/index.d.ts +0 -2
- package/dist/tsconfig.tsbuildinfo +0 -1
package/dist/cjs/index.node.cjs
CHANGED
|
@@ -4,38 +4,59 @@ var __defProp = Object.defineProperty;
|
|
|
4
4
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
5
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
6
6
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
function __accessProp(key) {
|
|
8
|
+
return this[key];
|
|
9
|
+
}
|
|
10
|
+
var __toESMCache_node;
|
|
11
|
+
var __toESMCache_esm;
|
|
7
12
|
var __toESM = (mod, isNodeMode, target) => {
|
|
13
|
+
var canCache = mod != null && typeof mod === "object";
|
|
14
|
+
if (canCache) {
|
|
15
|
+
var cache = isNodeMode ? __toESMCache_node ??= new WeakMap : __toESMCache_esm ??= new WeakMap;
|
|
16
|
+
var cached = cache.get(mod);
|
|
17
|
+
if (cached)
|
|
18
|
+
return cached;
|
|
19
|
+
}
|
|
8
20
|
target = mod != null ? __create(__getProtoOf(mod)) : {};
|
|
9
21
|
const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
|
|
10
22
|
for (let key of __getOwnPropNames(mod))
|
|
11
23
|
if (!__hasOwnProp.call(to, key))
|
|
12
24
|
__defProp(to, key, {
|
|
13
|
-
get: (
|
|
25
|
+
get: __accessProp.bind(mod, key),
|
|
14
26
|
enumerable: true
|
|
15
27
|
});
|
|
28
|
+
if (canCache)
|
|
29
|
+
cache.set(mod, to);
|
|
16
30
|
return to;
|
|
17
31
|
};
|
|
18
|
-
var __moduleCache = /* @__PURE__ */ new WeakMap;
|
|
19
32
|
var __toCommonJS = (from) => {
|
|
20
|
-
var entry = __moduleCache.get(from), desc;
|
|
33
|
+
var entry = (__moduleCache ??= new WeakMap).get(from), desc;
|
|
21
34
|
if (entry)
|
|
22
35
|
return entry;
|
|
23
36
|
entry = __defProp({}, "__esModule", { value: true });
|
|
24
|
-
if (from && typeof from === "object" || typeof from === "function")
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
37
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
38
|
+
for (var key of __getOwnPropNames(from))
|
|
39
|
+
if (!__hasOwnProp.call(entry, key))
|
|
40
|
+
__defProp(entry, key, {
|
|
41
|
+
get: __accessProp.bind(from, key),
|
|
42
|
+
enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
|
|
43
|
+
});
|
|
44
|
+
}
|
|
29
45
|
__moduleCache.set(from, entry);
|
|
30
46
|
return entry;
|
|
31
47
|
};
|
|
48
|
+
var __moduleCache;
|
|
49
|
+
var __returnValue = (v) => v;
|
|
50
|
+
function __exportSetter(name, newValue) {
|
|
51
|
+
this[name] = __returnValue.bind(null, newValue);
|
|
52
|
+
}
|
|
32
53
|
var __export = (target, all) => {
|
|
33
54
|
for (var name in all)
|
|
34
55
|
__defProp(target, name, {
|
|
35
56
|
get: all[name],
|
|
36
57
|
enumerable: true,
|
|
37
58
|
configurable: true,
|
|
38
|
-
set: (
|
|
59
|
+
set: __exportSetter.bind(all, name)
|
|
39
60
|
});
|
|
40
61
|
};
|
|
41
62
|
|
|
@@ -50,1630 +71,69 @@ module.exports = __toCommonJS(exports_index_node);
|
|
|
50
71
|
|
|
51
72
|
// services/pdf.ts
|
|
52
73
|
var import_core = require("@elizaos/core");
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
getException: () => getException,
|
|
58
|
-
VerbosityLevel: () => import_pdf.VerbosityLevel,
|
|
59
|
-
UnknownErrorException: () => UnknownErrorException,
|
|
60
|
-
Table: () => Table,
|
|
61
|
-
Shape: () => Shape,
|
|
62
|
-
ResponseException: () => ResponseException,
|
|
63
|
-
Rectangle: () => Rectangle,
|
|
64
|
-
Point: () => Point,
|
|
65
|
-
PasswordException: () => PasswordException,
|
|
66
|
-
PDFParse: () => PDFParse,
|
|
67
|
-
LineStore: () => LineStore,
|
|
68
|
-
LineDirection: () => LineDirection,
|
|
69
|
-
Line: () => Line,
|
|
70
|
-
InvalidPDFException: () => InvalidPDFException,
|
|
71
|
-
FormatError: () => FormatError,
|
|
72
|
-
AbortException: () => AbortException
|
|
73
|
-
});
|
|
74
|
-
|
|
75
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js
|
|
76
|
-
var pdfjs2 = __toESM(require("pdfjs-dist/legacy/build/pdf.mjs"));
|
|
77
|
-
|
|
78
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/Exception.js
|
|
79
|
-
class InvalidPDFException extends Error {
|
|
80
|
-
constructor(message, cause) {
|
|
81
|
-
if (cause !== undefined) {
|
|
82
|
-
super(message ?? "Invalid PDF", { cause });
|
|
83
|
-
} else {
|
|
84
|
-
super(message ?? "Invalid PDF");
|
|
85
|
-
}
|
|
86
|
-
this.name = "InvalidPDFException";
|
|
87
|
-
Object.setPrototypeOf(this, InvalidPDFException.prototype);
|
|
88
|
-
if (typeof Error.captureStackTrace === "function") {
|
|
89
|
-
Error.captureStackTrace(this, InvalidPDFException);
|
|
90
|
-
}
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
class PasswordException extends Error {
|
|
95
|
-
constructor(message, cause) {
|
|
96
|
-
if (cause !== undefined) {
|
|
97
|
-
super(message ?? "Password required or incorrect", { cause });
|
|
98
|
-
} else {
|
|
99
|
-
super(message ?? "Password required or incorrect");
|
|
100
|
-
}
|
|
101
|
-
this.name = "PasswordException";
|
|
102
|
-
Object.setPrototypeOf(this, PasswordException.prototype);
|
|
103
|
-
if (typeof Error.captureStackTrace === "function") {
|
|
104
|
-
Error.captureStackTrace(this, PasswordException);
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
class FormatError extends Error {
|
|
110
|
-
constructor(message, cause) {
|
|
111
|
-
if (cause !== undefined) {
|
|
112
|
-
super(message ?? "PDF format error", { cause });
|
|
113
|
-
} else {
|
|
114
|
-
super(message ?? "PDF format error");
|
|
115
|
-
}
|
|
116
|
-
this.name = "FormatError";
|
|
117
|
-
Object.setPrototypeOf(this, FormatError.prototype);
|
|
118
|
-
if (typeof Error.captureStackTrace === "function") {
|
|
119
|
-
Error.captureStackTrace(this, FormatError);
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
class UnknownErrorException extends Error {
|
|
125
|
-
constructor(message, details, cause) {
|
|
126
|
-
if (cause !== undefined) {
|
|
127
|
-
super(message ?? "Unknown error", { cause });
|
|
128
|
-
} else {
|
|
129
|
-
super(message ?? "Unknown error");
|
|
130
|
-
}
|
|
131
|
-
this.name = "UnknownErrorException";
|
|
132
|
-
Object.setPrototypeOf(this, UnknownErrorException.prototype);
|
|
133
|
-
if (typeof Error.captureStackTrace === "function") {
|
|
134
|
-
Error.captureStackTrace(this, UnknownErrorException);
|
|
135
|
-
}
|
|
136
|
-
this.details = details;
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
class ResponseException extends Error {
|
|
141
|
-
constructor(message, status, missing, cause) {
|
|
142
|
-
if (cause !== undefined) {
|
|
143
|
-
super(message ?? "Response error", { cause });
|
|
144
|
-
} else {
|
|
145
|
-
super(message ?? "Response error");
|
|
146
|
-
}
|
|
147
|
-
this.name = "ResponseException";
|
|
148
|
-
Object.setPrototypeOf(this, ResponseException.prototype);
|
|
149
|
-
if (typeof Error.captureStackTrace === "function") {
|
|
150
|
-
Error.captureStackTrace(this, ResponseException);
|
|
151
|
-
}
|
|
152
|
-
this.status = status;
|
|
153
|
-
this.missing = missing;
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
class AbortException extends Error {
|
|
158
|
-
constructor(message, cause) {
|
|
159
|
-
if (cause !== undefined) {
|
|
160
|
-
super(message ?? "Operation aborted", { cause });
|
|
161
|
-
} else {
|
|
162
|
-
super(message ?? "Operation aborted");
|
|
163
|
-
}
|
|
164
|
-
this.name = "AbortException";
|
|
165
|
-
Object.setPrototypeOf(this, AbortException.prototype);
|
|
166
|
-
if (typeof Error.captureStackTrace === "function") {
|
|
167
|
-
Error.captureStackTrace(this, AbortException);
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
function getException(error) {
|
|
172
|
-
if (error instanceof Error) {
|
|
173
|
-
switch (error.name) {
|
|
174
|
-
case "InvalidPDFException":
|
|
175
|
-
return new InvalidPDFException(error.message, error);
|
|
176
|
-
case "PasswordException":
|
|
177
|
-
return new PasswordException(error.message, error);
|
|
178
|
-
case "FormatError":
|
|
179
|
-
return new FormatError(error.message, error);
|
|
180
|
-
case "UnknownErrorException":
|
|
181
|
-
return new UnknownErrorException(error.message, error.details, error);
|
|
182
|
-
case "ResponseException":
|
|
183
|
-
return new ResponseException(error.message, error.status, error.missing, error);
|
|
184
|
-
case "AbortException":
|
|
185
|
-
return new AbortException(error.message, error);
|
|
186
|
-
default:
|
|
187
|
-
return error;
|
|
188
|
-
}
|
|
189
|
-
}
|
|
190
|
-
return new Error(String(error));
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Shape.js
|
|
194
|
-
class Shape {
|
|
195
|
-
static tolerance = 2;
|
|
196
|
-
static applyTransform(p, m) {
|
|
197
|
-
const xt = p[0] * m[0] + p[1] * m[2] + m[4];
|
|
198
|
-
const yt = p[0] * m[1] + p[1] * m[3] + m[5];
|
|
199
|
-
return [xt, yt];
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Point.js
|
|
204
|
-
class Point extends Shape {
|
|
205
|
-
x;
|
|
206
|
-
y;
|
|
207
|
-
constructor(x, y) {
|
|
208
|
-
super();
|
|
209
|
-
this.x = x;
|
|
210
|
-
this.y = y;
|
|
211
|
-
}
|
|
212
|
-
equal(point) {
|
|
213
|
-
return point.x === this.x && point.y === this.y;
|
|
214
|
-
}
|
|
215
|
-
transform(matrix) {
|
|
216
|
-
const p = Shape.applyTransform([this.x, this.y], matrix);
|
|
217
|
-
this.x = p[0];
|
|
218
|
-
this.y = p[1];
|
|
219
|
-
return this;
|
|
220
|
-
}
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Line.js
|
|
224
|
-
var LineDirection;
|
|
225
|
-
(function(LineDirection2) {
|
|
226
|
-
LineDirection2[LineDirection2["None"] = 0] = "None";
|
|
227
|
-
LineDirection2[LineDirection2["Horizontal"] = 1] = "Horizontal";
|
|
228
|
-
LineDirection2[LineDirection2["Vertical"] = 2] = "Vertical";
|
|
229
|
-
})(LineDirection || (LineDirection = {}));
|
|
230
|
-
|
|
231
|
-
class Line extends Shape {
|
|
232
|
-
from;
|
|
233
|
-
to;
|
|
234
|
-
direction = LineDirection.None;
|
|
235
|
-
length = 0;
|
|
236
|
-
intersections = [];
|
|
237
|
-
gaps = [];
|
|
238
|
-
constructor(from, to) {
|
|
239
|
-
super();
|
|
240
|
-
this.from = from;
|
|
241
|
-
this.to = to;
|
|
242
|
-
this.init();
|
|
243
|
-
}
|
|
244
|
-
init() {
|
|
245
|
-
let from = this.from;
|
|
246
|
-
let to = this.to;
|
|
247
|
-
if (Math.abs(from.y - to.y) < Shape.tolerance) {
|
|
248
|
-
this.direction = LineDirection.Horizontal;
|
|
249
|
-
to.y = from.y;
|
|
250
|
-
if (from.x > to.x) {
|
|
251
|
-
const temp = from;
|
|
252
|
-
from = to;
|
|
253
|
-
to = temp;
|
|
254
|
-
}
|
|
255
|
-
this.length = to.x - from.x;
|
|
256
|
-
} else if (Math.abs(from.x - to.x) < Shape.tolerance) {
|
|
257
|
-
this.direction = LineDirection.Vertical;
|
|
258
|
-
to.x = from.x;
|
|
259
|
-
if (from.y > to.y) {
|
|
260
|
-
const temp = from;
|
|
261
|
-
from = to;
|
|
262
|
-
to = temp;
|
|
263
|
-
}
|
|
264
|
-
this.length = to.y - from.y;
|
|
265
|
-
}
|
|
266
|
-
this.from = from;
|
|
267
|
-
this.to = to;
|
|
268
|
-
}
|
|
269
|
-
_valid = undefined;
|
|
270
|
-
get valid() {
|
|
271
|
-
if (this._valid === undefined) {
|
|
272
|
-
this._valid = this.direction !== LineDirection.None && this.length > Shape.tolerance;
|
|
273
|
-
}
|
|
274
|
-
return this._valid;
|
|
275
|
-
}
|
|
276
|
-
get normalized() {
|
|
277
|
-
if (this.direction === LineDirection.Horizontal) {
|
|
278
|
-
return new Line(new Point(this.from.x - Shape.tolerance, this.from.y), new Point(this.to.x + Shape.tolerance, this.from.y));
|
|
279
|
-
} else if (this.direction === LineDirection.Vertical) {
|
|
280
|
-
return new Line(new Point(this.from.x, this.from.y - Shape.tolerance), new Point(this.from.x, this.to.y + Shape.tolerance));
|
|
281
|
-
}
|
|
282
|
-
return this;
|
|
283
|
-
}
|
|
284
|
-
addGap(line) {
|
|
285
|
-
this.gaps.push(line);
|
|
286
|
-
}
|
|
287
|
-
containsPoint(p) {
|
|
288
|
-
if (this.direction === LineDirection.Vertical) {
|
|
289
|
-
return this.from.x === p.x && p.y >= this.from.y && p.y <= this.to.y;
|
|
290
|
-
} else if (this.direction === LineDirection.Horizontal) {
|
|
291
|
-
return this.from.y === p.y && p.x >= this.from.x && p.x <= this.to.x;
|
|
292
|
-
}
|
|
293
|
-
return false;
|
|
294
|
-
}
|
|
295
|
-
addIntersectionPoint(point) {
|
|
296
|
-
for (const intPoint of this.intersections) {
|
|
297
|
-
if (intPoint.equal(point))
|
|
298
|
-
return;
|
|
299
|
-
}
|
|
300
|
-
this.intersections.push(point);
|
|
301
|
-
}
|
|
302
|
-
intersection(line) {
|
|
303
|
-
let result;
|
|
304
|
-
if (!this.valid || !line.valid) {
|
|
305
|
-
return result;
|
|
306
|
-
}
|
|
307
|
-
const thisNormalized = this.normalized;
|
|
308
|
-
const lineNormalized = line.normalized;
|
|
309
|
-
if (this.direction === LineDirection.Horizontal && line.direction === LineDirection.Vertical) {
|
|
310
|
-
const x = lineNormalized.from.x;
|
|
311
|
-
const y = thisNormalized.from.y;
|
|
312
|
-
const isOk = x > thisNormalized.from.x && x < thisNormalized.to.x && y > lineNormalized.from.y && y < lineNormalized.to.y;
|
|
313
|
-
if (isOk) {
|
|
314
|
-
const intPoint = new Point(x, y);
|
|
315
|
-
this.addIntersectionPoint(intPoint);
|
|
316
|
-
line.addIntersectionPoint(intPoint);
|
|
317
|
-
result = intPoint;
|
|
318
|
-
}
|
|
319
|
-
} else if (this.direction === LineDirection.Vertical && line.direction === LineDirection.Horizontal) {
|
|
320
|
-
const x = thisNormalized.from.x;
|
|
321
|
-
const y = lineNormalized.from.y;
|
|
322
|
-
const isOk = x > lineNormalized.from.x && x < lineNormalized.to.x && y > thisNormalized.from.y && y < thisNormalized.to.y;
|
|
323
|
-
if (isOk) {
|
|
324
|
-
const intPoint = new Point(x, y);
|
|
325
|
-
this.addIntersectionPoint(intPoint);
|
|
326
|
-
line.addIntersectionPoint(intPoint);
|
|
327
|
-
result = intPoint;
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
return result;
|
|
331
|
-
}
|
|
332
|
-
transform(matrix) {
|
|
333
|
-
const p1 = this.from.transform(matrix);
|
|
334
|
-
const p2 = this.to.transform(matrix);
|
|
335
|
-
const x = Math.min(p1.x, p2.x);
|
|
336
|
-
const y = Math.min(p1.y, p2.y);
|
|
337
|
-
const width = Math.abs(p1.x - p2.x);
|
|
338
|
-
const height = Math.abs(p1.y - p2.y);
|
|
339
|
-
this.from = new Point(x, y);
|
|
340
|
-
this.to = new Point(x + width, y + height);
|
|
341
|
-
this.init();
|
|
342
|
-
return this;
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/TableData.js
|
|
346
|
-
class TableData {
|
|
347
|
-
minXY;
|
|
348
|
-
maxXY;
|
|
349
|
-
rows;
|
|
350
|
-
rowPivots;
|
|
351
|
-
colPivots;
|
|
352
|
-
constructor(minXY, maxXY, rowPivots, colPivots) {
|
|
353
|
-
this.minXY = minXY;
|
|
354
|
-
this.maxXY = maxXY;
|
|
355
|
-
this.rows = [];
|
|
356
|
-
this.rowPivots = rowPivots;
|
|
357
|
-
this.colPivots = colPivots;
|
|
358
|
-
}
|
|
359
|
-
findCell(x, y) {
|
|
360
|
-
if (x >= this.minXY.x && y >= this.minXY.y && x <= this.maxXY.x && y <= this.maxXY.y) {
|
|
361
|
-
for (const row of this.rows) {
|
|
362
|
-
for (const cell of row) {
|
|
363
|
-
if (cell.minXY.x <= x && cell.minXY.y <= y && cell.maxXY.x >= x && cell.maxXY.y >= y) {
|
|
364
|
-
return cell;
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
}
|
|
368
|
-
}
|
|
369
|
-
return;
|
|
370
|
-
}
|
|
371
|
-
get cellCount() {
|
|
372
|
-
return this.rows.reduce((acc, row) => acc + row.length, 0);
|
|
373
|
-
}
|
|
374
|
-
get rowCount() {
|
|
375
|
-
return this.rows.length;
|
|
376
|
-
}
|
|
377
|
-
check() {
|
|
378
|
-
const virtualCellCount = (this.colPivots.length - 1) * (this.rowPivots.length - 1);
|
|
379
|
-
let allCellCount = 0;
|
|
380
|
-
for (const row of this.rows) {
|
|
381
|
-
for (const cell of row) {
|
|
382
|
-
const count = (cell.colspan || 1) * (cell.rowspan || 1);
|
|
383
|
-
allCellCount += count;
|
|
384
|
-
}
|
|
385
|
-
}
|
|
386
|
-
if (virtualCellCount !== allCellCount) {
|
|
387
|
-
return false;
|
|
388
|
-
}
|
|
389
|
-
return true;
|
|
390
|
-
}
|
|
391
|
-
toArray() {
|
|
392
|
-
const tableArr = [];
|
|
393
|
-
for (const row of this.rows) {
|
|
394
|
-
const rowArr = [];
|
|
395
|
-
for (const cell of row) {
|
|
396
|
-
let text = cell.text.join("");
|
|
397
|
-
text = text.replace(/^[\s]+|[\s]+$/g, "");
|
|
398
|
-
text = text.trim();
|
|
399
|
-
rowArr.push(text);
|
|
400
|
-
}
|
|
401
|
-
tableArr.push(rowArr);
|
|
402
|
-
}
|
|
403
|
-
return tableArr;
|
|
404
|
-
}
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Table.js
|
|
408
|
-
class Table {
|
|
409
|
-
hLines = [];
|
|
410
|
-
vLines = [];
|
|
411
|
-
constructor(line) {
|
|
412
|
-
if (line.direction === LineDirection.Horizontal) {
|
|
413
|
-
this.hLines.push(line);
|
|
414
|
-
} else if (line.direction === LineDirection.Vertical) {
|
|
415
|
-
this.vLines.push(line);
|
|
416
|
-
}
|
|
417
|
-
}
|
|
418
|
-
get isValid() {
|
|
419
|
-
return this.hLines.length + this.vLines.length > 4;
|
|
420
|
-
}
|
|
421
|
-
get rowPivots() {
|
|
422
|
-
const rowSet = new Set;
|
|
423
|
-
for (const line of this.hLines) {
|
|
424
|
-
rowSet.add(line.from.y);
|
|
425
|
-
}
|
|
426
|
-
return [...rowSet].sort((a, b) => a - b);
|
|
427
|
-
}
|
|
428
|
-
get colPivots() {
|
|
429
|
-
const colSet = new Set;
|
|
430
|
-
for (const line of this.vLines) {
|
|
431
|
-
colSet.add(line.from.x);
|
|
432
|
-
}
|
|
433
|
-
return [...colSet].sort((a, b) => a - b);
|
|
434
|
-
}
|
|
435
|
-
add(line) {
|
|
436
|
-
const hasIntersection = this.intersection(line);
|
|
437
|
-
if (hasIntersection) {
|
|
438
|
-
if (line.direction === LineDirection.Horizontal) {
|
|
439
|
-
this.hLines.push(line);
|
|
440
|
-
return true;
|
|
441
|
-
} else if (line.direction === LineDirection.Vertical) {
|
|
442
|
-
this.vLines.push(line);
|
|
443
|
-
return true;
|
|
444
|
-
}
|
|
445
|
-
}
|
|
446
|
-
return false;
|
|
447
|
-
}
|
|
448
|
-
intersection(line) {
|
|
449
|
-
let flag = false;
|
|
450
|
-
if (!line.valid)
|
|
451
|
-
return flag;
|
|
452
|
-
if (line.direction === LineDirection.Horizontal) {
|
|
453
|
-
for (const vLine of this.vLines) {
|
|
454
|
-
const p = line.intersection(vLine);
|
|
455
|
-
if (p) {
|
|
456
|
-
flag = true;
|
|
457
|
-
}
|
|
458
|
-
}
|
|
459
|
-
} else if (line.direction === LineDirection.Vertical) {
|
|
460
|
-
for (const hLine of this.hLines) {
|
|
461
|
-
const p = line.intersection(hLine);
|
|
462
|
-
if (p) {
|
|
463
|
-
flag = true;
|
|
464
|
-
}
|
|
465
|
-
}
|
|
466
|
-
}
|
|
467
|
-
return flag;
|
|
468
|
-
}
|
|
469
|
-
getSameHorizontal(line) {
|
|
470
|
-
const same = [line];
|
|
471
|
-
const other = [];
|
|
472
|
-
while (this.hLines.length > 0) {
|
|
473
|
-
const hLine = this.hLines.shift();
|
|
474
|
-
if (!hLine)
|
|
475
|
-
continue;
|
|
476
|
-
if (hLine.from.y === line.from.y) {
|
|
477
|
-
same.push(hLine);
|
|
478
|
-
} else {
|
|
479
|
-
other.push(hLine);
|
|
480
|
-
}
|
|
481
|
-
}
|
|
482
|
-
this.hLines = other;
|
|
483
|
-
return same;
|
|
484
|
-
}
|
|
485
|
-
getSameVertical(line) {
|
|
486
|
-
const same = [line];
|
|
487
|
-
const other = [];
|
|
488
|
-
while (this.vLines.length > 0) {
|
|
489
|
-
const vLine = this.vLines.shift();
|
|
490
|
-
if (!vLine)
|
|
491
|
-
continue;
|
|
492
|
-
if (vLine.from.x === line.from.x) {
|
|
493
|
-
same.push(vLine);
|
|
494
|
-
} else {
|
|
495
|
-
other.push(vLine);
|
|
496
|
-
}
|
|
497
|
-
}
|
|
498
|
-
this.vLines = other;
|
|
499
|
-
return same;
|
|
500
|
-
}
|
|
501
|
-
mergeHorizontalLines(lines) {
|
|
502
|
-
lines.sort((l1, l2) => l1.from.x - l2.from.x);
|
|
503
|
-
const minX = lines[0].from.x;
|
|
504
|
-
const maxX = lines[lines.length - 1].to.x;
|
|
505
|
-
const resultLine = new Line(new Point(minX, lines[0].from.y), new Point(maxX, lines[0].from.y));
|
|
506
|
-
for (let i = 1;i < lines.length; i++) {
|
|
507
|
-
const prevLine = lines[i - 1];
|
|
508
|
-
const currLine = lines[i];
|
|
509
|
-
if (Math.abs(prevLine.to.x - currLine.from.x) > Shape.tolerance) {
|
|
510
|
-
const gapLine = new Line(new Point(prevLine.to.x, prevLine.from.y), new Point(currLine.from.x, currLine.from.y));
|
|
511
|
-
resultLine.addGap(gapLine);
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
return resultLine;
|
|
515
|
-
}
|
|
516
|
-
mergeVerticalLines(lines) {
|
|
517
|
-
lines.sort((l1, l2) => l1.from.y - l2.from.y);
|
|
518
|
-
const minY = lines[0].from.y;
|
|
519
|
-
const maxY = lines[lines.length - 1].to.y;
|
|
520
|
-
const resultLine = new Line(new Point(lines[0].from.x, minY), new Point(lines[0].from.x, maxY));
|
|
521
|
-
for (let i = 1;i < lines.length; i++) {
|
|
522
|
-
const prevLine = lines[i - 1];
|
|
523
|
-
const currLine = lines[i];
|
|
524
|
-
if (Math.abs(prevLine.to.y - currLine.from.y) > Shape.tolerance) {
|
|
525
|
-
const gapLine = new Line(new Point(prevLine.to.x, prevLine.to.y), new Point(prevLine.to.x, currLine.from.y));
|
|
526
|
-
resultLine.addGap(gapLine);
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
return resultLine;
|
|
530
|
-
}
|
|
531
|
-
normalize() {
|
|
532
|
-
this.hLines = this.hLines.filter((l) => l.intersections.length > 1);
|
|
533
|
-
this.vLines = this.vLines.filter((l) => l.intersections.length > 1);
|
|
534
|
-
this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
|
|
535
|
-
this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
|
|
536
|
-
const newHLines = [];
|
|
537
|
-
while (this.hLines.length > 0) {
|
|
538
|
-
const line = this.hLines.shift();
|
|
539
|
-
if (!line)
|
|
540
|
-
continue;
|
|
541
|
-
const lines = this.getSameHorizontal(line);
|
|
542
|
-
const merged = this.mergeHorizontalLines(lines);
|
|
543
|
-
newHLines.push(merged);
|
|
544
|
-
}
|
|
545
|
-
this.hLines = newHLines;
|
|
546
|
-
const newVLines = [];
|
|
547
|
-
while (this.vLines.length > 0) {
|
|
548
|
-
const line = this.vLines.shift();
|
|
549
|
-
if (!line)
|
|
550
|
-
continue;
|
|
551
|
-
const lines = this.getSameVertical(line);
|
|
552
|
-
const merged = this.mergeVerticalLines(lines);
|
|
553
|
-
newVLines.push(merged);
|
|
554
|
-
}
|
|
555
|
-
this.vLines = newVLines;
|
|
556
|
-
}
|
|
557
|
-
verticalExists(line, y1, y2) {
|
|
558
|
-
if (line.direction !== LineDirection.Vertical) {
|
|
559
|
-
throw new Error("Line is not vertical");
|
|
560
|
-
}
|
|
561
|
-
if (y1 >= y2) {
|
|
562
|
-
throw new Error("y1 must be less than y2");
|
|
563
|
-
}
|
|
564
|
-
if (line.from.y <= y1 && line.to.y >= y2) {
|
|
565
|
-
for (const gap of line.gaps) {
|
|
566
|
-
if (gap.from.y <= y1 && gap.to.y >= y2) {
|
|
567
|
-
return false;
|
|
568
|
-
}
|
|
569
|
-
}
|
|
570
|
-
return true;
|
|
571
|
-
}
|
|
572
|
-
return false;
|
|
573
|
-
}
|
|
574
|
-
horizontalExists(line, x1, x2) {
|
|
575
|
-
if (line.direction !== LineDirection.Horizontal) {
|
|
576
|
-
throw new Error("Line is not horizontal");
|
|
577
|
-
}
|
|
578
|
-
if (x1 >= x2) {
|
|
579
|
-
throw new Error("x1 must be less than x2");
|
|
580
|
-
}
|
|
581
|
-
if (line.from.x <= x1 && line.to.x >= x2) {
|
|
582
|
-
for (const gap of line.gaps) {
|
|
583
|
-
if (gap.from.x <= x1 && gap.to.x >= x2) {
|
|
584
|
-
return false;
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
return true;
|
|
588
|
-
}
|
|
589
|
-
return false;
|
|
590
|
-
}
|
|
591
|
-
findBottomLineIndex(h2Index, xMiddle) {
|
|
592
|
-
for (let i = h2Index;i < this.hLines.length; i++) {
|
|
593
|
-
const hLine = this.hLines[i];
|
|
594
|
-
if (hLine.from.x <= xMiddle && hLine.to.x >= xMiddle) {
|
|
595
|
-
return i;
|
|
596
|
-
}
|
|
597
|
-
}
|
|
598
|
-
return -1;
|
|
599
|
-
}
|
|
600
|
-
findVerticalLineIndexs(topHLine, yMiddle) {
|
|
601
|
-
const result = [];
|
|
602
|
-
for (let i = 0;i < this.vLines.length; i++) {
|
|
603
|
-
const vLine = this.vLines[i];
|
|
604
|
-
if (vLine.from.y <= yMiddle && vLine.to.y >= yMiddle && topHLine.intersection(vLine)) {
|
|
605
|
-
result.push(i);
|
|
606
|
-
}
|
|
607
|
-
}
|
|
608
|
-
return result;
|
|
609
|
-
}
|
|
610
|
-
getRow(h1Index, h2Index, yMiddle) {
|
|
611
|
-
const tableRow = [];
|
|
612
|
-
const topHLine = this.hLines[h1Index];
|
|
613
|
-
const vLineIndexes = this.findVerticalLineIndexs(topHLine, yMiddle);
|
|
614
|
-
for (let i = 1;i < vLineIndexes.length; i++) {
|
|
615
|
-
const leftVLine = this.vLines[vLineIndexes[i - 1]];
|
|
616
|
-
const rightVLine = this.vLines[vLineIndexes[i]];
|
|
617
|
-
const xMiddle = (leftVLine.from.x + rightVLine.from.x) / 2;
|
|
618
|
-
const bottomHLineIndex = this.findBottomLineIndex(h2Index, xMiddle);
|
|
619
|
-
const bottomHLine = this.hLines[bottomHLineIndex];
|
|
620
|
-
const tableCell = {
|
|
621
|
-
minXY: new Point(leftVLine.from.x, topHLine.from.y),
|
|
622
|
-
maxXY: new Point(rightVLine.from.x, bottomHLine.from.y),
|
|
623
|
-
width: rightVLine.from.x - leftVLine.from.x,
|
|
624
|
-
height: bottomHLine.from.y - topHLine.from.y,
|
|
625
|
-
text: []
|
|
626
|
-
};
|
|
627
|
-
const colSpan = vLineIndexes[i] - vLineIndexes[i - 1];
|
|
628
|
-
const rowSpan = bottomHLineIndex - h1Index;
|
|
629
|
-
if (colSpan > 1) {
|
|
630
|
-
tableCell.colspan = colSpan;
|
|
631
|
-
}
|
|
632
|
-
if (rowSpan > 1) {
|
|
633
|
-
tableCell.rowspan = rowSpan;
|
|
634
|
-
}
|
|
635
|
-
tableRow.push(tableCell);
|
|
636
|
-
}
|
|
637
|
-
return tableRow;
|
|
638
|
-
}
|
|
639
|
-
toData() {
|
|
640
|
-
const rowPivots = this.rowPivots;
|
|
641
|
-
const colPivots = this.colPivots;
|
|
642
|
-
const minXY = new Point(colPivots[0], rowPivots[0]);
|
|
643
|
-
const maxXY = new Point(colPivots[colPivots.length - 1], rowPivots[rowPivots.length - 1]);
|
|
644
|
-
const result = new TableData(minXY, maxXY, rowPivots, colPivots);
|
|
645
|
-
for (let h1 = 1;h1 < this.hLines.length; h1++) {
|
|
646
|
-
const prevHLine = this.hLines[h1 - 1];
|
|
647
|
-
const currHLine = this.hLines[h1];
|
|
648
|
-
const YMiddle = (prevHLine.from.y + currHLine.from.y) / 2;
|
|
649
|
-
const rowData = this.getRow(h1 - 1, h1, YMiddle);
|
|
650
|
-
result.rows.push(rowData);
|
|
651
|
-
}
|
|
652
|
-
return result;
|
|
653
|
-
}
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/LineStore.js
|
|
657
|
-
class LineStore {
|
|
658
|
-
hLines = [];
|
|
659
|
-
vLines = [];
|
|
660
|
-
add(line) {
|
|
661
|
-
if (line.valid) {
|
|
662
|
-
if (line.direction === LineDirection.Horizontal) {
|
|
663
|
-
this.hLines.push(line);
|
|
664
|
-
} else if (line.direction === LineDirection.Vertical) {
|
|
665
|
-
this.vLines.push(line);
|
|
666
|
-
}
|
|
667
|
-
}
|
|
668
|
-
}
|
|
669
|
-
addRectangle(rect) {
|
|
670
|
-
for (const line of rect.getLines()) {
|
|
671
|
-
this.add(line);
|
|
672
|
-
}
|
|
673
|
-
}
|
|
674
|
-
getTableData() {
|
|
675
|
-
const result = [];
|
|
676
|
-
const tables = this.getTables();
|
|
677
|
-
for (const table of tables) {
|
|
678
|
-
const data = table.toData();
|
|
679
|
-
if (data) {
|
|
680
|
-
result.push(data);
|
|
681
|
-
}
|
|
682
|
-
}
|
|
683
|
-
return result;
|
|
684
|
-
}
|
|
685
|
-
getTables() {
|
|
686
|
-
const result = [];
|
|
687
|
-
while (this.hLines.length !== 0) {
|
|
688
|
-
const hLine = this.hLines.shift();
|
|
689
|
-
if (!hLine)
|
|
690
|
-
continue;
|
|
691
|
-
const filled = this.tryFill(result, hLine);
|
|
692
|
-
if (filled)
|
|
693
|
-
continue;
|
|
694
|
-
const table = new Table(hLine);
|
|
695
|
-
this.fillTable(table);
|
|
696
|
-
result.push(table);
|
|
697
|
-
}
|
|
698
|
-
while (this.vLines.length !== 0) {
|
|
699
|
-
const vLine = this.vLines.shift();
|
|
700
|
-
if (!vLine)
|
|
701
|
-
continue;
|
|
702
|
-
const filled = this.tryFill(result, vLine);
|
|
703
|
-
if (filled)
|
|
704
|
-
continue;
|
|
705
|
-
const table = new Table(vLine);
|
|
706
|
-
this.fillTable(table);
|
|
707
|
-
result.push(table);
|
|
708
|
-
}
|
|
709
|
-
const validTables = result.filter((t) => t.isValid);
|
|
710
|
-
for (const table of validTables) {
|
|
711
|
-
table.normalize();
|
|
712
|
-
}
|
|
713
|
-
return validTables;
|
|
714
|
-
}
|
|
715
|
-
normalize() {
|
|
716
|
-
this.normalizeHorizontal();
|
|
717
|
-
this.normalizeVertical();
|
|
718
|
-
}
|
|
719
|
-
normalizeHorizontal() {
|
|
720
|
-
this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
|
|
721
|
-
const newLines = [];
|
|
722
|
-
let sameY = [];
|
|
723
|
-
for (const line of this.hLines) {
|
|
724
|
-
if (sameY.length === 0) {
|
|
725
|
-
sameY.push(line);
|
|
726
|
-
} else if (Math.abs(sameY[0]?.from.y - line.from.y) < Shape.tolerance) {
|
|
727
|
-
sameY.push(line);
|
|
728
|
-
} else {
|
|
729
|
-
const merged = this.margeHorizontalLines(sameY);
|
|
730
|
-
newLines.push(...merged);
|
|
731
|
-
sameY = [line];
|
|
732
|
-
}
|
|
733
|
-
}
|
|
734
|
-
if (sameY.length > 0) {
|
|
735
|
-
const merged = this.margeHorizontalLines(sameY);
|
|
736
|
-
newLines.push(...merged);
|
|
737
|
-
}
|
|
738
|
-
this.hLines = newLines;
|
|
739
|
-
}
|
|
740
|
-
normalizeVertical() {
|
|
741
|
-
this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
|
|
742
|
-
const newLines = [];
|
|
743
|
-
let sameX = [];
|
|
744
|
-
for (const line of this.vLines) {
|
|
745
|
-
if (sameX.length === 0) {
|
|
746
|
-
sameX.push(line);
|
|
747
|
-
} else if (Math.abs(sameX[0]?.from.x - line.from.x) < Shape.tolerance) {
|
|
748
|
-
sameX.push(line);
|
|
749
|
-
} else {
|
|
750
|
-
const merged = this.margeVerticalLines(sameX);
|
|
751
|
-
newLines.push(...merged);
|
|
752
|
-
sameX = [line];
|
|
753
|
-
}
|
|
754
|
-
}
|
|
755
|
-
if (sameX.length > 0) {
|
|
756
|
-
const merged = this.margeVerticalLines(sameX);
|
|
757
|
-
newLines.push(...merged);
|
|
758
|
-
}
|
|
759
|
-
this.vLines = newLines;
|
|
760
|
-
}
|
|
761
|
-
fillTable(table) {
|
|
762
|
-
const newVLines = [];
|
|
763
|
-
const newHLines = [];
|
|
764
|
-
for (const vLine of this.vLines) {
|
|
765
|
-
if (!table.add(vLine)) {
|
|
766
|
-
newVLines.push(vLine);
|
|
767
|
-
}
|
|
768
|
-
}
|
|
769
|
-
for (const hLine of this.hLines) {
|
|
770
|
-
if (!table.add(hLine)) {
|
|
771
|
-
newHLines.push(hLine);
|
|
772
|
-
}
|
|
773
|
-
}
|
|
774
|
-
this.hLines = newHLines;
|
|
775
|
-
this.vLines = newVLines;
|
|
776
|
-
}
|
|
777
|
-
tryFill(tables, line) {
|
|
778
|
-
for (const table of tables) {
|
|
779
|
-
if (table.add(line)) {
|
|
780
|
-
this.fillTable(table);
|
|
781
|
-
return true;
|
|
782
|
-
}
|
|
783
|
-
}
|
|
784
|
-
return false;
|
|
785
|
-
}
|
|
786
|
-
margeHorizontalLines(sameYLines) {
|
|
787
|
-
const result = [];
|
|
788
|
-
sameYLines.sort((l1, l2) => l1.from.x - l2.from.x);
|
|
789
|
-
const sameY = sameYLines[0]?.from.y;
|
|
790
|
-
if (sameY === undefined)
|
|
791
|
-
return result;
|
|
792
|
-
let minX = Number.MAX_SAFE_INTEGER;
|
|
793
|
-
let maxX = Number.MIN_SAFE_INTEGER;
|
|
794
|
-
for (const line of sameYLines) {
|
|
795
|
-
if (line.from.x - maxX < Shape.tolerance) {
|
|
796
|
-
if (line.from.x < minX) {
|
|
797
|
-
minX = line.from.x;
|
|
798
|
-
}
|
|
799
|
-
if (line.to.x > maxX) {
|
|
800
|
-
maxX = line.to.x;
|
|
801
|
-
}
|
|
802
|
-
} else {
|
|
803
|
-
if (maxX > minX) {
|
|
804
|
-
result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
|
|
805
|
-
}
|
|
806
|
-
minX = line.from.x;
|
|
807
|
-
maxX = line.to.x;
|
|
808
|
-
}
|
|
809
|
-
}
|
|
810
|
-
const last = result[result.length - 1];
|
|
811
|
-
if (last) {
|
|
812
|
-
if (last.from.x !== minX && last.to.x !== maxX) {
|
|
813
|
-
result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
|
|
814
|
-
}
|
|
815
|
-
} else {
|
|
816
|
-
result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
|
|
817
|
-
}
|
|
818
|
-
return result;
|
|
819
|
-
}
|
|
820
|
-
margeVerticalLines(sameXLines) {
|
|
821
|
-
const result = [];
|
|
822
|
-
sameXLines.sort((l1, l2) => l1.from.y - l2.from.y);
|
|
823
|
-
const sameX = sameXLines[0]?.from.x;
|
|
824
|
-
if (sameX === undefined)
|
|
825
|
-
return result;
|
|
826
|
-
let minY = Number.MAX_SAFE_INTEGER;
|
|
827
|
-
let maxY = Number.MIN_SAFE_INTEGER;
|
|
828
|
-
for (const line of sameXLines) {
|
|
829
|
-
if (line.from.y - maxY < Shape.tolerance) {
|
|
830
|
-
if (line.from.y < minY) {
|
|
831
|
-
minY = line.from.y;
|
|
832
|
-
}
|
|
833
|
-
if (line.to.y > maxY) {
|
|
834
|
-
maxY = line.to.y;
|
|
835
|
-
}
|
|
836
|
-
} else {
|
|
837
|
-
if (maxY > minY) {
|
|
838
|
-
result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
|
|
839
|
-
}
|
|
840
|
-
minY = line.from.y;
|
|
841
|
-
maxY = line.to.y;
|
|
842
|
-
}
|
|
843
|
-
}
|
|
844
|
-
const last = result[result.length - 1];
|
|
845
|
-
if (last) {
|
|
846
|
-
if (last.from.y !== minY && last.to.y !== maxY) {
|
|
847
|
-
result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
|
|
848
|
-
}
|
|
849
|
-
} else {
|
|
850
|
-
result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
|
|
851
|
-
}
|
|
852
|
-
return result;
|
|
853
|
-
}
|
|
854
|
-
}
|
|
855
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Rectangle.js
|
|
856
|
-
class Rectangle extends Shape {
|
|
857
|
-
from;
|
|
858
|
-
width;
|
|
859
|
-
height;
|
|
860
|
-
constructor(from, width, height) {
|
|
861
|
-
super();
|
|
862
|
-
this.from = from;
|
|
863
|
-
this.width = width;
|
|
864
|
-
this.height = height;
|
|
865
|
-
}
|
|
866
|
-
get to() {
|
|
867
|
-
return new Point(this.from.x + this.width, this.from.y + this.height);
|
|
868
|
-
}
|
|
869
|
-
getLines() {
|
|
870
|
-
const to = this.to;
|
|
871
|
-
const lines = [
|
|
872
|
-
new Line(this.from, new Point(to.x, this.from.y)),
|
|
873
|
-
new Line(this.from, new Point(this.from.x, to.y)),
|
|
874
|
-
new Line(new Point(to.x, this.from.y), to),
|
|
875
|
-
new Line(new Point(this.from.x, to.y), to)
|
|
876
|
-
];
|
|
877
|
-
return lines.filter((l) => l.valid);
|
|
878
|
-
}
|
|
879
|
-
transform(matrix) {
|
|
880
|
-
const p1 = Shape.applyTransform([this.from.x, this.from.y], matrix);
|
|
881
|
-
const p2 = Shape.applyTransform([this.from.x + this.width, this.from.y + this.height], matrix);
|
|
882
|
-
const x = Math.min(p1[0], p2[0]);
|
|
883
|
-
const y = Math.min(p1[1], p2[1]);
|
|
884
|
-
const width = Math.abs(p1[0] - p2[0]);
|
|
885
|
-
const height = Math.abs(p1[1] - p2[1]);
|
|
886
|
-
this.from = new Point(x, y);
|
|
887
|
-
this.width = width;
|
|
888
|
-
this.height = height;
|
|
889
|
-
return this;
|
|
890
|
-
}
|
|
891
|
-
}
|
|
892
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/ImageResult.js
|
|
893
|
-
class ImageResult {
|
|
894
|
-
pages = [];
|
|
895
|
-
total = 0;
|
|
896
|
-
getPageImage(num, name) {
|
|
897
|
-
for (const pageData of this.pages) {
|
|
898
|
-
if (pageData.pageNumber === num) {
|
|
899
|
-
for (const img of pageData.images) {
|
|
900
|
-
if (img.name === name) {
|
|
901
|
-
return img;
|
|
902
|
-
}
|
|
903
|
-
}
|
|
904
|
-
}
|
|
905
|
-
}
|
|
906
|
-
return null;
|
|
907
|
-
}
|
|
908
|
-
constructor(total) {
|
|
909
|
-
this.total = total;
|
|
910
|
-
}
|
|
911
|
-
}
|
|
912
|
-
|
|
913
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/InfoResult.js
|
|
914
|
-
var pdfjs = __toESM(require("pdfjs-dist/legacy/build/pdf.mjs"));
|
|
915
|
-
var XMP_DATE_PROPERTIES = [
|
|
916
|
-
"xmp:createdate",
|
|
917
|
-
"xmp:modifydate",
|
|
918
|
-
"xmp:metadatadate",
|
|
919
|
-
"xap:createdate",
|
|
920
|
-
"xap:modifydate",
|
|
921
|
-
"xap:metadatadate"
|
|
922
|
-
];
|
|
923
|
-
|
|
924
|
-
class InfoResult {
|
|
925
|
-
total;
|
|
926
|
-
info;
|
|
927
|
-
metadata;
|
|
928
|
-
fingerprints;
|
|
929
|
-
permission;
|
|
930
|
-
outline;
|
|
931
|
-
pages = [];
|
|
932
|
-
getDateNode() {
|
|
933
|
-
const result = {};
|
|
934
|
-
const CreationDate = this.info?.CreationDate;
|
|
935
|
-
if (CreationDate) {
|
|
936
|
-
result.CreationDate = pdfjs.PDFDateString.toDateObject(CreationDate);
|
|
937
|
-
}
|
|
938
|
-
const ModDate = this.info?.ModDate;
|
|
939
|
-
if (ModDate) {
|
|
940
|
-
result.ModDate = pdfjs.PDFDateString.toDateObject(ModDate);
|
|
941
|
-
}
|
|
942
|
-
if (!this.metadata) {
|
|
943
|
-
return result;
|
|
944
|
-
}
|
|
945
|
-
for (const prop of XMP_DATE_PROPERTIES) {
|
|
946
|
-
const value = this.metadata?.get(prop);
|
|
947
|
-
const date = this.parseISODateString(value);
|
|
948
|
-
switch (prop) {
|
|
949
|
-
case XMP_DATE_PROPERTIES[0]:
|
|
950
|
-
result.XmpCreateDate = date;
|
|
951
|
-
break;
|
|
952
|
-
case XMP_DATE_PROPERTIES[1]:
|
|
953
|
-
result.XmpModifyDate = date;
|
|
954
|
-
break;
|
|
955
|
-
case XMP_DATE_PROPERTIES[2]:
|
|
956
|
-
result.XmpMetadataDate = date;
|
|
957
|
-
break;
|
|
958
|
-
case XMP_DATE_PROPERTIES[3]:
|
|
959
|
-
result.XapCreateDate = date;
|
|
960
|
-
break;
|
|
961
|
-
case XMP_DATE_PROPERTIES[4]:
|
|
962
|
-
result.XapModifyDate = date;
|
|
963
|
-
break;
|
|
964
|
-
case XMP_DATE_PROPERTIES[5]:
|
|
965
|
-
result.XapMetadataDate = date;
|
|
966
|
-
break;
|
|
967
|
-
}
|
|
968
|
-
}
|
|
969
|
-
return result;
|
|
970
|
-
}
|
|
971
|
-
parseISODateString(isoDateString) {
|
|
972
|
-
if (!isoDateString)
|
|
973
|
-
return;
|
|
974
|
-
const parsedDate = Date.parse(isoDateString);
|
|
975
|
-
if (!Number.isNaN(parsedDate)) {
|
|
976
|
-
return new Date(parsedDate);
|
|
977
|
-
}
|
|
978
|
-
return;
|
|
979
|
-
}
|
|
980
|
-
constructor(total) {
|
|
981
|
-
this.total = total;
|
|
982
|
-
}
|
|
983
|
-
}
|
|
984
|
-
|
|
985
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/ParseParameters.js
|
|
986
|
-
function setDefaultParseParameters(params) {
|
|
987
|
-
params.lineThreshold = params?.lineThreshold ?? 4.6;
|
|
988
|
-
params.cellThreshold = params?.cellThreshold ?? 7;
|
|
989
|
-
params.cellSeparator = params?.cellSeparator ?? "\t";
|
|
990
|
-
params.lineEnforce = params?.lineEnforce ?? true;
|
|
991
|
-
params.pageJoiner = params?.pageJoiner ?? `
|
|
992
|
-
-- page_number of total_number --`;
|
|
993
|
-
params.imageThreshold = params?.imageThreshold ?? 80;
|
|
994
|
-
params.imageDataUrl = params?.imageDataUrl ?? true;
|
|
995
|
-
params.imageBuffer = params?.imageBuffer ?? true;
|
|
996
|
-
params.scale = params?.scale ?? 1;
|
|
997
|
-
return params;
|
|
998
|
-
}
|
|
999
|
-
|
|
1000
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/PathGeometry.js
|
|
1001
|
-
var PathGeometry;
|
|
1002
|
-
(function(PathGeometry2) {
|
|
1003
|
-
PathGeometry2[PathGeometry2["undefined"] = 0] = "undefined";
|
|
1004
|
-
PathGeometry2[PathGeometry2["hline"] = 1] = "hline";
|
|
1005
|
-
PathGeometry2[PathGeometry2["vline"] = 2] = "vline";
|
|
1006
|
-
PathGeometry2[PathGeometry2["rectangle"] = 3] = "rectangle";
|
|
1007
|
-
})(PathGeometry || (PathGeometry = {}));
|
|
1008
|
-
var DrawOPS;
|
|
1009
|
-
(function(DrawOPS2) {
|
|
1010
|
-
DrawOPS2[DrawOPS2["moveTo"] = 0] = "moveTo";
|
|
1011
|
-
DrawOPS2[DrawOPS2["lineTo"] = 1] = "lineTo";
|
|
1012
|
-
DrawOPS2[DrawOPS2["curveTo"] = 2] = "curveTo";
|
|
1013
|
-
DrawOPS2[DrawOPS2["closePath"] = 3] = "closePath";
|
|
1014
|
-
DrawOPS2[DrawOPS2["rectangle"] = 4] = "rectangle";
|
|
1015
|
-
})(DrawOPS || (DrawOPS = {}));
|
|
1016
|
-
|
|
1017
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/ScreenshotResult.js
|
|
1018
|
-
class ScreenshotResult {
|
|
1019
|
-
pages = [];
|
|
1020
|
-
total = 0;
|
|
1021
|
-
constructor(total) {
|
|
1022
|
-
this.total = total;
|
|
1023
|
-
}
|
|
1024
|
-
}
|
|
1025
|
-
|
|
1026
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/TableResult.js
|
|
1027
|
-
class TableResult {
|
|
1028
|
-
pages = [];
|
|
1029
|
-
mergedTables = [];
|
|
1030
|
-
total = 0;
|
|
1031
|
-
constructor(total) {
|
|
1032
|
-
this.total = total;
|
|
1033
|
-
}
|
|
1034
|
-
}
|
|
1035
|
-
|
|
1036
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/TextResult.js
|
|
1037
|
-
class TextResult {
|
|
1038
|
-
pages = [];
|
|
1039
|
-
text = "";
|
|
1040
|
-
total = 0;
|
|
1041
|
-
getPageText(num) {
|
|
1042
|
-
for (const pageData of this.pages) {
|
|
1043
|
-
if (pageData.num === num)
|
|
1044
|
-
return pageData.text;
|
|
1045
|
-
}
|
|
1046
|
-
return "";
|
|
1047
|
-
}
|
|
1048
|
-
constructor(total) {
|
|
1049
|
-
this.total = total;
|
|
1050
|
-
}
|
|
1051
|
-
}
|
|
1052
|
-
|
|
1053
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js
|
|
1054
|
-
class PDFParse {
|
|
1055
|
-
options;
|
|
1056
|
-
doc;
|
|
1057
|
-
progress = { loaded: -1, total: 0 };
|
|
1058
|
-
constructor(options) {
|
|
1059
|
-
if (options.verbosity === undefined) {
|
|
1060
|
-
options.verbosity = pdfjs2.VerbosityLevel.ERRORS;
|
|
1061
|
-
}
|
|
1062
|
-
if (typeof Buffer !== "undefined" && options.data instanceof Buffer) {
|
|
1063
|
-
options.data = new Uint8Array(options.data);
|
|
1064
|
-
}
|
|
1065
|
-
this.options = options;
|
|
1066
|
-
}
|
|
1067
|
-
async destroy() {
|
|
1068
|
-
if (this.doc) {
|
|
1069
|
-
await this.doc.destroy();
|
|
1070
|
-
this.doc = undefined;
|
|
1071
|
-
}
|
|
1072
|
-
}
|
|
1073
|
-
static get isNodeJS() {
|
|
1074
|
-
const isNodeJS = typeof process === "object" && `${process}` === "[object process]" && !process.versions.nw && !(process.versions.electron && typeof process.type !== "undefined" && process.type !== "browser");
|
|
1075
|
-
return isNodeJS;
|
|
1076
|
-
}
|
|
1077
|
-
static setWorker(workerSrc) {
|
|
1078
|
-
if (typeof globalThis.pdfjs === "undefined") {
|
|
1079
|
-
globalThis.pdfjs = pdfjs2;
|
|
1080
|
-
}
|
|
1081
|
-
if (pdfjs2?.GlobalWorkerOptions === null)
|
|
1082
|
-
return "";
|
|
1083
|
-
if (workerSrc !== undefined) {
|
|
1084
|
-
pdfjs2.GlobalWorkerOptions.workerSrc = workerSrc;
|
|
1085
|
-
return pdfjs2.GlobalWorkerOptions.workerSrc;
|
|
1086
|
-
}
|
|
1087
|
-
return pdfjs2.GlobalWorkerOptions.workerSrc;
|
|
1088
|
-
}
|
|
1089
|
-
async getInfo(params = {}) {
|
|
1090
|
-
const doc = await this.load();
|
|
1091
|
-
const result = new InfoResult(doc.numPages);
|
|
1092
|
-
const { info, metadata } = await doc.getMetadata();
|
|
1093
|
-
result.info = info;
|
|
1094
|
-
result.metadata = metadata;
|
|
1095
|
-
result.fingerprints = doc.fingerprints;
|
|
1096
|
-
result.outline = await doc.getOutline();
|
|
1097
|
-
result.permission = await doc.getPermissions();
|
|
1098
|
-
const pageLabels = await doc.getPageLabels();
|
|
1099
|
-
if (params.parsePageInfo) {
|
|
1100
|
-
for (let i = 1;i <= result.total; i++) {
|
|
1101
|
-
if (this.shouldParse(i, result.total, params)) {
|
|
1102
|
-
const page = await doc.getPage(i);
|
|
1103
|
-
const pageLinkResult = await this.getPageLinks(page);
|
|
1104
|
-
pageLinkResult.pageLabel = pageLabels?.[page.pageNumber];
|
|
1105
|
-
result.pages.push(pageLinkResult);
|
|
1106
|
-
page.cleanup();
|
|
1107
|
-
}
|
|
1108
|
-
}
|
|
1109
|
-
}
|
|
1110
|
-
return result;
|
|
1111
|
-
}
|
|
1112
|
-
async getPageLinks(page) {
|
|
1113
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
1114
|
-
const result = {
|
|
1115
|
-
pageNumber: page.pageNumber,
|
|
1116
|
-
links: [],
|
|
1117
|
-
width: viewport.width,
|
|
1118
|
-
height: viewport.height
|
|
1119
|
-
};
|
|
1120
|
-
const annotations = await page.getAnnotations({ intent: "display" }) || [];
|
|
1121
|
-
for (const i of annotations) {
|
|
1122
|
-
if (i.subtype !== "Link")
|
|
1123
|
-
continue;
|
|
1124
|
-
const url = i.url ?? i.unsafeUrl;
|
|
1125
|
-
if (!url)
|
|
1126
|
-
continue;
|
|
1127
|
-
const text = i.overlaidText || "";
|
|
1128
|
-
result.links.push({ url, text });
|
|
1129
|
-
}
|
|
1130
|
-
return result;
|
|
1131
|
-
}
|
|
1132
|
-
async getText(params = {}) {
|
|
1133
|
-
const doc = await this.load();
|
|
1134
|
-
const result = new TextResult(doc.numPages);
|
|
1135
|
-
for (let i = 1;i <= result.total; i++) {
|
|
1136
|
-
if (this.shouldParse(i, result.total, params)) {
|
|
1137
|
-
const page = await doc.getPage(i);
|
|
1138
|
-
const text = await this.getPageText(page, params, result.total);
|
|
1139
|
-
result.pages.push({
|
|
1140
|
-
text,
|
|
1141
|
-
num: i
|
|
1142
|
-
});
|
|
1143
|
-
page.cleanup();
|
|
1144
|
-
}
|
|
1145
|
-
}
|
|
1146
|
-
for (const page of result.pages) {
|
|
1147
|
-
if (params.pageJoiner) {
|
|
1148
|
-
let pageNumber = params.pageJoiner.replace("page_number", `${page.num}`);
|
|
1149
|
-
pageNumber = pageNumber.replace("total_number", `${result.total}`);
|
|
1150
|
-
result.text += `${page.text}
|
|
1151
|
-
${pageNumber}
|
|
1152
|
-
|
|
1153
|
-
`;
|
|
1154
|
-
} else {
|
|
1155
|
-
result.text += `${page.text}
|
|
1156
|
-
|
|
1157
|
-
`;
|
|
1158
|
-
}
|
|
1159
|
-
}
|
|
1160
|
-
return result;
|
|
1161
|
-
}
|
|
1162
|
-
async load() {
|
|
1163
|
-
try {
|
|
1164
|
-
if (this.doc === undefined) {
|
|
1165
|
-
const loadingTask = pdfjs2.getDocument(this.options);
|
|
1166
|
-
loadingTask.onProgress = (progress) => {
|
|
1167
|
-
this.progress = progress;
|
|
1168
|
-
};
|
|
1169
|
-
this.doc = await loadingTask.promise;
|
|
1170
|
-
}
|
|
1171
|
-
return this.doc;
|
|
1172
|
-
} catch (error) {
|
|
1173
|
-
throw getException(error);
|
|
1174
|
-
}
|
|
1175
|
-
}
|
|
1176
|
-
shouldParse(currentPage, totalPage, params) {
|
|
1177
|
-
params.partial = params?.partial ?? [];
|
|
1178
|
-
params.first = params?.first ?? 0;
|
|
1179
|
-
params.last = params?.last ?? 0;
|
|
1180
|
-
if (params.partial.length > 0) {
|
|
1181
|
-
if (params.partial.includes(currentPage)) {
|
|
1182
|
-
return true;
|
|
1183
|
-
}
|
|
1184
|
-
return false;
|
|
1185
|
-
}
|
|
1186
|
-
if (params.first > 0 && params.last > 0) {
|
|
1187
|
-
if (currentPage >= params.first && currentPage <= params.last) {
|
|
1188
|
-
return true;
|
|
1189
|
-
}
|
|
1190
|
-
return false;
|
|
1191
|
-
}
|
|
1192
|
-
if (params.first > 0) {
|
|
1193
|
-
if (currentPage <= params.first) {
|
|
1194
|
-
return true;
|
|
1195
|
-
}
|
|
1196
|
-
return false;
|
|
1197
|
-
}
|
|
1198
|
-
if (params.last > 0) {
|
|
1199
|
-
if (currentPage > totalPage - params.last) {
|
|
1200
|
-
return true;
|
|
1201
|
-
}
|
|
1202
|
-
return false;
|
|
1203
|
-
}
|
|
1204
|
-
return true;
|
|
1205
|
-
}
|
|
1206
|
-
async getPageText(page, parseParams, total) {
|
|
1207
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
1208
|
-
const params = setDefaultParseParameters(parseParams);
|
|
1209
|
-
const textContent = await page.getTextContent({
|
|
1210
|
-
includeMarkedContent: !!params.includeMarkedContent,
|
|
1211
|
-
disableNormalization: !!params.disableNormalization
|
|
1212
|
-
});
|
|
1213
|
-
let links = new Map;
|
|
1214
|
-
if (params.parseHyperlinks) {
|
|
1215
|
-
links = await this.getHyperlinks(page, viewport);
|
|
1216
|
-
}
|
|
1217
|
-
const strBuf = [];
|
|
1218
|
-
let lastX;
|
|
1219
|
-
let lastY;
|
|
1220
|
-
let lineHeight = 0;
|
|
1221
|
-
for (const item of textContent.items) {
|
|
1222
|
-
if (!("str" in item))
|
|
1223
|
-
continue;
|
|
1224
|
-
const tm = item.transform ?? item.transform;
|
|
1225
|
-
const [x, y] = viewport.convertToViewportPoint(tm[4], tm[5]);
|
|
1226
|
-
if (params.parseHyperlinks) {
|
|
1227
|
-
const posArr = links.get(item.str) || [];
|
|
1228
|
-
const hit = posArr.find((l) => x >= l.rect.left && x <= l.rect.right && y >= l.rect.top && y <= l.rect.bottom);
|
|
1229
|
-
if (hit) {
|
|
1230
|
-
item.str = `[${item.str}](${hit.url})`;
|
|
1231
|
-
}
|
|
1232
|
-
}
|
|
1233
|
-
if (params.lineEnforce) {
|
|
1234
|
-
if (lastY !== undefined && Math.abs(lastY - y) > params.lineThreshold) {
|
|
1235
|
-
const lastItem = strBuf.length ? strBuf[strBuf.length - 1] : undefined;
|
|
1236
|
-
const isCurrentItemHasNewLine = item.str.startsWith(`
|
|
1237
|
-
`) || item.str.trim() === "" && item.hasEOL;
|
|
1238
|
-
if (lastItem?.endsWith(`
|
|
1239
|
-
`) === false && !isCurrentItemHasNewLine) {
|
|
1240
|
-
const ydiff = Math.abs(lastY - y);
|
|
1241
|
-
if (ydiff - 1 > lineHeight) {
|
|
1242
|
-
strBuf.push(`
|
|
1243
|
-
`);
|
|
1244
|
-
lineHeight = 0;
|
|
1245
|
-
}
|
|
1246
|
-
}
|
|
1247
|
-
}
|
|
1248
|
-
}
|
|
1249
|
-
if (params.cellSeparator) {
|
|
1250
|
-
if (lastY !== undefined && Math.abs(lastY - y) < params.lineThreshold) {
|
|
1251
|
-
if (lastX !== undefined && Math.abs(lastX - x) > params.cellThreshold) {
|
|
1252
|
-
item.str = `${params.cellSeparator}${item.str}`;
|
|
1253
|
-
}
|
|
1254
|
-
}
|
|
1255
|
-
}
|
|
1256
|
-
strBuf.push(item.str);
|
|
1257
|
-
lastX = x + item.width;
|
|
1258
|
-
lastY = y;
|
|
1259
|
-
lineHeight = Math.max(lineHeight, item.height);
|
|
1260
|
-
if (item.hasEOL) {
|
|
1261
|
-
strBuf.push(`
|
|
1262
|
-
`);
|
|
1263
|
-
}
|
|
1264
|
-
if (item.hasEOL || item.str.endsWith(`
|
|
1265
|
-
`)) {
|
|
1266
|
-
lineHeight = 0;
|
|
1267
|
-
}
|
|
1268
|
-
}
|
|
1269
|
-
if (params.itemJoiner) {
|
|
1270
|
-
return strBuf.join(params.itemJoiner);
|
|
1271
|
-
}
|
|
1272
|
-
return strBuf.join("");
|
|
1273
|
-
}
|
|
1274
|
-
async getHyperlinks(page, viewport) {
|
|
1275
|
-
const result = new Map;
|
|
1276
|
-
const annotations = await page.getAnnotations({ intent: "display" }) || [];
|
|
1277
|
-
for (const i of annotations) {
|
|
1278
|
-
if (i.subtype !== "Link")
|
|
1279
|
-
continue;
|
|
1280
|
-
const url = i.url ?? i.unsafeUrl;
|
|
1281
|
-
if (!url)
|
|
1282
|
-
continue;
|
|
1283
|
-
const text = i.overlaidText;
|
|
1284
|
-
if (!text)
|
|
1285
|
-
continue;
|
|
1286
|
-
const rectVp = viewport.convertToViewportRectangle(i.rect);
|
|
1287
|
-
const left = Math.min(rectVp[0], rectVp[2]) - 0.5;
|
|
1288
|
-
const top = Math.min(rectVp[1], rectVp[3]) - 0.5;
|
|
1289
|
-
const right = Math.max(rectVp[0], rectVp[2]) + 0.5;
|
|
1290
|
-
const bottom = Math.max(rectVp[1], rectVp[3]) + 0.5;
|
|
1291
|
-
const pos = { rect: { left, top, right, bottom }, url, text, used: false };
|
|
1292
|
-
const el = result.get(text);
|
|
1293
|
-
if (el) {
|
|
1294
|
-
el.push(pos);
|
|
1295
|
-
} else {
|
|
1296
|
-
result.set(text, [pos]);
|
|
1297
|
-
}
|
|
1298
|
-
}
|
|
1299
|
-
return result;
|
|
1300
|
-
}
|
|
1301
|
-
async getImage(params = {}) {
|
|
1302
|
-
const doc = await this.load();
|
|
1303
|
-
const result = new ImageResult(doc.numPages);
|
|
1304
|
-
setDefaultParseParameters(params);
|
|
1305
|
-
for (let i = 1;i <= result.total; i++) {
|
|
1306
|
-
if (this.shouldParse(i, result.total, params)) {
|
|
1307
|
-
const page = await doc.getPage(i);
|
|
1308
|
-
const ops = await page.getOperatorList();
|
|
1309
|
-
const pageImages = { pageNumber: i, images: [] };
|
|
1310
|
-
result.pages.push(pageImages);
|
|
1311
|
-
for (let j = 0;j < ops.fnArray.length; j++) {
|
|
1312
|
-
if (ops.fnArray[j] === pdfjs2.OPS.paintInlineImageXObject || ops.fnArray[j] === pdfjs2.OPS.paintImageXObject) {
|
|
1313
|
-
const name = ops.argsArray[j][0];
|
|
1314
|
-
const isCommon = page.commonObjs.has(name);
|
|
1315
|
-
const imgPromise = isCommon ? this.resolveEmbeddedImage(page.commonObjs, name) : this.resolveEmbeddedImage(page.objs, name);
|
|
1316
|
-
const { width, height, kind, data } = await imgPromise;
|
|
1317
|
-
if (params.imageThreshold) {
|
|
1318
|
-
if (params.imageThreshold >= width || params.imageThreshold >= height) {
|
|
1319
|
-
continue;
|
|
1320
|
-
}
|
|
1321
|
-
}
|
|
1322
|
-
const canvasFactory = doc.canvasFactory;
|
|
1323
|
-
const canvasAndContext = canvasFactory.create(width, height);
|
|
1324
|
-
const context = canvasAndContext.context;
|
|
1325
|
-
let imgData = null;
|
|
1326
|
-
if (kind === pdfjs2.ImageKind.RGBA_32BPP) {
|
|
1327
|
-
imgData = context.createImageData(width, height);
|
|
1328
|
-
imgData.data.set(data);
|
|
1329
|
-
} else {
|
|
1330
|
-
imgData = context.createImageData(width, height);
|
|
1331
|
-
this.convertToRGBA({
|
|
1332
|
-
src: data,
|
|
1333
|
-
dest: new Uint32Array(imgData.data.buffer),
|
|
1334
|
-
width,
|
|
1335
|
-
height,
|
|
1336
|
-
kind
|
|
1337
|
-
});
|
|
1338
|
-
}
|
|
1339
|
-
context.putImageData(imgData, 0, 0);
|
|
1340
|
-
let buffer = new Uint8Array;
|
|
1341
|
-
let dataUrl = "";
|
|
1342
|
-
if (typeof canvasAndContext.canvas.toBuffer === "function") {
|
|
1343
|
-
let nodeBuffer;
|
|
1344
|
-
if (params.imageBuffer) {
|
|
1345
|
-
nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
|
|
1346
|
-
buffer = new Uint8Array(nodeBuffer);
|
|
1347
|
-
}
|
|
1348
|
-
if (params.imageDataUrl) {
|
|
1349
|
-
if (nodeBuffer) {
|
|
1350
|
-
dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
|
|
1351
|
-
} else {
|
|
1352
|
-
nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
|
|
1353
|
-
buffer = new Uint8Array(nodeBuffer);
|
|
1354
|
-
dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
|
|
1355
|
-
}
|
|
1356
|
-
}
|
|
1357
|
-
} else {
|
|
1358
|
-
if (params.imageBuffer) {
|
|
1359
|
-
const imageData = canvasAndContext.context.getImageData(0, 0, canvasAndContext.canvas.width, canvasAndContext.canvas.height);
|
|
1360
|
-
buffer = new Uint8Array(imageData.data);
|
|
1361
|
-
}
|
|
1362
|
-
if (params.imageDataUrl) {
|
|
1363
|
-
dataUrl = canvasAndContext.canvas.toDataURL("image/png");
|
|
1364
|
-
}
|
|
1365
|
-
}
|
|
1366
|
-
pageImages.images.push({
|
|
1367
|
-
data: buffer,
|
|
1368
|
-
dataUrl,
|
|
1369
|
-
name,
|
|
1370
|
-
height,
|
|
1371
|
-
width,
|
|
1372
|
-
kind
|
|
1373
|
-
});
|
|
1374
|
-
}
|
|
1375
|
-
}
|
|
1376
|
-
}
|
|
1377
|
-
}
|
|
1378
|
-
return result;
|
|
1379
|
-
}
|
|
1380
|
-
convertToRGBA({ src, dest, width, height, kind }) {
|
|
1381
|
-
if (kind === pdfjs2.ImageKind.RGB_24BPP) {
|
|
1382
|
-
for (let i = 0, j = 0;i < src.length; i += 3, j++) {
|
|
1383
|
-
const r = src[i];
|
|
1384
|
-
const g = src[i + 1];
|
|
1385
|
-
const b = src[i + 2];
|
|
1386
|
-
dest[j] = 255 << 24 | b << 16 | g << 8 | r;
|
|
1387
|
-
}
|
|
1388
|
-
} else if (kind === pdfjs2.ImageKind.GRAYSCALE_1BPP) {
|
|
1389
|
-
let pixelIndex = 0;
|
|
1390
|
-
for (let i = 0;i < src.length; i++) {
|
|
1391
|
-
const byte = src[i];
|
|
1392
|
-
for (let bit = 7;bit >= 0; bit--) {
|
|
1393
|
-
if (pixelIndex >= width * height)
|
|
1394
|
-
break;
|
|
1395
|
-
const isWhite = (byte >> bit & 1) === 1;
|
|
1396
|
-
const gray = isWhite ? 255 : 0;
|
|
1397
|
-
dest[pixelIndex++] = 255 << 24 | gray << 16 | gray << 8 | gray;
|
|
1398
|
-
}
|
|
1399
|
-
}
|
|
1400
|
-
} else if (kind === undefined || kind === null) {
|
|
1401
|
-
const bytesPerPixel = src.length / (width * height);
|
|
1402
|
-
if (Math.abs(bytesPerPixel - 3) < 0.1) {
|
|
1403
|
-
for (let i = 0, j = 0;i < src.length; i += 3, j++) {
|
|
1404
|
-
const r = src[i];
|
|
1405
|
-
const g = src[i + 1];
|
|
1406
|
-
const b = src[i + 2];
|
|
1407
|
-
dest[j] = 255 << 24 | b << 16 | g << 8 | r;
|
|
1408
|
-
}
|
|
1409
|
-
} else if (Math.abs(bytesPerPixel - 4) < 0.1) {
|
|
1410
|
-
for (let i = 0, j = 0;i < src.length; i += 4, j++) {
|
|
1411
|
-
const r = src[i];
|
|
1412
|
-
const g = src[i + 1];
|
|
1413
|
-
const b = src[i + 2];
|
|
1414
|
-
const a = src[i + 3];
|
|
1415
|
-
dest[j] = a << 24 | b << 16 | g << 8 | r;
|
|
1416
|
-
}
|
|
1417
|
-
} else if (Math.abs(bytesPerPixel - 1) < 0.1) {
|
|
1418
|
-
for (let i = 0;i < src.length; i++) {
|
|
1419
|
-
const gray = src[i];
|
|
1420
|
-
dest[i] = 255 << 24 | gray << 16 | gray << 8 | gray;
|
|
1421
|
-
}
|
|
1422
|
-
} else {
|
|
1423
|
-
throw new Error(`convertToRGBA: Cannot infer image format. kind: ${kind}, bytesPerPixel: ${bytesPerPixel}, width: ${width}, height: ${height}, dataLength: ${src.length}`);
|
|
1424
|
-
}
|
|
1425
|
-
} else {
|
|
1426
|
-
throw new Error(`convertToRGBA: Unsupported image kind: ${kind}. Available kinds: GRAYSCALE_1BPP=${pdfjs2.ImageKind.GRAYSCALE_1BPP}, RGB_24BPP=${pdfjs2.ImageKind.RGB_24BPP}, RGBA_32BPP=${pdfjs2.ImageKind.RGBA_32BPP}`);
|
|
1427
|
-
}
|
|
1428
|
-
}
|
|
1429
|
-
resolveEmbeddedImage(pdfObjects, name) {
|
|
1430
|
-
return new Promise((resolve, reject) => {
|
|
1431
|
-
pdfObjects.get(name, (imgData) => {
|
|
1432
|
-
if (imgData) {
|
|
1433
|
-
let dataBuff;
|
|
1434
|
-
if (imgData.data instanceof Uint8Array) {
|
|
1435
|
-
dataBuff = imgData.data;
|
|
1436
|
-
} else if (imgData.data instanceof Uint8ClampedArray) {
|
|
1437
|
-
dataBuff = new Uint8Array(imgData.data);
|
|
1438
|
-
} else if (imgData.data?.buffer) {
|
|
1439
|
-
dataBuff = new Uint8Array(imgData.data.buffer);
|
|
1440
|
-
} else if (imgData.bitmap) {
|
|
1441
|
-
const canvasFactory = this.doc.canvasFactory;
|
|
1442
|
-
const canvasAndContext = canvasFactory.create(imgData.bitmap.width, imgData.bitmap.height);
|
|
1443
|
-
canvasAndContext.context.drawImage(imgData.bitmap, 0, 0);
|
|
1444
|
-
const imageData = canvasAndContext.context.getImageData(0, 0, imgData.bitmap.width, imgData.bitmap.height);
|
|
1445
|
-
dataBuff = new Uint8Array(imageData.data.buffer);
|
|
1446
|
-
} else if (ArrayBuffer.isView(imgData.data)) {
|
|
1447
|
-
dataBuff = new Uint8Array(imgData.data.buffer, imgData.data.byteOffset, imgData.data.byteLength);
|
|
1448
|
-
}
|
|
1449
|
-
if (!dataBuff) {
|
|
1450
|
-
reject(new Error(`Image object ${name}: data field is empty or invalid. Available fields: ${Object.keys(imgData).join(", ")}`));
|
|
1451
|
-
return;
|
|
1452
|
-
}
|
|
1453
|
-
if (dataBuff.length === 0) {
|
|
1454
|
-
reject(new Error(`Image object ${name}: data buffer is empty (length: 0)`));
|
|
1455
|
-
return;
|
|
1456
|
-
}
|
|
1457
|
-
resolve({ width: imgData.width, height: imgData.height, kind: imgData.kind, data: dataBuff });
|
|
1458
|
-
} else {
|
|
1459
|
-
reject(new Error(`Image object ${name} not found`));
|
|
1460
|
-
}
|
|
1461
|
-
});
|
|
1462
|
-
});
|
|
1463
|
-
}
|
|
1464
|
-
async getScreenshot(parseParams = {}) {
|
|
1465
|
-
const params = setDefaultParseParameters(parseParams);
|
|
1466
|
-
const doc = await this.load();
|
|
1467
|
-
const result = new ScreenshotResult(doc.numPages);
|
|
1468
|
-
if (this.doc === undefined) {
|
|
1469
|
-
throw new Error("PDF document not loaded");
|
|
1470
|
-
}
|
|
1471
|
-
for (let i = 1;i <= result.total; i++) {
|
|
1472
|
-
if (this.shouldParse(i, result.total, params)) {
|
|
1473
|
-
const page = await this.doc.getPage(i);
|
|
1474
|
-
let viewport = page.getViewport({ scale: params.scale });
|
|
1475
|
-
if (params.desiredWidth) {
|
|
1476
|
-
viewport = page.getViewport({ scale: 1 });
|
|
1477
|
-
const scale = params.desiredWidth / viewport.width;
|
|
1478
|
-
viewport = page.getViewport({ scale });
|
|
1479
|
-
}
|
|
1480
|
-
const canvasFactory = this.doc.canvasFactory;
|
|
1481
|
-
const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
|
|
1482
|
-
const renderContext = {
|
|
1483
|
-
canvasContext: canvasAndContext.context,
|
|
1484
|
-
viewport,
|
|
1485
|
-
canvas: canvasAndContext.canvas
|
|
1486
|
-
};
|
|
1487
|
-
const renderTask = page.render(renderContext);
|
|
1488
|
-
await renderTask.promise;
|
|
1489
|
-
let data = new Uint8Array;
|
|
1490
|
-
let dataUrl = "";
|
|
1491
|
-
if (typeof canvasAndContext.canvas.toBuffer === "function") {
|
|
1492
|
-
let nodeBuffer;
|
|
1493
|
-
if (params.imageBuffer) {
|
|
1494
|
-
nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
|
|
1495
|
-
data = new Uint8Array(nodeBuffer);
|
|
1496
|
-
}
|
|
1497
|
-
if (params.imageDataUrl) {
|
|
1498
|
-
if (nodeBuffer) {
|
|
1499
|
-
dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
|
|
1500
|
-
} else {
|
|
1501
|
-
nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
|
|
1502
|
-
data = new Uint8Array(nodeBuffer);
|
|
1503
|
-
dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
|
|
1504
|
-
}
|
|
1505
|
-
}
|
|
1506
|
-
} else {
|
|
1507
|
-
if (params.imageBuffer) {
|
|
1508
|
-
const imageData = canvasAndContext.context.getImageData(0, 0, canvasAndContext.canvas.width, canvasAndContext.canvas.height);
|
|
1509
|
-
data = new Uint8Array(imageData.data);
|
|
1510
|
-
}
|
|
1511
|
-
if (params.imageDataUrl) {
|
|
1512
|
-
dataUrl = canvasAndContext.canvas.toDataURL("image/png");
|
|
1513
|
-
}
|
|
1514
|
-
}
|
|
1515
|
-
result.pages.push({
|
|
1516
|
-
data,
|
|
1517
|
-
dataUrl,
|
|
1518
|
-
pageNumber: i,
|
|
1519
|
-
width: viewport.width,
|
|
1520
|
-
height: viewport.height,
|
|
1521
|
-
scale: viewport.scale
|
|
1522
|
-
});
|
|
1523
|
-
page.cleanup();
|
|
1524
|
-
}
|
|
1525
|
-
}
|
|
1526
|
-
return result;
|
|
1527
|
-
}
|
|
1528
|
-
async getTable(params = {}) {
|
|
1529
|
-
const doc = await this.load();
|
|
1530
|
-
const result = new TableResult(doc.numPages);
|
|
1531
|
-
if (this.doc === undefined) {
|
|
1532
|
-
throw new Error("PDF document not loaded");
|
|
1533
|
-
}
|
|
1534
|
-
for (let i = 1;i <= result.total; i++) {
|
|
1535
|
-
if (this.shouldParse(i, result.total, params)) {
|
|
1536
|
-
const page = await this.doc.getPage(i);
|
|
1537
|
-
const store = await this.getPageTables(page);
|
|
1538
|
-
store.normalize();
|
|
1539
|
-
const tableDataArr = store.getTableData();
|
|
1540
|
-
await this.fillPageTables(page, tableDataArr);
|
|
1541
|
-
const pageTableResult = { num: i, tables: [] };
|
|
1542
|
-
for (const table of tableDataArr) {
|
|
1543
|
-
pageTableResult.tables.push(table.toArray());
|
|
1544
|
-
}
|
|
1545
|
-
result.pages.push(pageTableResult);
|
|
1546
|
-
page.cleanup();
|
|
1547
|
-
}
|
|
1548
|
-
}
|
|
1549
|
-
return result;
|
|
1550
|
-
}
|
|
1551
|
-
getPathGeometry(mm) {
|
|
1552
|
-
const width = mm[2] - mm[0];
|
|
1553
|
-
const height = mm[3] - mm[1];
|
|
1554
|
-
if (mm[0] === Infinity) {
|
|
1555
|
-
return PathGeometry.undefined;
|
|
1556
|
-
}
|
|
1557
|
-
if (width > 5 && height > 5) {
|
|
1558
|
-
return PathGeometry.rectangle;
|
|
1559
|
-
} else if (width > 5 && height === 0) {
|
|
1560
|
-
return PathGeometry.hline;
|
|
1561
|
-
} else if (width === 0 && height > 5) {
|
|
1562
|
-
return PathGeometry.vline;
|
|
1563
|
-
}
|
|
1564
|
-
return PathGeometry.undefined;
|
|
1565
|
-
}
|
|
1566
|
-
async getPageTables(page) {
|
|
1567
|
-
const lineStore = new LineStore;
|
|
1568
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
1569
|
-
let transformMatrix = [1, 0, 0, 1, 0, 0];
|
|
1570
|
-
const transformStack = [];
|
|
1571
|
-
const opList = await page.getOperatorList();
|
|
1572
|
-
for (let i = 0;i < opList.fnArray.length; i++) {
|
|
1573
|
-
const fn = opList.fnArray[i];
|
|
1574
|
-
const args = opList.argsArray[i];
|
|
1575
|
-
const op = args?.[0] ?? 0;
|
|
1576
|
-
const mm = args?.[2] ?? [Infinity, Infinity, -Infinity, -Infinity];
|
|
1577
|
-
if (fn === pdfjs2.OPS.constructPath) {
|
|
1578
|
-
if (op === pdfjs2.OPS.fill) {}
|
|
1579
|
-
if (op !== pdfjs2.OPS.stroke) {
|
|
1580
|
-
continue;
|
|
1581
|
-
}
|
|
1582
|
-
const pg = this.getPathGeometry(mm);
|
|
1583
|
-
if (pg === PathGeometry.rectangle) {
|
|
1584
|
-
const rect = new Rectangle(new Point(mm[0], mm[1]), mm[2] - mm[0], mm[3] - mm[1]);
|
|
1585
|
-
rect.transform(transformMatrix);
|
|
1586
|
-
rect.transform(viewport.transform);
|
|
1587
|
-
lineStore.addRectangle(rect);
|
|
1588
|
-
} else if (pg === PathGeometry.hline || pg === PathGeometry.vline) {
|
|
1589
|
-
const from = new Point(mm[0], mm[1]);
|
|
1590
|
-
const to = new Point(mm[2], mm[3]);
|
|
1591
|
-
const line = new Line(from, to);
|
|
1592
|
-
line.transform(transformMatrix);
|
|
1593
|
-
line.transform(viewport.transform);
|
|
1594
|
-
lineStore.add(line);
|
|
1595
|
-
} else {}
|
|
1596
|
-
} else if (fn === pdfjs2.OPS.setLineWidth) {} else if (fn === pdfjs2.OPS.save) {
|
|
1597
|
-
transformStack.push(transformMatrix);
|
|
1598
|
-
} else if (fn === pdfjs2.OPS.restore) {
|
|
1599
|
-
const restoredMatrix = transformStack.pop();
|
|
1600
|
-
if (restoredMatrix) {
|
|
1601
|
-
transformMatrix = restoredMatrix;
|
|
1602
|
-
}
|
|
1603
|
-
} else if (fn === pdfjs2.OPS.transform) {
|
|
1604
|
-
transformMatrix = pdfjs2.Util.transform(transformMatrix, args);
|
|
1605
|
-
}
|
|
1606
|
-
}
|
|
1607
|
-
return lineStore;
|
|
1608
|
-
}
|
|
1609
|
-
async fillPageTables(page, pageTables) {
|
|
1610
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
1611
|
-
const textContent = await page.getTextContent({
|
|
1612
|
-
includeMarkedContent: false,
|
|
1613
|
-
disableNormalization: false
|
|
1614
|
-
});
|
|
1615
|
-
for (const textItem of textContent.items) {
|
|
1616
|
-
if (!("str" in textItem))
|
|
1617
|
-
continue;
|
|
1618
|
-
const tx = pdfjs2.Util.transform(pdfjs2.Util.transform(viewport.transform, textItem.transform), [1, 0, 0, -1, 0, 0]);
|
|
1619
|
-
for (const pageTable of pageTables) {
|
|
1620
|
-
const cell = pageTable.findCell(tx[4], tx[5]);
|
|
1621
|
-
if (cell) {
|
|
1622
|
-
cell.text.push(textItem.str);
|
|
1623
|
-
if (textItem.hasEOL) {
|
|
1624
|
-
cell.text.push(`
|
|
1625
|
-
`);
|
|
1626
|
-
}
|
|
1627
|
-
break;
|
|
1628
|
-
}
|
|
1629
|
-
}
|
|
1630
|
-
}
|
|
1631
|
-
}
|
|
74
|
+
var import_pdfjs_dist = __toESM(require("pdfjs-dist"));
|
|
75
|
+
var { getDocument } = import_pdfjs_dist.default;
|
|
76
|
+
function isTextItem(item) {
|
|
77
|
+
return "str" in item;
|
|
1632
78
|
}
|
|
1633
79
|
|
|
1634
|
-
// node_modules/pdf-parse/dist/pdf-parse/esm/index.js
|
|
1635
|
-
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
1636
|
-
|
|
1637
|
-
// services/pdf.ts
|
|
1638
|
-
var pdfParse = undefined || exports_esm;
|
|
1639
|
-
|
|
1640
80
|
class PdfService extends import_core.Service {
|
|
1641
81
|
static serviceType = import_core.ServiceType.PDF;
|
|
1642
82
|
capabilityDescription = "The agent is able to convert PDF files to text";
|
|
1643
|
-
static async start(
|
|
1644
|
-
|
|
83
|
+
static async start(runtime) {
|
|
84
|
+
const service = new PdfService(runtime);
|
|
85
|
+
return service;
|
|
1645
86
|
}
|
|
1646
87
|
static async stop(runtime) {
|
|
1647
|
-
const service =
|
|
88
|
+
const service = runtime.getService(import_core.ServiceType.PDF);
|
|
1648
89
|
if (service) {
|
|
1649
|
-
await service.stop
|
|
90
|
+
await service.stop();
|
|
1650
91
|
}
|
|
1651
92
|
}
|
|
1652
93
|
async stop() {}
|
|
1653
94
|
async convertPdfToText(pdfBuffer) {
|
|
1654
95
|
try {
|
|
1655
|
-
const
|
|
1656
|
-
|
|
96
|
+
const uint8Array = new Uint8Array(pdfBuffer);
|
|
97
|
+
const pdf = await getDocument({ data: uint8Array }).promise;
|
|
98
|
+
const numPages = pdf.numPages;
|
|
99
|
+
const textPages = [];
|
|
100
|
+
for (let pageNum = 1;pageNum <= numPages; pageNum++) {
|
|
101
|
+
const page = await pdf.getPage(pageNum);
|
|
102
|
+
const textContent = await page.getTextContent();
|
|
103
|
+
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
|
|
104
|
+
textPages.push(pageText);
|
|
105
|
+
}
|
|
106
|
+
const rawText = textPages.join(`
|
|
107
|
+
`);
|
|
108
|
+
return this.cleanUpContent(rawText);
|
|
1657
109
|
} catch (error) {
|
|
1658
|
-
import_core.logger.error(`PdfService: Failed to convert PDF to text - error: ${error}`);
|
|
110
|
+
import_core.logger.error(`PdfService: Failed to convert PDF to text - error: ${error}, bufferSize: ${pdfBuffer.length}`);
|
|
1659
111
|
throw error;
|
|
1660
112
|
}
|
|
1661
113
|
}
|
|
1662
114
|
async convertPdfToTextWithOptions(pdfBuffer, options = {}) {
|
|
1663
115
|
try {
|
|
1664
|
-
const
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
const
|
|
1669
|
-
|
|
116
|
+
const uint8Array = new Uint8Array(pdfBuffer);
|
|
117
|
+
const pdf = await getDocument({ data: uint8Array }).promise;
|
|
118
|
+
const numPages = pdf.numPages;
|
|
119
|
+
const startPage = Math.max(1, options.startPage || 1);
|
|
120
|
+
const endPage = Math.min(numPages, options.endPage || numPages);
|
|
121
|
+
const textPages = [];
|
|
122
|
+
for (let pageNum = startPage;pageNum <= endPage; pageNum++) {
|
|
123
|
+
const page = await pdf.getPage(pageNum);
|
|
124
|
+
const textContent = await page.getTextContent();
|
|
125
|
+
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(options.preserveWhitespace ? "" : " ");
|
|
126
|
+
textPages.push(pageText);
|
|
127
|
+
}
|
|
128
|
+
let text = textPages.join(`
|
|
129
|
+
`);
|
|
1670
130
|
if (options.cleanContent !== false) {
|
|
1671
131
|
text = this.cleanUpContent(text);
|
|
1672
132
|
}
|
|
1673
133
|
return {
|
|
1674
134
|
success: true,
|
|
1675
135
|
text,
|
|
1676
|
-
pageCount:
|
|
136
|
+
pageCount: numPages
|
|
1677
137
|
};
|
|
1678
138
|
} catch (error) {
|
|
1679
139
|
return {
|
|
@@ -1683,30 +143,54 @@ class PdfService extends import_core.Service {
|
|
|
1683
143
|
}
|
|
1684
144
|
}
|
|
1685
145
|
async getDocumentInfo(pdfBuffer) {
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
146
|
+
const uint8Array = new Uint8Array(pdfBuffer);
|
|
147
|
+
const pdf = await getDocument({ data: uint8Array }).promise;
|
|
148
|
+
const numPages = pdf.numPages;
|
|
149
|
+
const metadataResult = await pdf.getMetadata();
|
|
150
|
+
const info = metadataResult.info;
|
|
151
|
+
const metadata = {
|
|
152
|
+
title: info.Title,
|
|
153
|
+
author: info.Author,
|
|
154
|
+
subject: info.Subject,
|
|
155
|
+
keywords: info.Keywords,
|
|
156
|
+
creator: info.Creator,
|
|
157
|
+
producer: info.Producer,
|
|
158
|
+
creationDate: info.CreationDate ? new Date(info.CreationDate) : undefined,
|
|
159
|
+
modificationDate: info.ModDate ? new Date(info.ModDate) : undefined
|
|
160
|
+
};
|
|
161
|
+
const pages = [];
|
|
162
|
+
const allText = [];
|
|
163
|
+
for (let pageNum = 1;pageNum <= numPages; pageNum++) {
|
|
164
|
+
const page = await pdf.getPage(pageNum);
|
|
165
|
+
const viewport = page.getViewport({ scale: 1 });
|
|
166
|
+
const textContent = await page.getTextContent();
|
|
167
|
+
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
|
|
168
|
+
pages.push({
|
|
169
|
+
pageNumber: pageNum,
|
|
170
|
+
width: viewport.width,
|
|
171
|
+
height: viewport.height,
|
|
172
|
+
text: this.cleanUpContent(pageText)
|
|
173
|
+
});
|
|
174
|
+
allText.push(pageText);
|
|
175
|
+
}
|
|
176
|
+
return {
|
|
177
|
+
pageCount: numPages,
|
|
178
|
+
metadata,
|
|
179
|
+
text: this.cleanUpContent(allText.join(`
|
|
180
|
+
`)),
|
|
181
|
+
pages
|
|
182
|
+
};
|
|
1703
183
|
}
|
|
1704
184
|
cleanUpContent(content) {
|
|
1705
185
|
try {
|
|
1706
|
-
const
|
|
186
|
+
const filtered = content.split("").filter((char) => {
|
|
187
|
+
const charCode = char.charCodeAt(0);
|
|
188
|
+
return !(charCode === 0 || charCode >= 1 && charCode <= 8 || charCode >= 11 && charCode <= 12 || charCode >= 14 && charCode <= 31 || charCode === 127);
|
|
189
|
+
}).join("");
|
|
190
|
+
const cleaned = filtered.replace(/[^\S\r\n]+/g, " ").replace(/[ \t]+(\r?\n)/g, "$1").trim();
|
|
1707
191
|
return cleaned;
|
|
1708
192
|
} catch (error) {
|
|
1709
|
-
import_core.logger.error(`PdfService: Failed to clean up content - error: ${error}`);
|
|
193
|
+
import_core.logger.error(`PdfService: Failed to clean up content - error: ${error}, contentLength: ${content.length}`);
|
|
1710
194
|
return content;
|
|
1711
195
|
}
|
|
1712
196
|
}
|
|
@@ -1720,4 +204,4 @@ var pdfPlugin = {
|
|
|
1720
204
|
};
|
|
1721
205
|
var typescript_default = pdfPlugin;
|
|
1722
206
|
|
|
1723
|
-
//# debugId=
|
|
207
|
+
//# debugId=0A7FDC6F763358C464756E2164756E21
|