@elizaos/plugin-pdf 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/index.browser.js +27 -5
- package/dist/browser/index.browser.js.map +5 -4
- package/dist/browser/index.d.ts +2 -0
- package/dist/cjs/index.d.ts +2 -0
- package/dist/cjs/index.node.cjs +1618 -82
- package/dist/cjs/index.node.js.map +21 -4
- package/dist/node/index.d.ts +2 -0
- package/dist/node/index.node.js +1634 -83
- package/dist/node/index.node.js.map +21 -4
- package/dist/services/pdf.d.ts +2 -3
- package/dist/services/pdf.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +10 -8
package/dist/cjs/index.node.cjs
CHANGED
|
@@ -64,76 +64,1637 @@ var __export = (target, all) => {
|
|
|
64
64
|
var exports_index_node = {};
|
|
65
65
|
__export(exports_index_node, {
|
|
66
66
|
pdfPlugin: () => pdfPlugin,
|
|
67
|
-
default: () =>
|
|
67
|
+
default: () => import__.default,
|
|
68
68
|
PdfService: () => PdfService
|
|
69
69
|
});
|
|
70
70
|
module.exports = __toCommonJS(exports_index_node);
|
|
71
71
|
|
|
72
72
|
// services/pdf.ts
|
|
73
73
|
var import_core = require("@elizaos/core");
|
|
74
|
+
|
|
75
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/index.js
|
|
76
|
+
var exports_esm = {};
|
|
77
|
+
__export(exports_esm, {
|
|
78
|
+
getException: () => getException,
|
|
79
|
+
VerbosityLevel: () => import_pdf.VerbosityLevel,
|
|
80
|
+
UnknownErrorException: () => UnknownErrorException,
|
|
81
|
+
Table: () => Table,
|
|
82
|
+
Shape: () => Shape,
|
|
83
|
+
ResponseException: () => ResponseException,
|
|
84
|
+
Rectangle: () => Rectangle,
|
|
85
|
+
Point: () => Point,
|
|
86
|
+
PasswordException: () => PasswordException,
|
|
87
|
+
PDFParse: () => PDFParse,
|
|
88
|
+
LineStore: () => LineStore,
|
|
89
|
+
LineDirection: () => LineDirection,
|
|
90
|
+
Line: () => Line,
|
|
91
|
+
InvalidPDFException: () => InvalidPDFException,
|
|
92
|
+
FormatError: () => FormatError,
|
|
93
|
+
AbortException: () => AbortException
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js
|
|
97
|
+
var pdfjs2 = __toESM(require("pdfjs-dist/legacy/build/pdf.mjs"));
|
|
98
|
+
|
|
99
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/Exception.js
|
|
100
|
+
/**
 * Error subclass named "InvalidPDFException" (default message "Invalid PDF").
 */
class InvalidPDFException extends Error {
  /**
   * @param {string} [message] - Error text; falls back to "Invalid PDF" when null/undefined.
   * @param {unknown} [cause] - Underlying error, exposed as `cause` only when provided.
   */
  constructor(message, cause) {
    // Pass an options bag only when a cause exists so no own `cause` property is created otherwise.
    super(message ?? "Invalid PDF", cause !== undefined ? { cause } : undefined);
    this.name = "InvalidPDFException";
    // new.target keeps subclasses intact; pinning this class's prototype would
    // make `new Sub() instanceof Sub` false.
    Object.setPrototypeOf(this, new.target.prototype);
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, new.target);
    }
  }
}
|
|
114
|
+
|
|
115
|
+
/**
 * Error subclass named "PasswordException"
 * (default message "Password required or incorrect").
 */
class PasswordException extends Error {
  /**
   * @param {string} [message] - Error text; falls back to the default when null/undefined.
   * @param {unknown} [cause] - Underlying error, exposed as `cause` only when provided.
   */
  constructor(message, cause) {
    // Pass an options bag only when a cause exists so no own `cause` property is created otherwise.
    super(message ?? "Password required or incorrect", cause !== undefined ? { cause } : undefined);
    this.name = "PasswordException";
    // new.target keeps subclasses intact; pinning this class's prototype would
    // make `new Sub() instanceof Sub` false.
    Object.setPrototypeOf(this, new.target.prototype);
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, new.target);
    }
  }
}
|
|
129
|
+
|
|
130
|
+
/**
 * Error subclass named "FormatError" (default message "PDF format error").
 */
class FormatError extends Error {
  /**
   * @param {string} [message] - Error text; falls back to "PDF format error" when null/undefined.
   * @param {unknown} [cause] - Underlying error, exposed as `cause` only when provided.
   */
  constructor(message, cause) {
    // Pass an options bag only when a cause exists so no own `cause` property is created otherwise.
    super(message ?? "PDF format error", cause !== undefined ? { cause } : undefined);
    this.name = "FormatError";
    // new.target keeps subclasses intact; pinning this class's prototype would
    // make `new Sub() instanceof Sub` false.
    Object.setPrototypeOf(this, new.target.prototype);
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, new.target);
    }
  }
}
|
|
144
|
+
|
|
145
|
+
/**
 * Error subclass named "UnknownErrorException" (default message "Unknown error")
 * that also carries a caller-supplied `details` value.
 */
class UnknownErrorException extends Error {
  /**
   * @param {string} [message] - Error text; falls back to "Unknown error" when null/undefined.
   * @param {unknown} [details] - Stored verbatim on `this.details`.
   * @param {unknown} [cause] - Underlying error, exposed as `cause` only when provided.
   */
  constructor(message, details, cause) {
    // Pass an options bag only when a cause exists so no own `cause` property is created otherwise.
    super(message ?? "Unknown error", cause !== undefined ? { cause } : undefined);
    this.name = "UnknownErrorException";
    // new.target keeps subclasses intact; pinning this class's prototype would
    // make `new Sub() instanceof Sub` false.
    Object.setPrototypeOf(this, new.target.prototype);
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, new.target);
    }
    this.details = details;
  }
}
|
|
160
|
+
|
|
161
|
+
/**
 * Error subclass named "ResponseException" (default message "Response error")
 * that also carries caller-supplied `status` and `missing` values.
 */
class ResponseException extends Error {
  /**
   * @param {string} [message] - Error text; falls back to "Response error" when null/undefined.
   * @param {unknown} [status] - Stored verbatim on `this.status`.
   * @param {unknown} [missing] - Stored verbatim on `this.missing`.
   * @param {unknown} [cause] - Underlying error, exposed as `cause` only when provided.
   */
  constructor(message, status, missing, cause) {
    // Pass an options bag only when a cause exists so no own `cause` property is created otherwise.
    super(message ?? "Response error", cause !== undefined ? { cause } : undefined);
    this.name = "ResponseException";
    // new.target keeps subclasses intact; pinning this class's prototype would
    // make `new Sub() instanceof Sub` false.
    Object.setPrototypeOf(this, new.target.prototype);
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, new.target);
    }
    this.status = status;
    this.missing = missing;
  }
}
|
|
177
|
+
|
|
178
|
+
/**
 * Error subclass named "AbortException" (default message "Operation aborted").
 */
class AbortException extends Error {
  /**
   * @param {string} [message] - Error text; falls back to "Operation aborted" when null/undefined.
   * @param {unknown} [cause] - Underlying error, exposed as `cause` only when provided.
   */
  constructor(message, cause) {
    // Pass an options bag only when a cause exists so no own `cause` property is created otherwise.
    super(message ?? "Operation aborted", cause !== undefined ? { cause } : undefined);
    this.name = "AbortException";
    // new.target keeps subclasses intact; pinning this class's prototype would
    // make `new Sub() instanceof Sub` false.
    Object.setPrototypeOf(this, new.target.prototype);
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, new.target);
    }
  }
}
|
|
192
|
+
/**
 * Converts an arbitrary thrown value into an Error, re-wrapping it in this
 * module's exception class when its `name` matches one of them.
 *
 * The match is on `error.name`, not `instanceof`.
 * NOTE(review): presumably so errors created by another library with the same
 * names are recognised — confirm against the caller.
 *
 * @param {unknown} error - The caught value.
 * @returns {Error} A new wrapper (original kept as `cause`), the original Error
 *   when its name is not recognised, or `new Error(String(error))` for non-Errors.
 */
function getException(error) {
  if (!(error instanceof Error)) {
    return new Error(String(error));
  }
  // Factories are only invoked for the matching name.
  const wrappers = new Map([
    ["InvalidPDFException", (e) => new InvalidPDFException(e.message, e)],
    ["PasswordException", (e) => new PasswordException(e.message, e)],
    ["FormatError", (e) => new FormatError(e.message, e)],
    ["UnknownErrorException", (e) => new UnknownErrorException(e.message, e.details, e)],
    ["ResponseException", (e) => new ResponseException(e.message, e.status, e.missing, e)],
    ["AbortException", (e) => new AbortException(e.message, e)]
  ]);
  const wrap = wrappers.get(error.name);
  return wrap === undefined ? error : wrap(error);
}
|
|
213
|
+
|
|
214
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Shape.js
|
|
215
|
+
/**
 * Base class for the geometry helpers; holds the shared comparison tolerance
 * and the affine-transform routine.
 */
class Shape {
  /** Distance below which coordinates are treated as equal by subclasses. */
  static tolerance = 2;

  /**
   * Applies a 6-element transform `m` to the point `p`.
   *
   * @param {number[]} p - `[x, y]`.
   * @param {number[]} m - Indexed 0..5; result is `[x*m0 + y*m2 + m4, x*m1 + y*m3 + m5]`.
   * @returns {number[]} The transformed `[x, y]` pair.
   */
  static applyTransform(p, m) {
    const px = p[0];
    const py = p[1];
    return [
      px * m[0] + py * m[2] + m[4],
      px * m[1] + py * m[3] + m[5]
    ];
  }
}
|
|
223
|
+
|
|
224
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Point.js
|
|
225
|
+
/**
 * Mutable 2D point.
 */
class Point extends Shape {
  x;
  y;

  /**
   * @param {number} x
   * @param {number} y
   */
  constructor(x, y) {
    super();
    this.x = x;
    this.y = y;
  }

  /**
   * Exact (strict-equality) coordinate comparison; no tolerance is applied.
   *
   * @param {{x: number, y: number}} point
   * @returns {boolean}
   */
  equal(point) {
    if (point.x !== this.x) {
      return false;
    }
    return point.y === this.y;
  }

  /**
   * Transforms this point in place with `Shape.applyTransform`.
   *
   * @param {number[]} matrix - 6-element transform.
   * @returns {this} The same, now-updated, instance.
   */
  transform(matrix) {
    const moved = Shape.applyTransform([this.x, this.y], matrix);
    this.x = moved[0];
    this.y = moved[1];
    return this;
  }
}
|
|
243
|
+
|
|
244
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Line.js
|
|
245
|
+
/**
 * Two-way enum: name -> number and number -> name
 * (None = 0, Horizontal = 1, Vertical = 2).
 */
var LineDirection;
LineDirection ||= {};
for (const [label, value] of [["None", 0], ["Horizontal", 1], ["Vertical", 2]]) {
  LineDirection[label] = value;
  LineDirection[value] = label;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Line segment between two Points that is classified as horizontal or vertical
 * when its endpoints differ by less than `Shape.tolerance` on the other axis.
 * Segments matching neither test keep `LineDirection.None`.
 */
class Line extends Shape {
  from;
  to;
  direction = LineDirection.None;
  length = 0;
  /** Crossing points recorded by `intersection()`, deduplicated with `Point#equal`. */
  intersections = [];
  /** Sub-segments registered through `addGap()`. */
  gaps = [];
  /** Cache for the `valid` getter; `undefined` means "not computed yet". */
  _valid = undefined;

  /**
   * @param {Point} from
   * @param {Point} to - May be adjusted in place: `init()` snaps it onto `from`'s axis.
   */
  constructor(from, to) {
    super();
    this.from = from;
    this.to = to;
    this.init();
  }

  /**
   * Recomputes direction, endpoint order and length from `this.from`/`this.to`.
   *
   * All derived state is reset first: `transform()` calls this again, and
   * without the reset a segment that stops being axis-aligned (or changes
   * length) would keep its previous direction, length and cached validity.
   */
  init() {
    this.direction = LineDirection.None;
    this.length = 0;
    this._valid = undefined;
    let from = this.from;
    let to = this.to;
    if (Math.abs(from.y - to.y) < Shape.tolerance) {
      this.direction = LineDirection.Horizontal;
      // Snap onto a single y; this mutates the Point object supplied as `to`.
      to.y = from.y;
      if (from.x > to.x) {
        [from, to] = [to, from];
      }
      this.length = to.x - from.x;
    } else if (Math.abs(from.x - to.x) < Shape.tolerance) {
      this.direction = LineDirection.Vertical;
      // Snap onto a single x; this mutates the Point object supplied as `to`.
      to.x = from.x;
      if (from.y > to.y) {
        [from, to] = [to, from];
      }
      this.length = to.y - from.y;
    }
    this.from = from;
    this.to = to;
  }

  /**
   * True when the line has a direction and is longer than `Shape.tolerance`.
   * The result is cached until the next `init()`.
   *
   * @returns {boolean}
   */
  get valid() {
    if (this._valid === undefined) {
      this._valid = this.direction !== LineDirection.None && this.length > Shape.tolerance;
    }
    return this._valid;
  }

  /**
   * A copy extended by `Shape.tolerance` at both ends along its own axis.
   * Lines without a direction are returned as-is (same instance).
   *
   * @returns {Line}
   */
  get normalized() {
    const pad = Shape.tolerance;
    if (this.direction === LineDirection.Horizontal) {
      return new Line(new Point(this.from.x - pad, this.from.y), new Point(this.to.x + pad, this.from.y));
    }
    if (this.direction === LineDirection.Vertical) {
      return new Line(new Point(this.from.x, this.from.y - pad), new Point(this.from.x, this.to.y + pad));
    }
    return this;
  }

  /**
   * Registers a gap segment on this line.
   *
   * @param {Line} line
   */
  addGap(line) {
    this.gaps.push(line);
  }

  /**
   * Exact containment test (no tolerance) along the line's axis.
   *
   * @param {{x: number, y: number}} p
   * @returns {boolean} Always false for lines without a direction.
   */
  containsPoint(p) {
    if (this.direction === LineDirection.Vertical) {
      return this.from.x === p.x && p.y >= this.from.y && p.y <= this.to.y;
    }
    if (this.direction === LineDirection.Horizontal) {
      return this.from.y === p.y && p.x >= this.from.x && p.x <= this.to.x;
    }
    return false;
  }

  /**
   * Records an intersection point unless an equal one is already stored.
   *
   * @param {Point} point
   */
  addIntersectionPoint(point) {
    if (this.intersections.some((known) => known.equal(point))) {
      return;
    }
    this.intersections.push(point);
  }

  /**
   * Computes the crossing of this line with a perpendicular one, using the
   * tolerance-extended (`normalized`) versions of both.
   *
   * Side effect: on success the point is recorded on BOTH lines.
   *
   * @param {Line} line
   * @returns {Point|undefined} The crossing point, or `undefined` when either
   *   line is invalid, the lines are not perpendicular, or they do not cross.
   */
  intersection(line) {
    if (!this.valid || !line.valid) {
      return undefined;
    }
    let horizontal;
    let vertical;
    if (this.direction === LineDirection.Horizontal && line.direction === LineDirection.Vertical) {
      horizontal = this.normalized;
      vertical = line.normalized;
    } else if (this.direction === LineDirection.Vertical && line.direction === LineDirection.Horizontal) {
      horizontal = line.normalized;
      vertical = this.normalized;
    } else {
      return undefined;
    }
    const x = vertical.from.x;
    const y = horizontal.from.y;
    // Strict inequalities against the already-extended spans.
    const crosses = x > horizontal.from.x && x < horizontal.to.x && y > vertical.from.y && y < vertical.to.y;
    if (!crosses) {
      return undefined;
    }
    const intPoint = new Point(x, y);
    this.addIntersectionPoint(intPoint);
    line.addIntersectionPoint(intPoint);
    return intPoint;
  }

  /**
   * Transforms both endpoints and re-derives direction and length.
   * The endpoints are replaced by the min/max corners of the transformed pair.
   *
   * @param {number[]} matrix - 6-element transform.
   * @returns {this}
   */
  transform(matrix) {
    const p1 = this.from.transform(matrix);
    const p2 = this.to.transform(matrix);
    const x = Math.min(p1.x, p2.x);
    const y = Math.min(p1.y, p2.y);
    const width = Math.abs(p1.x - p2.x);
    const height = Math.abs(p1.y - p2.y);
    this.from = new Point(x, y);
    this.to = new Point(x + width, y + height);
    this.init();
    return this;
  }
}
|
|
366
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/TableData.js
|
|
367
|
+
/**
 * Extracted table: bounding box, pivot coordinates and rows of cell records.
 * Cells are plain objects read here via `minXY`, `maxXY`, `text`, `colspan`
 * and `rowspan`.
 */
class TableData {
  minXY;
  maxXY;
  rows;
  rowPivots;
  colPivots;

  /**
   * @param {{x: number, y: number}} minXY - One corner of the bounding box.
   * @param {{x: number, y: number}} maxXY - Opposite corner of the bounding box.
   * @param {number[]} rowPivots
   * @param {number[]} colPivots
   */
  constructor(minXY, maxXY, rowPivots, colPivots) {
    this.minXY = minXY;
    this.maxXY = maxXY;
    this.rows = [];
    this.rowPivots = rowPivots;
    this.colPivots = colPivots;
  }

  /**
   * Returns the first cell (row-major order) whose box contains `(x, y)`,
   * borders included.
   *
   * @param {number} x
   * @param {number} y
   * @returns {object|undefined} `undefined` when the point is outside the table or no cell matches.
   */
  findCell(x, y) {
    const insideTable = x >= this.minXY.x && y >= this.minXY.y && x <= this.maxXY.x && y <= this.maxXY.y;
    if (!insideTable) {
      return undefined;
    }
    const holdsPoint = (cell) => cell.minXY.x <= x && cell.minXY.y <= y && cell.maxXY.x >= x && cell.maxXY.y >= y;
    for (const row of this.rows) {
      const hit = row.find(holdsPoint);
      if (hit !== undefined) {
        return hit;
      }
    }
    return undefined;
  }

  /** @returns {number} Total number of cell records across all rows. */
  get cellCount() {
    let total = 0;
    for (const row of this.rows) {
      total += row.length;
    }
    return total;
  }

  /** @returns {number} Number of rows. */
  get rowCount() {
    return this.rows.length;
  }

  /**
   * Consistency check: the grid size implied by the pivots must equal the sum
   * of every cell's `colspan * rowspan` (missing spans count as 1).
   *
   * @returns {boolean}
   */
  check() {
    const gridCells = (this.colPivots.length - 1) * (this.rowPivots.length - 1);
    const coveredCells = this.rows.reduce(
      (sum, row) => row.reduce((inner, cell) => inner + (cell.colspan || 1) * (cell.rowspan || 1), sum),
      0
    );
    return gridCells === coveredCells;
  }

  /**
   * @returns {string[][]} One array per row holding each cell's text fragments
   *   joined and trimmed.
   */
  toArray() {
    return this.rows.map((row) =>
      row.map((cell) => cell.text.join("").replace(/^[\s]+|[\s]+$/g, "").trim())
    );
  }
}
|
|
427
|
+
|
|
428
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Table.js
|
|
429
|
+
/**
 * A group of mutually intersecting horizontal and vertical lines that can be
 * converted into a `TableData` grid.
 */
class Table {
  hLines = [];
  vLines = [];

  /**
   * @param {Line} line - Seed line; ignored when it has no direction.
   */
  constructor(line) {
    if (line.direction === LineDirection.Horizontal) {
      this.hLines.push(line);
    } else if (line.direction === LineDirection.Vertical) {
      this.vLines.push(line);
    }
  }

  /** @returns {boolean} True when the table holds more than four lines in total. */
  get isValid() {
    return this.hLines.length + this.vLines.length > 4;
  }

  /** @returns {number[]} Distinct y positions of the horizontal lines, ascending. */
  get rowPivots() {
    return [...new Set(this.hLines.map((line) => line.from.y))].sort((a, b) => a - b);
  }

  /** @returns {number[]} Distinct x positions of the vertical lines, ascending. */
  get colPivots() {
    return [...new Set(this.vLines.map((line) => line.from.x))].sort((a, b) => a - b);
  }

  /**
   * Adds `line` when it crosses at least one perpendicular line of this table.
   *
   * @param {Line} line
   * @returns {boolean} Whether the line was added.
   */
  add(line) {
    if (!this.intersection(line)) {
      return false;
    }
    if (line.direction === LineDirection.Horizontal) {
      this.hLines.push(line);
      return true;
    }
    if (line.direction === LineDirection.Vertical) {
      this.vLines.push(line);
      return true;
    }
    return false;
  }

  /**
   * Tests `line` against every perpendicular line of the table.
   *
   * @param {Line} line
   * @returns {boolean} True when at least one crossing exists.
   */
  intersection(line) {
    if (!line.valid) {
      return false;
    }
    let candidates = [];
    if (line.direction === LineDirection.Horizontal) {
      candidates = this.vLines;
    } else if (line.direction === LineDirection.Vertical) {
      candidates = this.hLines;
    }
    let flag = false;
    // No short-circuit: Line#intersection records the crossing point on both
    // lines, so every candidate must be visited.
    for (const other of candidates) {
      if (line.intersection(other)) {
        flag = true;
      }
    }
    return flag;
  }

  /**
   * Removes from `hLines` every line sharing `line`'s exact y.
   *
   * @param {Line} line
   * @returns {Line[]} `line` followed by the removed lines.
   */
  getSameHorizontal(line) {
    const same = [line];
    const other = [];
    for (const hLine of this.hLines) {
      if (!hLine) {
        continue;
      }
      (hLine.from.y === line.from.y ? same : other).push(hLine);
    }
    this.hLines = other;
    return same;
  }

  /**
   * Removes from `vLines` every line sharing `line`'s exact x.
   *
   * @param {Line} line
   * @returns {Line[]} `line` followed by the removed lines.
   */
  getSameVertical(line) {
    const same = [line];
    const other = [];
    for (const vLine of this.vLines) {
      if (!vLine) {
        continue;
      }
      (vLine.from.x === line.from.x ? same : other).push(vLine);
    }
    this.vLines = other;
    return same;
  }

  /**
   * Merges collinear horizontal segments into one line spanning all of them,
   * registering a gap for every uncovered stretch wider than `Shape.tolerance`.
   *
   * The covered extent is tracked as a running maximum, so a segment nested
   * inside (or overlapping) an earlier one neither shortens the merged line
   * nor produces a spurious gap.
   *
   * @param {Line[]} lines - Non-empty; sorted in place by `from.x`.
   * @returns {Line}
   */
  mergeHorizontalLines(lines) {
    lines.sort((l1, l2) => l1.from.x - l2.from.x);
    const y = lines[0].from.y;
    const minX = lines[0].from.x;
    let reach = lines[0].to.x;
    const gapSpans = [];
    for (let i = 1; i < lines.length; i++) {
      const currLine = lines[i];
      if (currLine.from.x - reach > Shape.tolerance) {
        gapSpans.push([reach, currLine.from.x]);
      }
      if (currLine.to.x > reach) {
        reach = currLine.to.x;
      }
    }
    const resultLine = new Line(new Point(minX, y), new Point(reach, y));
    for (const [start, end] of gapSpans) {
      resultLine.addGap(new Line(new Point(start, y), new Point(end, y)));
    }
    return resultLine;
  }

  /**
   * Vertical counterpart of `mergeHorizontalLines`.
   *
   * @param {Line[]} lines - Non-empty; sorted in place by `from.y`.
   * @returns {Line}
   */
  mergeVerticalLines(lines) {
    lines.sort((l1, l2) => l1.from.y - l2.from.y);
    const x = lines[0].from.x;
    const minY = lines[0].from.y;
    let reach = lines[0].to.y;
    const gapSpans = [];
    for (let i = 1; i < lines.length; i++) {
      const currLine = lines[i];
      if (currLine.from.y - reach > Shape.tolerance) {
        gapSpans.push([reach, currLine.from.y]);
      }
      if (currLine.to.y > reach) {
        reach = currLine.to.y;
      }
    }
    const resultLine = new Line(new Point(x, minY), new Point(x, reach));
    for (const [start, end] of gapSpans) {
      resultLine.addGap(new Line(new Point(x, start), new Point(x, end)));
    }
    return resultLine;
  }

  /**
   * Drops lines with fewer than two recorded intersections, sorts the rest and
   * merges lines sharing the same y (horizontal) or x (vertical).
   */
  normalize() {
    this.hLines = this.hLines.filter((l) => l.intersections.length > 1);
    this.vLines = this.vLines.filter((l) => l.intersections.length > 1);
    this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
    this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
    const newHLines = [];
    // getSameHorizontal() replaces this.hLines with the remainder on every pass.
    while (this.hLines.length > 0) {
      const line = this.hLines.shift();
      if (!line) {
        continue;
      }
      newHLines.push(this.mergeHorizontalLines(this.getSameHorizontal(line)));
    }
    this.hLines = newHLines;
    const newVLines = [];
    while (this.vLines.length > 0) {
      const line = this.vLines.shift();
      if (!line) {
        continue;
      }
      newVLines.push(this.mergeVerticalLines(this.getSameVertical(line)));
    }
    this.vLines = newVLines;
  }

  /**
   * Whether a vertical line is drawn across the whole `[y1, y2]` interval.
   *
   * @param {Line} line
   * @param {number} y1
   * @param {number} y2
   * @returns {boolean} False when the line does not span the interval or a gap covers it.
   * @throws {Error} When `line` is not vertical or `y1 >= y2`.
   */
  verticalExists(line, y1, y2) {
    if (line.direction !== LineDirection.Vertical) {
      throw new Error("Line is not vertical");
    }
    if (y1 >= y2) {
      throw new Error("y1 must be less than y2");
    }
    if (!(line.from.y <= y1 && line.to.y >= y2)) {
      return false;
    }
    return !line.gaps.some((gap) => gap.from.y <= y1 && gap.to.y >= y2);
  }

  /**
   * Whether a horizontal line is drawn across the whole `[x1, x2]` interval.
   *
   * @param {Line} line
   * @param {number} x1
   * @param {number} x2
   * @returns {boolean} False when the line does not span the interval or a gap covers it.
   * @throws {Error} When `line` is not horizontal or `x1 >= x2`.
   */
  horizontalExists(line, x1, x2) {
    if (line.direction !== LineDirection.Horizontal) {
      throw new Error("Line is not horizontal");
    }
    if (x1 >= x2) {
      throw new Error("x1 must be less than x2");
    }
    if (!(line.from.x <= x1 && line.to.x >= x2)) {
      return false;
    }
    return !line.gaps.some((gap) => gap.from.x <= x1 && gap.to.x >= x2);
  }

  /**
   * Index of the first horizontal line at or after `h2Index` whose x-span
   * contains `xMiddle`.
   *
   * @param {number} h2Index
   * @param {number} xMiddle
   * @returns {number} The index, or -1 when none qualifies.
   */
  findBottomLineIndex(h2Index, xMiddle) {
    for (let i = h2Index; i < this.hLines.length; i++) {
      const hLine = this.hLines[i];
      if (hLine.from.x <= xMiddle && hLine.to.x >= xMiddle) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Indexes of the vertical lines that span `yMiddle` and cross `topHLine`.
   *
   * @param {Line} topHLine
   * @param {number} yMiddle
   * @returns {number[]}
   */
  findVerticalLineIndexs(topHLine, yMiddle) {
    const result = [];
    for (let i = 0; i < this.vLines.length; i++) {
      const vLine = this.vLines[i];
      if (vLine.from.y <= yMiddle && vLine.to.y >= yMiddle && topHLine.intersection(vLine)) {
        result.push(i);
      }
    }
    return result;
  }

  /**
   * Builds the cells of the row whose top border is `hLines[h1Index]`.
   *
   * @param {number} h1Index - Index of the row's top horizontal line.
   * @param {number} h2Index - Index from which the bottom border is searched.
   * @param {number} yMiddle - A y coordinate inside the row.
   * @returns {object[]} Cell records `{minXY, maxXY, width, height, text, colspan?, rowspan?}`.
   */
  getRow(h1Index, h2Index, yMiddle) {
    const tableRow = [];
    const topHLine = this.hLines[h1Index];
    const vLineIndexes = this.findVerticalLineIndexs(topHLine, yMiddle);
    for (let i = 1; i < vLineIndexes.length; i++) {
      const leftVLine = this.vLines[vLineIndexes[i - 1]];
      const rightVLine = this.vLines[vLineIndexes[i]];
      const xMiddle = (leftVLine.from.x + rightVLine.from.x) / 2;
      const bottomHLineIndex = this.findBottomLineIndex(h2Index, xMiddle);
      if (bottomHLineIndex < 0) {
        // No closing border below this column pair: the cell is open-ended.
        // Skip it rather than dereferencing hLines[-1]; TableData#check() will
        // then report the grid as incomplete.
        continue;
      }
      const bottomHLine = this.hLines[bottomHLineIndex];
      const tableCell = {
        minXY: new Point(leftVLine.from.x, topHLine.from.y),
        maxXY: new Point(rightVLine.from.x, bottomHLine.from.y),
        width: rightVLine.from.x - leftVLine.from.x,
        height: bottomHLine.from.y - topHLine.from.y,
        text: []
      };
      const colSpan = vLineIndexes[i] - vLineIndexes[i - 1];
      const rowSpan = bottomHLineIndex - h1Index;
      if (colSpan > 1) {
        tableCell.colspan = colSpan;
      }
      if (rowSpan > 1) {
        tableCell.rowspan = rowSpan;
      }
      tableRow.push(tableCell);
    }
    return tableRow;
  }

  /**
   * Converts the line grid into a `TableData`, one row per pair of
   * consecutive horizontal lines.
   *
   * @returns {TableData}
   */
  toData() {
    const rowPivots = this.rowPivots;
    const colPivots = this.colPivots;
    const minXY = new Point(colPivots[0], rowPivots[0]);
    const maxXY = new Point(colPivots[colPivots.length - 1], rowPivots[rowPivots.length - 1]);
    const result = new TableData(minXY, maxXY, rowPivots, colPivots);
    for (let h1 = 1; h1 < this.hLines.length; h1++) {
      const prevHLine = this.hLines[h1 - 1];
      const currHLine = this.hLines[h1];
      const yMiddle = (prevHLine.from.y + currHLine.from.y) / 2;
      result.rows.push(this.getRow(h1 - 1, h1, yMiddle));
    }
    return result;
  }
}
|
|
676
|
+
|
|
677
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/LineStore.js
|
|
678
|
+
/**
 * Collects valid horizontal and vertical lines and groups them into tables.
 */
class LineStore {
  hLines = [];
  vLines = [];

  /**
   * Stores `line` in the list matching its direction; invalid or
   * direction-less lines are ignored.
   *
   * @param {Line} line
   */
  add(line) {
    if (!line.valid) {
      return;
    }
    if (line.direction === LineDirection.Horizontal) {
      this.hLines.push(line);
      return;
    }
    if (line.direction === LineDirection.Vertical) {
      this.vLines.push(line);
    }
  }

  /**
   * Adds every line returned by `rect.getLines()`.
   *
   * @param {Rectangle} rect
   */
  addRectangle(rect) {
    for (const edge of rect.getLines()) {
      this.add(edge);
    }
  }

  /**
   * @returns {TableData[]} `toData()` of every detected table, falsy results dropped.
   */
  getTableData() {
    const result = [];
    for (const table of this.getTables()) {
      const data = table.toData();
      if (!data) {
        continue;
      }
      result.push(data);
    }
    return result;
  }

  /**
   * Consumes the stored lines (horizontal first, then vertical) and groups
   * them into tables; only tables reporting `isValid` are normalized and returned.
   *
   * @returns {Table[]}
   */
  getTables() {
    const result = [];
    // `pick` re-reads the list on every pass because fillTable() replaces
    // this.hLines / this.vLines with new arrays.
    const drain = (pick) => {
      while (pick().length !== 0) {
        const seed = pick().shift();
        if (!seed) {
          continue;
        }
        if (this.tryFill(result, seed)) {
          continue;
        }
        const table = new Table(seed);
        this.fillTable(table);
        result.push(table);
      }
    };
    drain(() => this.hLines);
    drain(() => this.vLines);
    // Validity is decided for all tables before any of them is normalized.
    const validTables = result.filter((t) => t.isValid);
    validTables.forEach((table) => {
      table.normalize();
    });
    return validTables;
  }

  /** Merges near-collinear stored lines, horizontal first, then vertical. */
  normalize() {
    this.normalizeHorizontal();
    this.normalizeVertical();
  }

  /**
   * Sorts horizontal lines by y, groups consecutive lines whose y is within
   * `Shape.tolerance` of the group's first line, and replaces each group by
   * its merged segments.
   */
  normalizeHorizontal() {
    this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
    const mergedLines = [];
    let group = [];
    for (const line of this.hLines) {
      const startsNewGroup = group.length !== 0 && !(Math.abs(group[0]?.from.y - line.from.y) < Shape.tolerance);
      if (startsNewGroup) {
        mergedLines.push(...this.margeHorizontalLines(group));
        group = [line];
      } else {
        group.push(line);
      }
    }
    if (group.length > 0) {
      mergedLines.push(...this.margeHorizontalLines(group));
    }
    this.hLines = mergedLines;
  }

  /**
   * Vertical counterpart of `normalizeHorizontal` (groups by x).
   */
  normalizeVertical() {
    this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
    const mergedLines = [];
    let group = [];
    for (const line of this.vLines) {
      const startsNewGroup = group.length !== 0 && !(Math.abs(group[0]?.from.x - line.from.x) < Shape.tolerance);
      if (startsNewGroup) {
        mergedLines.push(...this.margeVerticalLines(group));
        group = [line];
      } else {
        group.push(line);
      }
    }
    if (group.length > 0) {
      mergedLines.push(...this.margeVerticalLines(group));
    }
    this.vLines = mergedLines;
  }

  /**
   * Offers every stored line to `table` (vertical lines first) and keeps only
   * the lines the table did not accept.
   *
   * @param {Table} table
   */
  fillTable(table) {
    const leftoverV = this.vLines.filter((vLine) => !table.add(vLine));
    const leftoverH = this.hLines.filter((hLine) => !table.add(hLine));
    this.hLines = leftoverH;
    this.vLines = leftoverV;
  }

  /**
   * Adds `line` to the first table that accepts it, then lets that table pull
   * in further stored lines.
   *
   * @param {Table[]} tables
   * @param {Line} line
   * @returns {boolean} Whether some table accepted the line.
   */
  tryFill(tables, line) {
    const host = tables.find((table) => table.add(line));
    if (host === undefined) {
      return false;
    }
    this.fillTable(host);
    return true;
  }

  /**
   * Merges horizontal lines sharing one y into maximal segments: a line joins
   * the current segment when it starts less than `Shape.tolerance` past the
   * segment's end.
   *
   * @param {Line[]} sameYLines - Sorted in place by `from.x`.
   * @returns {Line[]} New merged lines at the y of the first sorted line; empty for empty input.
   */
  margeHorizontalLines(sameYLines) {
    const result = [];
    sameYLines.sort((l1, l2) => l1.from.x - l2.from.x);
    const y = sameYLines[0]?.from.y;
    if (y === undefined) {
      return result;
    }
    const emit = (start, end) => {
      result.push(new Line(new Point(start, y), new Point(end, y)));
    };
    // Sentinels: the first line never "continues", and `hi > lo` is false, so
    // nothing is emitted before a real segment exists.
    let lo = Number.MAX_SAFE_INTEGER;
    let hi = Number.MIN_SAFE_INTEGER;
    for (const line of sameYLines) {
      if (line.from.x - hi < Shape.tolerance) {
        if (line.from.x < lo) {
          lo = line.from.x;
        }
        if (line.to.x > hi) {
          hi = line.to.x;
        }
        continue;
      }
      if (hi > lo) {
        emit(lo, hi);
      }
      lo = line.from.x;
      hi = line.to.x;
    }
    // Flush the segment still being accumulated.
    const tail = result[result.length - 1];
    if (!tail || (tail.from.x !== lo && tail.to.x !== hi)) {
      emit(lo, hi);
    }
    return result;
  }

  /**
   * Vertical counterpart of `margeHorizontalLines`.
   *
   * @param {Line[]} sameXLines - Sorted in place by `from.y`.
   * @returns {Line[]} New merged lines at the x of the first sorted line; empty for empty input.
   */
  margeVerticalLines(sameXLines) {
    const result = [];
    sameXLines.sort((l1, l2) => l1.from.y - l2.from.y);
    const x = sameXLines[0]?.from.x;
    if (x === undefined) {
      return result;
    }
    const emit = (start, end) => {
      result.push(new Line(new Point(x, start), new Point(x, end)));
    };
    let lo = Number.MAX_SAFE_INTEGER;
    let hi = Number.MIN_SAFE_INTEGER;
    for (const line of sameXLines) {
      if (line.from.y - hi < Shape.tolerance) {
        if (line.from.y < lo) {
          lo = line.from.y;
        }
        if (line.to.y > hi) {
          hi = line.to.y;
        }
        continue;
      }
      if (hi > lo) {
        emit(lo, hi);
      }
      lo = line.from.y;
      hi = line.to.y;
    }
    // Flush the segment still being accumulated.
    const tail = result[result.length - 1];
    if (!tail || (tail.from.y !== lo && tail.to.y !== hi)) {
      emit(lo, hi);
    }
    return result;
  }
}
|
|
876
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Rectangle.js
|
|
877
|
+
/**
 * Rectangle described by an origin point plus width and height.
 */
class Rectangle extends Shape {
  from;
  width;
  height;

  /**
   * @param {Point} from - Origin corner.
   * @param {number} width
   * @param {number} height
   */
  constructor(from, width, height) {
    super();
    this.from = from;
    this.width = width;
    this.height = height;
  }

  /** @returns {Point} A fresh Point at `from + (width, height)`. */
  get to() {
    return new Point(this.from.x + this.width, this.from.y + this.height);
  }

  /**
   * Builds the four edges and returns the ones Line reports as valid.
   *
   * NOTE(review): the edges share Point instances — `origin` is handed to two
   * lines and `far` to two others. Line's constructor may adjust its `to`
   * point in place, and the last edge reads `far.y` after the third edge was
   * built, so construction order matters here. Confirm before reordering.
   *
   * @returns {Line[]}
   */
  getLines() {
    const origin = this.from;
    const far = this.to;
    const edges = [];
    edges.push(new Line(origin, new Point(far.x, origin.y)));
    edges.push(new Line(origin, new Point(origin.x, far.y)));
    edges.push(new Line(new Point(far.x, origin.y), far));
    edges.push(new Line(new Point(origin.x, far.y), far));
    return edges.filter((edge) => edge.valid);
  }

  /**
   * Transforms the two opposite corners and stores the bounding box of the
   * result (new origin Point, non-negative width and height).
   *
   * @param {number[]} matrix - 6-element transform.
   * @returns {this}
   */
  transform(matrix) {
    const cornerA = Shape.applyTransform([this.from.x, this.from.y], matrix);
    const cornerB = Shape.applyTransform([this.from.x + this.width, this.from.y + this.height], matrix);
    const [ax, ay] = cornerA;
    const [bx, by] = cornerB;
    this.from = new Point(Math.min(ax, bx), Math.min(ay, by));
    this.width = Math.abs(ax - bx);
    this.height = Math.abs(ay - by);
    return this;
  }
}
|
|
913
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/ImageResult.js
|
|
914
|
+
/**
 * Result container for image extraction: one entry per parsed page plus the
 * page count supplied at construction time.
 */
class ImageResult {
  pages = [];
  total = 0;

  /**
   * @param {number} total - Value stored as `total`.
   */
  constructor(total) {
    this.total = total;
  }

  /**
   * Looks up an image by page number and image name.
   *
   * @param {number} num - Value compared against each entry's `pageNumber`.
   * @param {string} name - Value compared against each image's `name`.
   * @returns {object|null} The first matching image, or `null` if none matches.
   */
  getPageImage(num, name) {
    for (const entry of this.pages) {
      if (entry.pageNumber !== num) {
        continue;
      }
      for (const image of entry.images) {
        if (image.name === name) {
          return image;
        }
      }
    }
    return null;
  }
}
|
|
933
|
+
|
|
934
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/InfoResult.js
|
|
74
935
|
var pdfjs = __toESM(require("pdfjs-dist/legacy/build/pdf.mjs"));
|
|
75
|
-
var
|
|
76
|
-
|
|
77
|
-
|
|
936
|
+
// XMP metadata property names that are looked up as dates by InfoResult.
var XMP_DATE_PROPERTIES = [
  "xmp:createdate",
  "xmp:modifydate",
  "xmp:metadatadate",
  "xap:createdate",
  "xap:modifydate",
  "xap:metadatadate"
];

/**
 * Result container for document-level information (info dictionary,
 * metadata, fingerprints, permissions, outline and optional per-page data).
 */
class InfoResult {
  total;
  info;
  metadata;
  fingerprints;
  permission;
  outline;
  pages = [];

  /**
   * @param {number} total - Value stored as `total`.
   */
  constructor(total) {
    this.total = total;
  }

  /**
   * Collects the date fields of the document as `Date` objects.
   *
   * `CreationDate` / `ModDate` are only added when present in `info`. When
   * `metadata` is set, one key per entry of XMP_DATE_PROPERTIES is always
   * written; its value is `undefined` if the property is missing or cannot
   * be parsed.
   *
   * @returns {object} Map of date name to parsed date.
   */
  getDateNode() {
    const dates = {};
    const created = this.info?.CreationDate;
    if (created) {
      dates.CreationDate = pdfjs.PDFDateString.toDateObject(created);
    }
    const modified = this.info?.ModDate;
    if (modified) {
      dates.ModDate = pdfjs.PDFDateString.toDateObject(modified);
    }
    if (!this.metadata) {
      return dates;
    }
    // Result keys, positionally aligned with XMP_DATE_PROPERTIES.
    const resultKeys = [
      "XmpCreateDate",
      "XmpModifyDate",
      "XmpMetadataDate",
      "XapCreateDate",
      "XapModifyDate",
      "XapMetadataDate"
    ];
    XMP_DATE_PROPERTIES.forEach((prop, position) => {
      const raw = this.metadata.get(prop);
      dates[resultKeys[position]] = this.parseISODateString(raw);
    });
    return dates;
  }

  /**
   * Parses a date string with `Date.parse`.
   *
   * @param {string} [isoDateString] - Text to parse.
   * @returns {Date|undefined} The date, or `undefined` for empty or
   *   unparseable input.
   */
  parseISODateString(isoDateString) {
    if (!isoDateString) {
      return;
    }
    const timestamp = Date.parse(isoDateString);
    return Number.isNaN(timestamp) ? undefined : new Date(timestamp);
  }
}
|
|
1005
|
+
|
|
1006
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/ParseParameters.js
|
|
1007
|
+
/**
 * Fills in every parse option that is `null` or `undefined` with its default
 * value. The object is modified in place and also returned.
 *
 * @param {object} params - Parse parameters to complete.
 * @returns {object} The same `params` object.
 */
function setDefaultParseParameters(params) {
  const fallbacks = [
    ["lineThreshold", 4.6],
    ["cellThreshold", 7],
    ["cellSeparator", "\t"],
    ["lineEnforce", true],
    ["pageJoiner", "\n-- page_number of total_number --"],
    ["imageThreshold", 80],
    ["imageDataUrl", true],
    ["imageBuffer", true],
    ["scale", 1]
  ];
  for (const [option, fallback] of fallbacks) {
    // Only nullish values are replaced; 0, "" and false are kept as given.
    params[option] = params[option] ?? fallback;
  }
  return params;
}
|
|
1020
|
+
|
|
1021
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/PathGeometry.js
|
|
1022
|
+
// Two-way enum of path shapes: each label maps to its numeric value and each
// numeric value maps back to its label.
var PathGeometry = PathGeometry || {};
["undefined", "hline", "vline", "rectangle"].forEach((label, value) => {
  PathGeometry[label] = value;
  PathGeometry[value] = label;
});
|
|
1029
|
+
// Two-way enum of path drawing operations: each label maps to its numeric
// value and each numeric value maps back to its label.
var DrawOPS = DrawOPS || {};
["moveTo", "lineTo", "curveTo", "closePath", "rectangle"].forEach((label, value) => {
  DrawOPS[label] = value;
  DrawOPS[value] = label;
});
|
|
1037
|
+
|
|
1038
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/ScreenshotResult.js
|
|
1039
|
+
/**
 * Result container for page screenshots.
 */
class ScreenshotResult {
  pages;
  total;

  /**
   * @param {number} total - Value stored as `total`.
   */
  constructor(total) {
    this.pages = [];
    this.total = total;
  }
}
|
|
1046
|
+
|
|
1047
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/TableResult.js
|
|
1048
|
+
/**
 * Result container for table extraction.
 */
class TableResult {
  pages;
  mergedTables;
  total;

  /**
   * @param {number} total - Value stored as `total`.
   */
  constructor(total) {
    this.pages = [];
    this.mergedTables = [];
    this.total = total;
  }
}
|
|
1056
|
+
|
|
1057
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/TextResult.js
|
|
1058
|
+
/**
 * Result container for text extraction: per-page text entries, the combined
 * text, and the page count supplied at construction time.
 */
class TextResult {
  pages = [];
  text = "";
  total = 0;

  /**
   * @param {number} total - Value stored as `total`.
   */
  constructor(total) {
    this.total = total;
  }

  /**
   * Returns the text of the first page entry whose `num` equals `num`.
   *
   * @param {number} num - Page number to look up.
   * @returns {string} The stored page text, or "" when no entry matches.
   */
  getPageText(num) {
    const match = this.pages.find((entry) => entry.num === num);
    return match ? match.text : "";
  }
}
|
|
79
1073
|
|
|
1074
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js
|
|
1075
|
+
/**
 * Wrapper around a pdf.js document that extracts document info, text,
 * embedded images, page screenshots and ruled tables.
 *
 * The document is loaded lazily by `load()` on first use and released by
 * `destroy()`.
 */
class PDFParse {
  options;
  doc;
  progress = { loaded: -1, total: 0 };

  /**
   * @param {object} options - Options forwarded to `pdfjs2.getDocument`.
   *   Modified in place: `verbosity` defaults to `VerbosityLevel.ERRORS`,
   *   and a Node `Buffer` in `data` is copied into a plain `Uint8Array`.
   */
  constructor(options) {
    if (options.verbosity === undefined) {
      options.verbosity = pdfjs2.VerbosityLevel.ERRORS;
    }
    if (typeof Buffer !== "undefined" && options.data instanceof Buffer) {
      options.data = new Uint8Array(options.data);
    }
    this.options = options;
  }

  /** Destroys the loaded document, if any, and forgets it. */
  async destroy() {
    if (this.doc) {
      await this.doc.destroy();
      this.doc = undefined;
    }
  }

  /** True when running under Node.js (not NW.js, not an Electron renderer). */
  static get isNodeJS() {
    const isNodeJS = typeof process === "object" && `${process}` === "[object process]" && !process.versions.nw && !(process.versions.electron && typeof process.type !== "undefined" && process.type !== "browser");
    return isNodeJS;
  }

  /**
   * Exposes pdf.js on `globalThis.pdfjs` (if not already set) and optionally
   * sets the worker script location.
   *
   * @param {string} [workerSrc] - New worker source; left unchanged when omitted.
   * @returns {string} The effective worker source, or "" when
   *   `GlobalWorkerOptions` is `null`.
   */
  static setWorker(workerSrc) {
    if (typeof globalThis.pdfjs === "undefined") {
      globalThis.pdfjs = pdfjs2;
    }
    if (pdfjs2?.GlobalWorkerOptions === null) {
      return "";
    }
    if (workerSrc !== undefined) {
      pdfjs2.GlobalWorkerOptions.workerSrc = workerSrc;
    }
    return pdfjs2.GlobalWorkerOptions.workerSrc;
  }

  /**
   * Reads document-level information and, when `params.parsePageInfo` is
   * set, the size, links and label of every selected page.
   *
   * @param {object} [params] - Page selection (`partial`, `first`, `last`)
   *   and the `parsePageInfo` switch.
   * @returns {Promise<InfoResult>}
   */
  async getInfo(params = {}) {
    const doc = await this.load();
    const result = new InfoResult(doc.numPages);
    const { info, metadata } = await doc.getMetadata();
    result.info = info;
    result.metadata = metadata;
    result.fingerprints = doc.fingerprints;
    result.outline = await doc.getOutline();
    result.permission = await doc.getPermissions();
    const pageLabels = await doc.getPageLabels();
    if (!params.parsePageInfo) {
      return result;
    }
    for (let i = 1; i <= result.total; i++) {
      if (!this.shouldParse(i, result.total, params)) {
        continue;
      }
      const page = await doc.getPage(i);
      const pageLinkResult = await this.getPageLinks(page);
      // getPageLabels() is indexed by zero-based page index while
      // pageNumber is one-based, so the label lives at pageNumber - 1.
      pageLinkResult.pageLabel = pageLabels?.[page.pageNumber - 1];
      result.pages.push(pageLinkResult);
      page.cleanup();
    }
    return result;
  }

  /**
   * Collects the size of a page at scale 1 and the URL/text of every link
   * annotation that has a URL.
   *
   * @param {object} page - pdf.js page proxy.
   * @returns {Promise<{pageNumber: number, links: Array, width: number, height: number}>}
   */
  async getPageLinks(page) {
    const viewport = page.getViewport({ scale: 1 });
    const result = {
      pageNumber: page.pageNumber,
      links: [],
      width: viewport.width,
      height: viewport.height
    };
    const annotations = await page.getAnnotations({ intent: "display" }) || [];
    for (const annotation of annotations) {
      if (annotation.subtype !== "Link") {
        continue;
      }
      const url = annotation.url ?? annotation.unsafeUrl;
      if (!url) {
        continue;
      }
      result.links.push({ url, text: annotation.overlaidText || "" });
    }
    return result;
  }

  /**
   * Extracts the text of every selected page and concatenates it into
   * `result.text`. When `params.pageJoiner` is truthy, it is appended after
   * each page with `page_number` / `total_number` substituted (first
   * occurrence of each only).
   *
   * @param {object} [params] - Parse parameters; completed with defaults
   *   in place while pages are processed.
   * @returns {Promise<TextResult>}
   */
  async getText(params = {}) {
    const doc = await this.load();
    const result = new TextResult(doc.numPages);
    for (let i = 1; i <= result.total; i++) {
      if (!this.shouldParse(i, result.total, params)) {
        continue;
      }
      const page = await doc.getPage(i);
      const text = await this.getPageText(page, params, result.total);
      result.pages.push({ text, num: i });
      page.cleanup();
    }
    for (const page of result.pages) {
      if (params.pageJoiner) {
        const joiner = params.pageJoiner.replace("page_number", `${page.num}`).replace("total_number", `${result.total}`);
        result.text += `${page.text}\n${joiner}\n\n`;
      } else {
        result.text += `${page.text}\n\n`;
      }
    }
    return result;
  }

  /**
   * Loads the document on first call and returns the cached proxy afterwards.
   * Loading progress is mirrored into `this.progress`.
   *
   * @returns {Promise<object>} pdf.js document proxy.
   * @throws The error produced by `getException` for any load failure.
   */
  async load() {
    try {
      if (this.doc === undefined) {
        const loadingTask = pdfjs2.getDocument(this.options);
        loadingTask.onProgress = (progress) => {
          this.progress = progress;
        };
        this.doc = await loadingTask.promise;
      }
      return this.doc;
    } catch (error) {
      throw getException(error);
    }
  }

  /**
   * Decides whether a page is selected by the parse parameters.
   *
   * Precedence: a non-empty `partial` list wins; otherwise `first` and
   * `last` together form an inclusive range; `first` alone selects pages
   * up to and including `first`; `last` alone selects the trailing `last`
   * pages; with nothing set every page is selected. Missing fields are
   * written back to `params` with their defaults.
   *
   * @param {number} currentPage - One-based page number.
   * @param {number} totalPage - Total number of pages.
   * @param {object} params - Parse parameters.
   * @returns {boolean}
   */
  shouldParse(currentPage, totalPage, params) {
    params.partial = params?.partial ?? [];
    params.first = params?.first ?? 0;
    params.last = params?.last ?? 0;
    if (params.partial.length > 0) {
      return params.partial.includes(currentPage);
    }
    if (params.first > 0 && params.last > 0) {
      return currentPage >= params.first && currentPage <= params.last;
    }
    if (params.first > 0) {
      return currentPage <= params.first;
    }
    if (params.last > 0) {
      return currentPage > totalPage - params.last;
    }
    return true;
  }

  /**
   * Builds the text of one page from its text items, inserting line breaks
   * and cell separators based on item positions in viewport coordinates.
   *
   * @param {object} page - pdf.js page proxy.
   * @param {object} parseParams - Parse parameters; completed with defaults in place.
   * @param {number} total - Total page count (not used by this method).
   * @returns {Promise<string>}
   */
  async getPageText(page, parseParams, total) {
    const viewport = page.getViewport({ scale: 1 });
    const params = setDefaultParseParameters(parseParams);
    const textContent = await page.getTextContent({
      includeMarkedContent: !!params.includeMarkedContent,
      disableNormalization: !!params.disableNormalization
    });
    let links = new Map();
    if (params.parseHyperlinks) {
      links = await this.getHyperlinks(page, viewport);
    }
    const strBuf = [];
    let lastX;
    let lastY;
    let lineHeight = 0;
    for (const item of textContent.items) {
      if (!("str" in item)) {
        continue;
      }
      const tm = item.transform;
      const [x, y] = viewport.convertToViewportPoint(tm[4], tm[5]);
      if (params.parseHyperlinks) {
        // Rewrite the item as a markdown link when its origin falls inside
        // the rectangle of a link annotation carrying the same text.
        const candidates = links.get(item.str) || [];
        const hit = candidates.find((l) => x >= l.rect.left && x <= l.rect.right && y >= l.rect.top && y <= l.rect.bottom);
        if (hit) {
          item.str = `[${item.str}](${hit.url})`;
        }
      }
      if (params.lineEnforce) {
        if (lastY !== undefined && Math.abs(lastY - y) > params.lineThreshold) {
          const lastItem = strBuf.length ? strBuf[strBuf.length - 1] : undefined;
          const isCurrentItemHasNewLine = item.str.startsWith("\n") || item.str.trim() === "" && item.hasEOL;
          if (lastItem?.endsWith("\n") === false && !isCurrentItemHasNewLine) {
            const ydiff = Math.abs(lastY - y);
            // Force a break only when the vertical jump exceeds the tallest
            // item seen on the current line.
            if (ydiff - 1 > lineHeight) {
              strBuf.push("\n");
              lineHeight = 0;
            }
          }
        }
      }
      if (params.cellSeparator) {
        if (lastY !== undefined && Math.abs(lastY - y) < params.lineThreshold) {
          if (lastX !== undefined && Math.abs(lastX - x) > params.cellThreshold) {
            item.str = `${params.cellSeparator}${item.str}`;
          }
        }
      }
      strBuf.push(item.str);
      lastX = x + item.width;
      lastY = y;
      lineHeight = Math.max(lineHeight, item.height);
      if (item.hasEOL) {
        strBuf.push("\n");
      }
      if (item.hasEOL || item.str.endsWith("\n")) {
        lineHeight = 0;
      }
    }
    return strBuf.join(params.itemJoiner ? params.itemJoiner : "");
  }

  /**
   * Indexes the link annotations of a page by their overlaid text.
   *
   * @param {object} page - pdf.js page proxy.
   * @param {object} viewport - Viewport used to convert annotation rectangles.
   * @returns {Promise<Map<string, Array>>} Text to list of
   *   `{ rect, url, text, used }`; each rect is grown by 0.5 on every side.
   */
  async getHyperlinks(page, viewport) {
    const result = new Map();
    const annotations = await page.getAnnotations({ intent: "display" }) || [];
    for (const annotation of annotations) {
      if (annotation.subtype !== "Link") {
        continue;
      }
      const url = annotation.url ?? annotation.unsafeUrl;
      if (!url) {
        continue;
      }
      const text = annotation.overlaidText;
      if (!text) {
        continue;
      }
      const rectVp = viewport.convertToViewportRectangle(annotation.rect);
      const left = Math.min(rectVp[0], rectVp[2]) - 0.5;
      const top = Math.min(rectVp[1], rectVp[3]) - 0.5;
      const right = Math.max(rectVp[0], rectVp[2]) + 0.5;
      const bottom = Math.max(rectVp[1], rectVp[3]) + 0.5;
      const pos = { rect: { left, top, right, bottom }, url, text, used: false };
      const existing = result.get(text);
      if (existing) {
        existing.push(pos);
      } else {
        result.set(text, [pos]);
      }
    }
    return result;
  }

  /**
   * Extracts the embedded images of every selected page. Images whose width
   * or height does not exceed `params.imageThreshold` are skipped.
   *
   * @param {object} [params] - Parse parameters; completed with defaults in place.
   * @returns {Promise<ImageResult>}
   */
  async getImage(params = {}) {
    const doc = await this.load();
    const result = new ImageResult(doc.numPages);
    setDefaultParseParameters(params);
    for (let i = 1; i <= result.total; i++) {
      if (!this.shouldParse(i, result.total, params)) {
        continue;
      }
      const page = await doc.getPage(i);
      const ops = await page.getOperatorList();
      const pageImages = { pageNumber: i, images: [] };
      result.pages.push(pageImages);
      for (let j = 0; j < ops.fnArray.length; j++) {
        const fn = ops.fnArray[j];
        if (fn !== pdfjs2.OPS.paintInlineImageXObject && fn !== pdfjs2.OPS.paintImageXObject) {
          continue;
        }
        const name = ops.argsArray[j][0];
        const objectStore = page.commonObjs.has(name) ? page.commonObjs : page.objs;
        const { width, height, kind, data } = await this.resolveEmbeddedImage(objectStore, name);
        if (params.imageThreshold) {
          if (params.imageThreshold >= width || params.imageThreshold >= height) {
            continue;
          }
        }
        const canvasAndContext = doc.canvasFactory.create(width, height);
        const context = canvasAndContext.context;
        const imgData = context.createImageData(width, height);
        if (kind === pdfjs2.ImageKind.RGBA_32BPP) {
          imgData.data.set(data);
        } else {
          this.convertToRGBA({
            src: data,
            dest: new Uint32Array(imgData.data.buffer),
            width,
            height,
            kind
          });
        }
        context.putImageData(imgData, 0, 0);
        const encoded = this.encodeCanvas(canvasAndContext, params);
        pageImages.images.push({
          data: encoded.data,
          dataUrl: encoded.dataUrl,
          name,
          height,
          width,
          kind
        });
      }
      // Release page resources, as the other page-iterating methods do.
      page.cleanup();
    }
    return result;
  }

  /**
   * Serialises a rendered canvas according to `params.imageBuffer` and
   * `params.imageDataUrl`.
   *
   * When the canvas has `toBuffer`, both outputs are PNG based and `data` is
   * filled whenever either flag is set. Otherwise `data` holds the raw
   * pixels from `getImageData` and `dataUrl` comes from `toDataURL`.
   *
   * @param {{canvas: object, context: object}} canvasAndContext
   * @param {object} params - Parse parameters.
   * @returns {{data: Uint8Array, dataUrl: string}}
   */
  encodeCanvas(canvasAndContext, params) {
    const { canvas, context } = canvasAndContext;
    let data = new Uint8Array();
    let dataUrl = "";
    if (typeof canvas.toBuffer === "function") {
      if (params.imageBuffer || params.imageDataUrl) {
        const nodeBuffer = canvas.toBuffer("image/png");
        data = new Uint8Array(nodeBuffer);
        if (params.imageDataUrl) {
          dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
        }
      }
    } else {
      if (params.imageBuffer) {
        const imageData = context.getImageData(0, 0, canvas.width, canvas.height);
        data = new Uint8Array(imageData.data);
      }
      if (params.imageDataUrl) {
        dataUrl = canvas.toDataURL("image/png");
      }
    }
    return { data, dataUrl };
  }

  /**
   * Converts image bytes to packed 32-bit pixels written into `dest`
   * (red in the lowest byte, alpha in the highest).
   *
   * Supports RGB_24BPP and GRAYSCALE_1BPP; when `kind` is missing the layout
   * is inferred from the number of bytes per pixel (3, 4 or 1).
   *
   * @param {{src: Uint8Array, dest: Uint32Array, width: number, height: number, kind: number}} args
   * @throws {Error} For an unsupported kind or an uninferable layout.
   */
  convertToRGBA({ src, dest, width, height, kind }) {
    const packRgb = () => {
      for (let i = 0, j = 0; i < src.length; i += 3, j++) {
        dest[j] = 255 << 24 | src[i + 2] << 16 | src[i + 1] << 8 | src[i];
      }
    };
    if (kind === pdfjs2.ImageKind.RGB_24BPP) {
      packRgb();
    } else if (kind === pdfjs2.ImageKind.GRAYSCALE_1BPP) {
      // Each row of 1bpp data starts on a byte boundary, so rows must be
      // addressed through their byte stride rather than as one bit stream.
      const rowBytes = (width + 7) >> 3;
      let pixelIndex = 0;
      for (let row = 0; row < height; row++) {
        const rowStart = row * rowBytes;
        for (let col = 0; col < width; col++) {
          const byteIndex = rowStart + (col >> 3);
          if (byteIndex >= src.length) {
            return;
          }
          const isWhite = ((src[byteIndex] >> (7 - (col & 7))) & 1) === 1;
          const gray = isWhite ? 255 : 0;
          dest[pixelIndex++] = 255 << 24 | gray << 16 | gray << 8 | gray;
        }
      }
    } else if (kind === undefined || kind === null) {
      const bytesPerPixel = src.length / (width * height);
      if (Math.abs(bytesPerPixel - 3) < 0.1) {
        packRgb();
      } else if (Math.abs(bytesPerPixel - 4) < 0.1) {
        for (let i = 0, j = 0; i < src.length; i += 4, j++) {
          dest[j] = src[i + 3] << 24 | src[i + 2] << 16 | src[i + 1] << 8 | src[i];
        }
      } else if (Math.abs(bytesPerPixel - 1) < 0.1) {
        for (let i = 0; i < src.length; i++) {
          const gray = src[i];
          dest[i] = 255 << 24 | gray << 16 | gray << 8 | gray;
        }
      } else {
        throw new Error(`convertToRGBA: Cannot infer image format. kind: ${kind}, bytesPerPixel: ${bytesPerPixel}, width: ${width}, height: ${height}, dataLength: ${src.length}`);
      }
    } else {
      throw new Error(`convertToRGBA: Unsupported image kind: ${kind}. Available kinds: GRAYSCALE_1BPP=${pdfjs2.ImageKind.GRAYSCALE_1BPP}, RGB_24BPP=${pdfjs2.ImageKind.RGB_24BPP}, RGBA_32BPP=${pdfjs2.ImageKind.RGBA_32BPP}`);
    }
  }

  /**
   * Resolves an image object from a pdf.js object store and normalises its
   * pixel data to a `Uint8Array`.
   *
   * @param {object} pdfObjects - Store exposing `get(name, callback)`.
   * @param {string} name - Object name.
   * @returns {Promise<{width: number, height: number, kind: number, data: Uint8Array}>}
   *   Rejects when the object is missing or carries no usable data.
   */
  resolveEmbeddedImage(pdfObjects, name) {
    return new Promise((resolve, reject) => {
      pdfObjects.get(name, (imgData) => {
        // Any exception in here would otherwise leave the promise pending.
        try {
          if (!imgData) {
            reject(new Error(`Image object ${name} not found`));
            return;
          }
          const raw = imgData.data;
          let dataBuff;
          if (raw instanceof Uint8Array) {
            dataBuff = raw;
          } else if (raw instanceof Uint8ClampedArray) {
            dataBuff = new Uint8Array(raw);
          } else if (ArrayBuffer.isView(raw)) {
            // Respect the view's window instead of exposing the whole buffer.
            dataBuff = new Uint8Array(raw.buffer, raw.byteOffset, raw.byteLength);
          } else if (raw?.buffer) {
            dataBuff = new Uint8Array(raw.buffer);
          } else if (imgData.bitmap) {
            const { bitmap } = imgData;
            const canvasAndContext = this.doc.canvasFactory.create(bitmap.width, bitmap.height);
            canvasAndContext.context.drawImage(bitmap, 0, 0);
            const imageData = canvasAndContext.context.getImageData(0, 0, bitmap.width, bitmap.height);
            dataBuff = new Uint8Array(imageData.data.buffer);
          }
          if (!dataBuff) {
            reject(new Error(`Image object ${name}: data field is empty or invalid. Available fields: ${Object.keys(imgData).join(", ")}`));
            return;
          }
          if (dataBuff.length === 0) {
            reject(new Error(`Image object ${name}: data buffer is empty (length: 0)`));
            return;
          }
          resolve({ width: imgData.width, height: imgData.height, kind: imgData.kind, data: dataBuff });
        } catch (error) {
          reject(error);
        }
      });
    });
  }

  /**
   * Renders every selected page to a canvas and returns it as PNG data
   * and/or data URL. `params.desiredWidth`, when set, overrides
   * `params.scale`.
   *
   * @param {object} [parseParams] - Parse parameters; completed with defaults in place.
   * @returns {Promise<ScreenshotResult>}
   */
  async getScreenshot(parseParams = {}) {
    const params = setDefaultParseParameters(parseParams);
    const doc = await this.load();
    const result = new ScreenshotResult(doc.numPages);
    for (let i = 1; i <= result.total; i++) {
      if (!this.shouldParse(i, result.total, params)) {
        continue;
      }
      const page = await doc.getPage(i);
      let viewport = page.getViewport({ scale: params.scale });
      if (params.desiredWidth) {
        const unscaled = page.getViewport({ scale: 1 });
        viewport = page.getViewport({ scale: params.desiredWidth / unscaled.width });
      }
      const canvasAndContext = doc.canvasFactory.create(viewport.width, viewport.height);
      const renderTask = page.render({
        canvasContext: canvasAndContext.context,
        viewport,
        canvas: canvasAndContext.canvas
      });
      await renderTask.promise;
      const { data, dataUrl } = this.encodeCanvas(canvasAndContext, params);
      result.pages.push({
        data,
        dataUrl,
        pageNumber: i,
        width: viewport.width,
        height: viewport.height,
        scale: viewport.scale
      });
      page.cleanup();
    }
    return result;
  }

  /**
   * Detects tables drawn with stroked lines/rectangles on every selected
   * page and fills their cells with the page text.
   *
   * @param {object} [params] - Page selection parameters.
   * @returns {Promise<TableResult>}
   */
  async getTable(params = {}) {
    const doc = await this.load();
    const result = new TableResult(doc.numPages);
    for (let i = 1; i <= result.total; i++) {
      if (!this.shouldParse(i, result.total, params)) {
        continue;
      }
      const page = await doc.getPage(i);
      const store = await this.getPageTables(page);
      store.normalize();
      const tableDataArr = store.getTableData();
      await this.fillPageTables(page, tableDataArr);
      const pageTableResult = { num: i, tables: [] };
      for (const table of tableDataArr) {
        pageTableResult.tables.push(table.toArray());
      }
      result.pages.push(pageTableResult);
      page.cleanup();
    }
    return result;
  }

  /**
   * Classifies a path by its `[minX, minY, maxX, maxY]` bounding box.
   *
   * @param {number[]} mm - Bounding box of the path.
   * @returns {number} A `PathGeometry` value: `rectangle` when both extents
   *   exceed 5, `hline`/`vline` when one extent exceeds 5 and the other is
   *   exactly 0, otherwise `undefined`.
   */
  getPathGeometry(mm) {
    if (mm[0] === Infinity) {
      return PathGeometry.undefined;
    }
    const width = mm[2] - mm[0];
    const height = mm[3] - mm[1];
    if (width > 5 && height > 5) {
      return PathGeometry.rectangle;
    }
    if (width > 5 && height === 0) {
      return PathGeometry.hline;
    }
    if (width === 0 && height > 5) {
      return PathGeometry.vline;
    }
    return PathGeometry.undefined;
  }

  /**
   * Walks the operator list of a page and collects every stroked line and
   * rectangle, tracking the current transform through save/restore/transform
   * operators.
   *
   * @param {object} page - pdf.js page proxy.
   * @returns {Promise<LineStore>}
   */
  async getPageTables(page) {
    const lineStore = new LineStore();
    const viewport = page.getViewport({ scale: 1 });
    let transformMatrix = [1, 0, 0, 1, 0, 0];
    const transformStack = [];
    const opList = await page.getOperatorList();
    for (let i = 0; i < opList.fnArray.length; i++) {
      const fn = opList.fnArray[i];
      const args = opList.argsArray[i];
      if (fn === pdfjs2.OPS.constructPath) {
        // Only stroked paths are considered; fills and other paint
        // operators are ignored.
        const op = args?.[0] ?? 0;
        if (op !== pdfjs2.OPS.stroke) {
          continue;
        }
        const mm = args?.[2] ?? [Infinity, Infinity, -Infinity, -Infinity];
        const geometry = this.getPathGeometry(mm);
        if (geometry === PathGeometry.rectangle) {
          const rect = new Rectangle(new Point(mm[0], mm[1]), mm[2] - mm[0], mm[3] - mm[1]);
          rect.transform(transformMatrix);
          rect.transform(viewport.transform);
          lineStore.addRectangle(rect);
        } else if (geometry === PathGeometry.hline || geometry === PathGeometry.vline) {
          const line = new Line(new Point(mm[0], mm[1]), new Point(mm[2], mm[3]));
          line.transform(transformMatrix);
          line.transform(viewport.transform);
          lineStore.add(line);
        }
      } else if (fn === pdfjs2.OPS.save) {
        transformStack.push(transformMatrix);
      } else if (fn === pdfjs2.OPS.restore) {
        const restoredMatrix = transformStack.pop();
        if (restoredMatrix) {
          transformMatrix = restoredMatrix;
        }
      } else if (fn === pdfjs2.OPS.transform) {
        transformMatrix = pdfjs2.Util.transform(transformMatrix, args);
      }
    }
    return lineStore;
  }

  /**
   * Appends every text item of the page to the first table cell that
   * contains the item's transformed origin.
   *
   * @param {object} page - pdf.js page proxy.
   * @param {Array} pageTables - Tables exposing `findCell(x, y)`.
   */
  async fillPageTables(page, pageTables) {
    const viewport = page.getViewport({ scale: 1 });
    const textContent = await page.getTextContent({
      includeMarkedContent: false,
      disableNormalization: false
    });
    for (const textItem of textContent.items) {
      if (!("str" in textItem)) {
        continue;
      }
      const tx = pdfjs2.Util.transform(pdfjs2.Util.transform(viewport.transform, textItem.transform), [1, 0, 0, -1, 0, 0]);
      for (const pageTable of pageTables) {
        const cell = pageTable.findCell(tx[4], tx[5]);
        if (!cell) {
          continue;
        }
        cell.text.push(textItem.str);
        if (textItem.hasEOL) {
          cell.text.push("\n");
        }
        break;
      }
    }
  }
}
|
|
1654
|
+
|
|
1655
|
+
// node_modules/pdf-parse/dist/pdf-parse/esm/index.js
|
|
1656
|
+
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
1657
|
+
|
|
1658
|
+
// services/pdf.ts
|
|
1659
|
+
/**
 * Callable adapter over the bundled `PDFParse` class.
 *
 * The bundled pdf-parse module only provides named exports, so the previous
 * value of this binding was the module namespace object and calling it threw
 * a TypeError. This adapter keeps the call shape used by the service:
 * `await pdfParse(buffer, { max })` resolving to an object with `text`,
 * `numpages`, `numrender`, `info` and `metadata`.
 *
 * @param {Uint8Array|Buffer} data - PDF bytes.
 * @param {{max?: number}} [options] - `max` limits parsing to the first N pages.
 * @returns {Promise<{numpages: number, numrender: number, info: object, metadata: object, text: string}>}
 */
var pdfParse = async (data, options = {}) => {
  const parser = new PDFParse({ data });
  try {
    // An empty pageJoiner makes getText() join pages with blank lines only,
    // without per-page "-- N of M --" marker lines.
    const textParams = { pageJoiner: "" };
    if (options?.max > 0) {
      textParams.first = options.max;
    }
    const infoResult = await parser.getInfo();
    const textResult = await parser.getText(textParams);
    return {
      numpages: textResult.total,
      numrender: textResult.pages.length,
      info: infoResult.info,
      metadata: infoResult.metadata,
      text: textResult.text
    };
  } finally {
    // Always release the pdf.js document, even when parsing fails.
    await parser.destroy();
  }
};
|
|
1660
|
+
|
|
80
1661
|
class PdfService extends import_core.Service {
|
|
81
1662
|
static serviceType = import_core.ServiceType.PDF;
|
|
82
1663
|
capabilityDescription = "The agent is able to convert PDF files to text";
|
|
83
|
-
static async start(
|
|
84
|
-
|
|
85
|
-
return service;
|
|
1664
|
+
static async start(_runtime) {
|
|
1665
|
+
return new PdfService;
|
|
86
1666
|
}
|
|
87
1667
|
static async stop(runtime) {
|
|
88
1668
|
const service = runtime.getService(import_core.ServiceType.PDF);
|
|
89
1669
|
if (service) {
|
|
90
|
-
await service.stop();
|
|
1670
|
+
await service.stop?.();
|
|
91
1671
|
}
|
|
92
1672
|
}
|
|
93
1673
|
// Instance-level stop hook; the body is empty, so calling it does nothing.
async stop() {}
|
|
94
1674
|
async convertPdfToText(pdfBuffer) {
|
|
95
1675
|
try {
|
|
96
|
-
const
|
|
97
|
-
|
|
98
|
-
const numPages = pdf.numPages;
|
|
99
|
-
const textPages = [];
|
|
100
|
-
for (let pageNum = 1;pageNum <= numPages; pageNum++) {
|
|
101
|
-
const page = await pdf.getPage(pageNum);
|
|
102
|
-
const textContent = await page.getTextContent();
|
|
103
|
-
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
|
|
104
|
-
textPages.push(pageText);
|
|
105
|
-
}
|
|
106
|
-
const rawText = textPages.join(`
|
|
107
|
-
`);
|
|
108
|
-
return this.cleanUpContent(rawText);
|
|
1676
|
+
const data = await pdfParse(pdfBuffer);
|
|
1677
|
+
return this.cleanUpContent(data.text);
|
|
109
1678
|
} catch (error) {
|
|
110
|
-
import_core.logger.error(`PdfService: Failed to convert PDF to text - error: ${error}
|
|
1679
|
+
import_core.logger.error(`PdfService: Failed to convert PDF to text - error: ${error}`);
|
|
111
1680
|
throw error;
|
|
112
1681
|
}
|
|
113
1682
|
}
|
|
114
1683
|
async convertPdfToTextWithOptions(pdfBuffer, options = {}) {
|
|
115
1684
|
try {
|
|
116
|
-
const
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
const
|
|
121
|
-
|
|
122
|
-
for (let pageNum = startPage;pageNum <= endPage; pageNum++) {
|
|
123
|
-
const page = await pdf.getPage(pageNum);
|
|
124
|
-
const textContent = await page.getTextContent();
|
|
125
|
-
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(options.preserveWhitespace ? "" : " ");
|
|
126
|
-
textPages.push(pageText);
|
|
127
|
-
}
|
|
128
|
-
let text = textPages.join(`
|
|
129
|
-
`);
|
|
1685
|
+
const parseOptions = {};
|
|
1686
|
+
if (options.endPage) {
|
|
1687
|
+
parseOptions.max = options.endPage;
|
|
1688
|
+
}
|
|
1689
|
+
const data = await pdfParse(pdfBuffer, parseOptions);
|
|
1690
|
+
let text = data.text;
|
|
130
1691
|
if (options.cleanContent !== false) {
|
|
131
1692
|
text = this.cleanUpContent(text);
|
|
132
1693
|
}
|
|
133
1694
|
return {
|
|
134
1695
|
success: true,
|
|
135
1696
|
text,
|
|
136
|
-
pageCount:
|
|
1697
|
+
pageCount: data.numpages
|
|
137
1698
|
};
|
|
138
1699
|
} catch (error) {
|
|
139
1700
|
return {
|
|
@@ -143,54 +1704,30 @@ class PdfService extends import_core.Service {
|
|
|
143
1704
|
}
|
|
144
1705
|
}
|
|
145
1706
|
async getDocumentInfo(pdfBuffer) {
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
for (let pageNum = 1;pageNum <= numPages; pageNum++) {
|
|
164
|
-
const page = await pdf.getPage(pageNum);
|
|
165
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
166
|
-
const textContent = await page.getTextContent();
|
|
167
|
-
const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
|
|
168
|
-
pages.push({
|
|
169
|
-
pageNumber: pageNum,
|
|
170
|
-
width: viewport.width,
|
|
171
|
-
height: viewport.height,
|
|
172
|
-
text: this.cleanUpContent(pageText)
|
|
173
|
-
});
|
|
174
|
-
allText.push(pageText);
|
|
175
|
-
}
|
|
176
|
-
return {
|
|
177
|
-
pageCount: numPages,
|
|
178
|
-
metadata,
|
|
179
|
-
text: this.cleanUpContent(allText.join(`
|
|
180
|
-
`)),
|
|
181
|
-
pages
|
|
182
|
-
};
|
|
1707
|
+
try {
|
|
1708
|
+
const data = await pdfParse(pdfBuffer);
|
|
1709
|
+
return {
|
|
1710
|
+
pageCount: data.numpages,
|
|
1711
|
+
metadata: {
|
|
1712
|
+
title: data.info?.Title,
|
|
1713
|
+
author: data.info?.Author,
|
|
1714
|
+
creator: data.info?.Creator,
|
|
1715
|
+
producer: data.info?.Producer
|
|
1716
|
+
},
|
|
1717
|
+
text: this.cleanUpContent(data.text),
|
|
1718
|
+
pages: []
|
|
1719
|
+
};
|
|
1720
|
+
} catch (error) {
|
|
1721
|
+
import_core.logger.error(`PdfService: Failed to get document info - error: ${error}`);
|
|
1722
|
+
throw error;
|
|
1723
|
+
}
|
|
183
1724
|
}
|
|
184
1725
|
cleanUpContent(content) {
|
|
185
1726
|
try {
|
|
186
|
-
const
|
|
187
|
-
const charCode = char.charCodeAt(0);
|
|
188
|
-
return !(charCode === 0 || charCode >= 1 && charCode <= 8 || charCode >= 11 && charCode <= 12 || charCode >= 14 && charCode <= 31 || charCode === 127);
|
|
189
|
-
}).join("");
|
|
190
|
-
const cleaned = filtered.replace(/[^\S\r\n]+/g, " ").replace(/[ \t]+(\r?\n)/g, "$1").trim();
|
|
1727
|
+
const cleaned = content.replace(/[^\S\r\n]+/g, " ").replace(/[ \t]+(\r?\n)/g, "$1").trim();
|
|
191
1728
|
return cleaned;
|
|
192
1729
|
} catch (error) {
|
|
193
|
-
import_core.logger.error(`PdfService: Failed to clean up content - error: ${error}
|
|
1730
|
+
import_core.logger.error(`PdfService: Failed to clean up content - error: ${error}`);
|
|
194
1731
|
return content;
|
|
195
1732
|
}
|
|
196
1733
|
}
|
|
@@ -202,6 +1739,5 @@ var pdfPlugin = {
|
|
|
202
1739
|
services: [PdfService],
|
|
203
1740
|
actions: []
|
|
204
1741
|
};
|
|
205
|
-
var typescript_default = pdfPlugin;
|
|
206
1742
|
|
|
207
|
-
//# debugId=
|
|
1743
|
+
//# debugId=11EC03194FE63DF364756E2164756E21
|