@elizaos/plugin-pdf 2.0.0-alpha.11 → 2.0.0-alpha.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1640 +1,68 @@
1
- var __defProp = Object.defineProperty;
2
- var __export = (target, all) => {
3
- for (var name in all)
4
- __defProp(target, name, {
5
- get: all[name],
6
- enumerable: true,
7
- configurable: true,
8
- set: (newValue) => all[name] = () => newValue
9
- });
10
- };
11
-
12
1
  // services/pdf.ts
13
- import { Service, ServiceType, logger } from "@elizaos/core";
14
-
15
- // node_modules/pdf-parse/dist/pdf-parse/esm/index.js
16
- var exports_esm = {};
17
- __export(exports_esm, {
18
- getException: () => getException,
19
- VerbosityLevel: () => VerbosityLevel2,
20
- UnknownErrorException: () => UnknownErrorException,
21
- Table: () => Table,
22
- Shape: () => Shape,
23
- ResponseException: () => ResponseException,
24
- Rectangle: () => Rectangle,
25
- Point: () => Point,
26
- PasswordException: () => PasswordException,
27
- PDFParse: () => PDFParse,
28
- LineStore: () => LineStore,
29
- LineDirection: () => LineDirection,
30
- Line: () => Line,
31
- InvalidPDFException: () => InvalidPDFException,
32
- FormatError: () => FormatError,
33
- AbortException: () => AbortException
34
- });
35
-
36
- // node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js
37
- import * as pdfjs2 from "pdfjs-dist/legacy/build/pdf.mjs";
38
-
39
- // node_modules/pdf-parse/dist/pdf-parse/esm/Exception.js
40
- class InvalidPDFException extends Error {
41
- constructor(message, cause) {
42
- if (cause !== undefined) {
43
- super(message ?? "Invalid PDF", { cause });
44
- } else {
45
- super(message ?? "Invalid PDF");
46
- }
47
- this.name = "InvalidPDFException";
48
- Object.setPrototypeOf(this, InvalidPDFException.prototype);
49
- if (typeof Error.captureStackTrace === "function") {
50
- Error.captureStackTrace(this, InvalidPDFException);
51
- }
52
- }
53
- }
54
-
55
- class PasswordException extends Error {
56
- constructor(message, cause) {
57
- if (cause !== undefined) {
58
- super(message ?? "Password required or incorrect", { cause });
59
- } else {
60
- super(message ?? "Password required or incorrect");
61
- }
62
- this.name = "PasswordException";
63
- Object.setPrototypeOf(this, PasswordException.prototype);
64
- if (typeof Error.captureStackTrace === "function") {
65
- Error.captureStackTrace(this, PasswordException);
66
- }
67
- }
68
- }
69
-
70
- class FormatError extends Error {
71
- constructor(message, cause) {
72
- if (cause !== undefined) {
73
- super(message ?? "PDF format error", { cause });
74
- } else {
75
- super(message ?? "PDF format error");
76
- }
77
- this.name = "FormatError";
78
- Object.setPrototypeOf(this, FormatError.prototype);
79
- if (typeof Error.captureStackTrace === "function") {
80
- Error.captureStackTrace(this, FormatError);
81
- }
82
- }
83
- }
84
-
85
- class UnknownErrorException extends Error {
86
- constructor(message, details, cause) {
87
- if (cause !== undefined) {
88
- super(message ?? "Unknown error", { cause });
89
- } else {
90
- super(message ?? "Unknown error");
91
- }
92
- this.name = "UnknownErrorException";
93
- Object.setPrototypeOf(this, UnknownErrorException.prototype);
94
- if (typeof Error.captureStackTrace === "function") {
95
- Error.captureStackTrace(this, UnknownErrorException);
96
- }
97
- this.details = details;
98
- }
99
- }
100
-
101
- class ResponseException extends Error {
102
- constructor(message, status, missing, cause) {
103
- if (cause !== undefined) {
104
- super(message ?? "Response error", { cause });
105
- } else {
106
- super(message ?? "Response error");
107
- }
108
- this.name = "ResponseException";
109
- Object.setPrototypeOf(this, ResponseException.prototype);
110
- if (typeof Error.captureStackTrace === "function") {
111
- Error.captureStackTrace(this, ResponseException);
112
- }
113
- this.status = status;
114
- this.missing = missing;
115
- }
116
- }
117
-
118
- class AbortException extends Error {
119
- constructor(message, cause) {
120
- if (cause !== undefined) {
121
- super(message ?? "Operation aborted", { cause });
122
- } else {
123
- super(message ?? "Operation aborted");
124
- }
125
- this.name = "AbortException";
126
- Object.setPrototypeOf(this, AbortException.prototype);
127
- if (typeof Error.captureStackTrace === "function") {
128
- Error.captureStackTrace(this, AbortException);
129
- }
130
- }
131
- }
132
- function getException(error) {
133
- if (error instanceof Error) {
134
- switch (error.name) {
135
- case "InvalidPDFException":
136
- return new InvalidPDFException(error.message, error);
137
- case "PasswordException":
138
- return new PasswordException(error.message, error);
139
- case "FormatError":
140
- return new FormatError(error.message, error);
141
- case "UnknownErrorException":
142
- return new UnknownErrorException(error.message, error.details, error);
143
- case "ResponseException":
144
- return new ResponseException(error.message, error.status, error.missing, error);
145
- case "AbortException":
146
- return new AbortException(error.message, error);
147
- default:
148
- return error;
149
- }
150
- }
151
- return new Error(String(error));
152
- }
153
-
154
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Shape.js
155
- class Shape {
156
- static tolerance = 2;
157
- static applyTransform(p, m) {
158
- const xt = p[0] * m[0] + p[1] * m[2] + m[4];
159
- const yt = p[0] * m[1] + p[1] * m[3] + m[5];
160
- return [xt, yt];
161
- }
162
- }
163
-
164
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Point.js
165
- class Point extends Shape {
166
- x;
167
- y;
168
- constructor(x, y) {
169
- super();
170
- this.x = x;
171
- this.y = y;
172
- }
173
- equal(point) {
174
- return point.x === this.x && point.y === this.y;
175
- }
176
- transform(matrix) {
177
- const p = Shape.applyTransform([this.x, this.y], matrix);
178
- this.x = p[0];
179
- this.y = p[1];
180
- return this;
181
- }
182
- }
183
-
184
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Line.js
185
- var LineDirection;
186
- (function(LineDirection2) {
187
- LineDirection2[LineDirection2["None"] = 0] = "None";
188
- LineDirection2[LineDirection2["Horizontal"] = 1] = "Horizontal";
189
- LineDirection2[LineDirection2["Vertical"] = 2] = "Vertical";
190
- })(LineDirection || (LineDirection = {}));
191
-
192
- class Line extends Shape {
193
- from;
194
- to;
195
- direction = LineDirection.None;
196
- length = 0;
197
- intersections = [];
198
- gaps = [];
199
- constructor(from, to) {
200
- super();
201
- this.from = from;
202
- this.to = to;
203
- this.init();
204
- }
205
- init() {
206
- let from = this.from;
207
- let to = this.to;
208
- if (Math.abs(from.y - to.y) < Shape.tolerance) {
209
- this.direction = LineDirection.Horizontal;
210
- to.y = from.y;
211
- if (from.x > to.x) {
212
- const temp = from;
213
- from = to;
214
- to = temp;
215
- }
216
- this.length = to.x - from.x;
217
- } else if (Math.abs(from.x - to.x) < Shape.tolerance) {
218
- this.direction = LineDirection.Vertical;
219
- to.x = from.x;
220
- if (from.y > to.y) {
221
- const temp = from;
222
- from = to;
223
- to = temp;
224
- }
225
- this.length = to.y - from.y;
226
- }
227
- this.from = from;
228
- this.to = to;
229
- }
230
- _valid = undefined;
231
- get valid() {
232
- if (this._valid === undefined) {
233
- this._valid = this.direction !== LineDirection.None && this.length > Shape.tolerance;
234
- }
235
- return this._valid;
236
- }
237
- get normalized() {
238
- if (this.direction === LineDirection.Horizontal) {
239
- return new Line(new Point(this.from.x - Shape.tolerance, this.from.y), new Point(this.to.x + Shape.tolerance, this.from.y));
240
- } else if (this.direction === LineDirection.Vertical) {
241
- return new Line(new Point(this.from.x, this.from.y - Shape.tolerance), new Point(this.from.x, this.to.y + Shape.tolerance));
242
- }
243
- return this;
244
- }
245
- addGap(line) {
246
- this.gaps.push(line);
247
- }
248
- containsPoint(p) {
249
- if (this.direction === LineDirection.Vertical) {
250
- return this.from.x === p.x && p.y >= this.from.y && p.y <= this.to.y;
251
- } else if (this.direction === LineDirection.Horizontal) {
252
- return this.from.y === p.y && p.x >= this.from.x && p.x <= this.to.x;
253
- }
254
- return false;
255
- }
256
- addIntersectionPoint(point) {
257
- for (const intPoint of this.intersections) {
258
- if (intPoint.equal(point))
259
- return;
260
- }
261
- this.intersections.push(point);
262
- }
263
- intersection(line) {
264
- let result;
265
- if (!this.valid || !line.valid) {
266
- return result;
267
- }
268
- const thisNormalized = this.normalized;
269
- const lineNormalized = line.normalized;
270
- if (this.direction === LineDirection.Horizontal && line.direction === LineDirection.Vertical) {
271
- const x = lineNormalized.from.x;
272
- const y = thisNormalized.from.y;
273
- const isOk = x > thisNormalized.from.x && x < thisNormalized.to.x && y > lineNormalized.from.y && y < lineNormalized.to.y;
274
- if (isOk) {
275
- const intPoint = new Point(x, y);
276
- this.addIntersectionPoint(intPoint);
277
- line.addIntersectionPoint(intPoint);
278
- result = intPoint;
279
- }
280
- } else if (this.direction === LineDirection.Vertical && line.direction === LineDirection.Horizontal) {
281
- const x = thisNormalized.from.x;
282
- const y = lineNormalized.from.y;
283
- const isOk = x > lineNormalized.from.x && x < lineNormalized.to.x && y > thisNormalized.from.y && y < thisNormalized.to.y;
284
- if (isOk) {
285
- const intPoint = new Point(x, y);
286
- this.addIntersectionPoint(intPoint);
287
- line.addIntersectionPoint(intPoint);
288
- result = intPoint;
289
- }
290
- }
291
- return result;
292
- }
293
- transform(matrix) {
294
- const p1 = this.from.transform(matrix);
295
- const p2 = this.to.transform(matrix);
296
- const x = Math.min(p1.x, p2.x);
297
- const y = Math.min(p1.y, p2.y);
298
- const width = Math.abs(p1.x - p2.x);
299
- const height = Math.abs(p1.y - p2.y);
300
- this.from = new Point(x, y);
301
- this.to = new Point(x + width, y + height);
302
- this.init();
303
- return this;
304
- }
305
- }
306
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/TableData.js
307
- class TableData {
308
- minXY;
309
- maxXY;
310
- rows;
311
- rowPivots;
312
- colPivots;
313
- constructor(minXY, maxXY, rowPivots, colPivots) {
314
- this.minXY = minXY;
315
- this.maxXY = maxXY;
316
- this.rows = [];
317
- this.rowPivots = rowPivots;
318
- this.colPivots = colPivots;
319
- }
320
- findCell(x, y) {
321
- if (x >= this.minXY.x && y >= this.minXY.y && x <= this.maxXY.x && y <= this.maxXY.y) {
322
- for (const row of this.rows) {
323
- for (const cell of row) {
324
- if (cell.minXY.x <= x && cell.minXY.y <= y && cell.maxXY.x >= x && cell.maxXY.y >= y) {
325
- return cell;
326
- }
327
- }
328
- }
329
- }
330
- return;
331
- }
332
- get cellCount() {
333
- return this.rows.reduce((acc, row) => acc + row.length, 0);
334
- }
335
- get rowCount() {
336
- return this.rows.length;
337
- }
338
- check() {
339
- const virtualCellCount = (this.colPivots.length - 1) * (this.rowPivots.length - 1);
340
- let allCellCount = 0;
341
- for (const row of this.rows) {
342
- for (const cell of row) {
343
- const count = (cell.colspan || 1) * (cell.rowspan || 1);
344
- allCellCount += count;
345
- }
346
- }
347
- if (virtualCellCount !== allCellCount) {
348
- return false;
349
- }
350
- return true;
351
- }
352
- toArray() {
353
- const tableArr = [];
354
- for (const row of this.rows) {
355
- const rowArr = [];
356
- for (const cell of row) {
357
- let text = cell.text.join("");
358
- text = text.replace(/^[\s]+|[\s]+$/g, "");
359
- text = text.trim();
360
- rowArr.push(text);
361
- }
362
- tableArr.push(rowArr);
363
- }
364
- return tableArr;
365
- }
2
+ import { logger, Service, ServiceType } from "@elizaos/core";
3
+ import pkg from "pdfjs-dist";
4
+ var { getDocument } = pkg;
5
+ function isTextItem(item) {
6
+ return "str" in item;
366
7
  }
367
8
 
368
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Table.js
369
- class Table {
370
- hLines = [];
371
- vLines = [];
372
- constructor(line) {
373
- if (line.direction === LineDirection.Horizontal) {
374
- this.hLines.push(line);
375
- } else if (line.direction === LineDirection.Vertical) {
376
- this.vLines.push(line);
377
- }
378
- }
379
- get isValid() {
380
- return this.hLines.length + this.vLines.length > 4;
381
- }
382
- get rowPivots() {
383
- const rowSet = new Set;
384
- for (const line of this.hLines) {
385
- rowSet.add(line.from.y);
386
- }
387
- return [...rowSet].sort((a, b) => a - b);
388
- }
389
- get colPivots() {
390
- const colSet = new Set;
391
- for (const line of this.vLines) {
392
- colSet.add(line.from.x);
393
- }
394
- return [...colSet].sort((a, b) => a - b);
395
- }
396
- add(line) {
397
- const hasIntersection = this.intersection(line);
398
- if (hasIntersection) {
399
- if (line.direction === LineDirection.Horizontal) {
400
- this.hLines.push(line);
401
- return true;
402
- } else if (line.direction === LineDirection.Vertical) {
403
- this.vLines.push(line);
404
- return true;
405
- }
406
- }
407
- return false;
408
- }
409
- intersection(line) {
410
- let flag = false;
411
- if (!line.valid)
412
- return flag;
413
- if (line.direction === LineDirection.Horizontal) {
414
- for (const vLine of this.vLines) {
415
- const p = line.intersection(vLine);
416
- if (p) {
417
- flag = true;
418
- }
419
- }
420
- } else if (line.direction === LineDirection.Vertical) {
421
- for (const hLine of this.hLines) {
422
- const p = line.intersection(hLine);
423
- if (p) {
424
- flag = true;
425
- }
426
- }
427
- }
428
- return flag;
429
- }
430
- getSameHorizontal(line) {
431
- const same = [line];
432
- const other = [];
433
- while (this.hLines.length > 0) {
434
- const hLine = this.hLines.shift();
435
- if (!hLine)
436
- continue;
437
- if (hLine.from.y === line.from.y) {
438
- same.push(hLine);
439
- } else {
440
- other.push(hLine);
441
- }
442
- }
443
- this.hLines = other;
444
- return same;
445
- }
446
- getSameVertical(line) {
447
- const same = [line];
448
- const other = [];
449
- while (this.vLines.length > 0) {
450
- const vLine = this.vLines.shift();
451
- if (!vLine)
452
- continue;
453
- if (vLine.from.x === line.from.x) {
454
- same.push(vLine);
455
- } else {
456
- other.push(vLine);
457
- }
458
- }
459
- this.vLines = other;
460
- return same;
461
- }
462
- mergeHorizontalLines(lines) {
463
- lines.sort((l1, l2) => l1.from.x - l2.from.x);
464
- const minX = lines[0].from.x;
465
- const maxX = lines[lines.length - 1].to.x;
466
- const resultLine = new Line(new Point(minX, lines[0].from.y), new Point(maxX, lines[0].from.y));
467
- for (let i = 1;i < lines.length; i++) {
468
- const prevLine = lines[i - 1];
469
- const currLine = lines[i];
470
- if (Math.abs(prevLine.to.x - currLine.from.x) > Shape.tolerance) {
471
- const gapLine = new Line(new Point(prevLine.to.x, prevLine.from.y), new Point(currLine.from.x, currLine.from.y));
472
- resultLine.addGap(gapLine);
473
- }
474
- }
475
- return resultLine;
476
- }
477
- mergeVerticalLines(lines) {
478
- lines.sort((l1, l2) => l1.from.y - l2.from.y);
479
- const minY = lines[0].from.y;
480
- const maxY = lines[lines.length - 1].to.y;
481
- const resultLine = new Line(new Point(lines[0].from.x, minY), new Point(lines[0].from.x, maxY));
482
- for (let i = 1;i < lines.length; i++) {
483
- const prevLine = lines[i - 1];
484
- const currLine = lines[i];
485
- if (Math.abs(prevLine.to.y - currLine.from.y) > Shape.tolerance) {
486
- const gapLine = new Line(new Point(prevLine.to.x, prevLine.to.y), new Point(prevLine.to.x, currLine.from.y));
487
- resultLine.addGap(gapLine);
488
- }
489
- }
490
- return resultLine;
491
- }
492
- normalize() {
493
- this.hLines = this.hLines.filter((l) => l.intersections.length > 1);
494
- this.vLines = this.vLines.filter((l) => l.intersections.length > 1);
495
- this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
496
- this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
497
- const newHLines = [];
498
- while (this.hLines.length > 0) {
499
- const line = this.hLines.shift();
500
- if (!line)
501
- continue;
502
- const lines = this.getSameHorizontal(line);
503
- const merged = this.mergeHorizontalLines(lines);
504
- newHLines.push(merged);
505
- }
506
- this.hLines = newHLines;
507
- const newVLines = [];
508
- while (this.vLines.length > 0) {
509
- const line = this.vLines.shift();
510
- if (!line)
511
- continue;
512
- const lines = this.getSameVertical(line);
513
- const merged = this.mergeVerticalLines(lines);
514
- newVLines.push(merged);
515
- }
516
- this.vLines = newVLines;
517
- }
518
- verticalExists(line, y1, y2) {
519
- if (line.direction !== LineDirection.Vertical) {
520
- throw new Error("Line is not vertical");
521
- }
522
- if (y1 >= y2) {
523
- throw new Error("y1 must be less than y2");
524
- }
525
- if (line.from.y <= y1 && line.to.y >= y2) {
526
- for (const gap of line.gaps) {
527
- if (gap.from.y <= y1 && gap.to.y >= y2) {
528
- return false;
529
- }
530
- }
531
- return true;
532
- }
533
- return false;
534
- }
535
- horizontalExists(line, x1, x2) {
536
- if (line.direction !== LineDirection.Horizontal) {
537
- throw new Error("Line is not horizontal");
538
- }
539
- if (x1 >= x2) {
540
- throw new Error("x1 must be less than x2");
541
- }
542
- if (line.from.x <= x1 && line.to.x >= x2) {
543
- for (const gap of line.gaps) {
544
- if (gap.from.x <= x1 && gap.to.x >= x2) {
545
- return false;
546
- }
547
- }
548
- return true;
549
- }
550
- return false;
551
- }
552
- findBottomLineIndex(h2Index, xMiddle) {
553
- for (let i = h2Index;i < this.hLines.length; i++) {
554
- const hLine = this.hLines[i];
555
- if (hLine.from.x <= xMiddle && hLine.to.x >= xMiddle) {
556
- return i;
557
- }
558
- }
559
- return -1;
560
- }
561
- findVerticalLineIndexs(topHLine, yMiddle) {
562
- const result = [];
563
- for (let i = 0;i < this.vLines.length; i++) {
564
- const vLine = this.vLines[i];
565
- if (vLine.from.y <= yMiddle && vLine.to.y >= yMiddle && topHLine.intersection(vLine)) {
566
- result.push(i);
567
- }
568
- }
569
- return result;
570
- }
571
- getRow(h1Index, h2Index, yMiddle) {
572
- const tableRow = [];
573
- const topHLine = this.hLines[h1Index];
574
- const vLineIndexes = this.findVerticalLineIndexs(topHLine, yMiddle);
575
- for (let i = 1;i < vLineIndexes.length; i++) {
576
- const leftVLine = this.vLines[vLineIndexes[i - 1]];
577
- const rightVLine = this.vLines[vLineIndexes[i]];
578
- const xMiddle = (leftVLine.from.x + rightVLine.from.x) / 2;
579
- const bottomHLineIndex = this.findBottomLineIndex(h2Index, xMiddle);
580
- const bottomHLine = this.hLines[bottomHLineIndex];
581
- const tableCell = {
582
- minXY: new Point(leftVLine.from.x, topHLine.from.y),
583
- maxXY: new Point(rightVLine.from.x, bottomHLine.from.y),
584
- width: rightVLine.from.x - leftVLine.from.x,
585
- height: bottomHLine.from.y - topHLine.from.y,
586
- text: []
587
- };
588
- const colSpan = vLineIndexes[i] - vLineIndexes[i - 1];
589
- const rowSpan = bottomHLineIndex - h1Index;
590
- if (colSpan > 1) {
591
- tableCell.colspan = colSpan;
592
- }
593
- if (rowSpan > 1) {
594
- tableCell.rowspan = rowSpan;
595
- }
596
- tableRow.push(tableCell);
597
- }
598
- return tableRow;
599
- }
600
- toData() {
601
- const rowPivots = this.rowPivots;
602
- const colPivots = this.colPivots;
603
- const minXY = new Point(colPivots[0], rowPivots[0]);
604
- const maxXY = new Point(colPivots[colPivots.length - 1], rowPivots[rowPivots.length - 1]);
605
- const result = new TableData(minXY, maxXY, rowPivots, colPivots);
606
- for (let h1 = 1;h1 < this.hLines.length; h1++) {
607
- const prevHLine = this.hLines[h1 - 1];
608
- const currHLine = this.hLines[h1];
609
- const YMiddle = (prevHLine.from.y + currHLine.from.y) / 2;
610
- const rowData = this.getRow(h1 - 1, h1, YMiddle);
611
- result.rows.push(rowData);
612
- }
613
- return result;
614
- }
615
- }
616
-
617
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/LineStore.js
618
- class LineStore {
619
- hLines = [];
620
- vLines = [];
621
- add(line) {
622
- if (line.valid) {
623
- if (line.direction === LineDirection.Horizontal) {
624
- this.hLines.push(line);
625
- } else if (line.direction === LineDirection.Vertical) {
626
- this.vLines.push(line);
627
- }
628
- }
629
- }
630
- addRectangle(rect) {
631
- for (const line of rect.getLines()) {
632
- this.add(line);
633
- }
634
- }
635
- getTableData() {
636
- const result = [];
637
- const tables = this.getTables();
638
- for (const table of tables) {
639
- const data = table.toData();
640
- if (data) {
641
- result.push(data);
642
- }
643
- }
644
- return result;
645
- }
646
- getTables() {
647
- const result = [];
648
- while (this.hLines.length !== 0) {
649
- const hLine = this.hLines.shift();
650
- if (!hLine)
651
- continue;
652
- const filled = this.tryFill(result, hLine);
653
- if (filled)
654
- continue;
655
- const table = new Table(hLine);
656
- this.fillTable(table);
657
- result.push(table);
658
- }
659
- while (this.vLines.length !== 0) {
660
- const vLine = this.vLines.shift();
661
- if (!vLine)
662
- continue;
663
- const filled = this.tryFill(result, vLine);
664
- if (filled)
665
- continue;
666
- const table = new Table(vLine);
667
- this.fillTable(table);
668
- result.push(table);
669
- }
670
- const validTables = result.filter((t) => t.isValid);
671
- for (const table of validTables) {
672
- table.normalize();
673
- }
674
- return validTables;
675
- }
676
- normalize() {
677
- this.normalizeHorizontal();
678
- this.normalizeVertical();
679
- }
680
- normalizeHorizontal() {
681
- this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
682
- const newLines = [];
683
- let sameY = [];
684
- for (const line of this.hLines) {
685
- if (sameY.length === 0) {
686
- sameY.push(line);
687
- } else if (Math.abs(sameY[0]?.from.y - line.from.y) < Shape.tolerance) {
688
- sameY.push(line);
689
- } else {
690
- const merged = this.margeHorizontalLines(sameY);
691
- newLines.push(...merged);
692
- sameY = [line];
693
- }
694
- }
695
- if (sameY.length > 0) {
696
- const merged = this.margeHorizontalLines(sameY);
697
- newLines.push(...merged);
698
- }
699
- this.hLines = newLines;
700
- }
701
- normalizeVertical() {
702
- this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
703
- const newLines = [];
704
- let sameX = [];
705
- for (const line of this.vLines) {
706
- if (sameX.length === 0) {
707
- sameX.push(line);
708
- } else if (Math.abs(sameX[0]?.from.x - line.from.x) < Shape.tolerance) {
709
- sameX.push(line);
710
- } else {
711
- const merged = this.margeVerticalLines(sameX);
712
- newLines.push(...merged);
713
- sameX = [line];
714
- }
715
- }
716
- if (sameX.length > 0) {
717
- const merged = this.margeVerticalLines(sameX);
718
- newLines.push(...merged);
719
- }
720
- this.vLines = newLines;
721
- }
722
- fillTable(table) {
723
- const newVLines = [];
724
- const newHLines = [];
725
- for (const vLine of this.vLines) {
726
- if (!table.add(vLine)) {
727
- newVLines.push(vLine);
728
- }
729
- }
730
- for (const hLine of this.hLines) {
731
- if (!table.add(hLine)) {
732
- newHLines.push(hLine);
733
- }
734
- }
735
- this.hLines = newHLines;
736
- this.vLines = newVLines;
737
- }
738
- tryFill(tables, line) {
739
- for (const table of tables) {
740
- if (table.add(line)) {
741
- this.fillTable(table);
742
- return true;
743
- }
744
- }
745
- return false;
746
- }
747
- margeHorizontalLines(sameYLines) {
748
- const result = [];
749
- sameYLines.sort((l1, l2) => l1.from.x - l2.from.x);
750
- const sameY = sameYLines[0]?.from.y;
751
- if (sameY === undefined)
752
- return result;
753
- let minX = Number.MAX_SAFE_INTEGER;
754
- let maxX = Number.MIN_SAFE_INTEGER;
755
- for (const line of sameYLines) {
756
- if (line.from.x - maxX < Shape.tolerance) {
757
- if (line.from.x < minX) {
758
- minX = line.from.x;
759
- }
760
- if (line.to.x > maxX) {
761
- maxX = line.to.x;
762
- }
763
- } else {
764
- if (maxX > minX) {
765
- result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
766
- }
767
- minX = line.from.x;
768
- maxX = line.to.x;
769
- }
770
- }
771
- const last = result[result.length - 1];
772
- if (last) {
773
- if (last.from.x !== minX && last.to.x !== maxX) {
774
- result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
775
- }
776
- } else {
777
- result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
778
- }
779
- return result;
780
- }
781
- margeVerticalLines(sameXLines) {
782
- const result = [];
783
- sameXLines.sort((l1, l2) => l1.from.y - l2.from.y);
784
- const sameX = sameXLines[0]?.from.x;
785
- if (sameX === undefined)
786
- return result;
787
- let minY = Number.MAX_SAFE_INTEGER;
788
- let maxY = Number.MIN_SAFE_INTEGER;
789
- for (const line of sameXLines) {
790
- if (line.from.y - maxY < Shape.tolerance) {
791
- if (line.from.y < minY) {
792
- minY = line.from.y;
793
- }
794
- if (line.to.y > maxY) {
795
- maxY = line.to.y;
796
- }
797
- } else {
798
- if (maxY > minY) {
799
- result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
800
- }
801
- minY = line.from.y;
802
- maxY = line.to.y;
803
- }
804
- }
805
- const last = result[result.length - 1];
806
- if (last) {
807
- if (last.from.y !== minY && last.to.y !== maxY) {
808
- result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
809
- }
810
- } else {
811
- result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
812
- }
813
- return result;
814
- }
815
- }
816
- // node_modules/pdf-parse/dist/pdf-parse/esm/geometry/Rectangle.js
817
- class Rectangle extends Shape {
818
- from;
819
- width;
820
- height;
821
- constructor(from, width, height) {
822
- super();
823
- this.from = from;
824
- this.width = width;
825
- this.height = height;
826
- }
827
- get to() {
828
- return new Point(this.from.x + this.width, this.from.y + this.height);
829
- }
830
- getLines() {
831
- const to = this.to;
832
- const lines = [
833
- new Line(this.from, new Point(to.x, this.from.y)),
834
- new Line(this.from, new Point(this.from.x, to.y)),
835
- new Line(new Point(to.x, this.from.y), to),
836
- new Line(new Point(this.from.x, to.y), to)
837
- ];
838
- return lines.filter((l) => l.valid);
839
- }
840
- transform(matrix) {
841
- const p1 = Shape.applyTransform([this.from.x, this.from.y], matrix);
842
- const p2 = Shape.applyTransform([this.from.x + this.width, this.from.y + this.height], matrix);
843
- const x = Math.min(p1[0], p2[0]);
844
- const y = Math.min(p1[1], p2[1]);
845
- const width = Math.abs(p1[0] - p2[0]);
846
- const height = Math.abs(p1[1] - p2[1]);
847
- this.from = new Point(x, y);
848
- this.width = width;
849
- this.height = height;
850
- return this;
851
- }
852
- }
853
- // node_modules/pdf-parse/dist/pdf-parse/esm/ImageResult.js
854
- class ImageResult {
855
- pages = [];
856
- total = 0;
857
- getPageImage(num, name) {
858
- for (const pageData of this.pages) {
859
- if (pageData.pageNumber === num) {
860
- for (const img of pageData.images) {
861
- if (img.name === name) {
862
- return img;
863
- }
864
- }
865
- }
866
- }
867
- return null;
868
- }
869
- constructor(total) {
870
- this.total = total;
871
- }
872
- }
873
-
874
- // node_modules/pdf-parse/dist/pdf-parse/esm/InfoResult.js
875
- import * as pdfjs from "pdfjs-dist/legacy/build/pdf.mjs";
876
- var XMP_DATE_PROPERTIES = [
877
- "xmp:createdate",
878
- "xmp:modifydate",
879
- "xmp:metadatadate",
880
- "xap:createdate",
881
- "xap:modifydate",
882
- "xap:metadatadate"
883
- ];
884
-
885
- class InfoResult {
886
- total;
887
- info;
888
- metadata;
889
- fingerprints;
890
- permission;
891
- outline;
892
- pages = [];
893
- getDateNode() {
894
- const result = {};
895
- const CreationDate = this.info?.CreationDate;
896
- if (CreationDate) {
897
- result.CreationDate = pdfjs.PDFDateString.toDateObject(CreationDate);
898
- }
899
- const ModDate = this.info?.ModDate;
900
- if (ModDate) {
901
- result.ModDate = pdfjs.PDFDateString.toDateObject(ModDate);
902
- }
903
- if (!this.metadata) {
904
- return result;
905
- }
906
- for (const prop of XMP_DATE_PROPERTIES) {
907
- const value = this.metadata?.get(prop);
908
- const date = this.parseISODateString(value);
909
- switch (prop) {
910
- case XMP_DATE_PROPERTIES[0]:
911
- result.XmpCreateDate = date;
912
- break;
913
- case XMP_DATE_PROPERTIES[1]:
914
- result.XmpModifyDate = date;
915
- break;
916
- case XMP_DATE_PROPERTIES[2]:
917
- result.XmpMetadataDate = date;
918
- break;
919
- case XMP_DATE_PROPERTIES[3]:
920
- result.XapCreateDate = date;
921
- break;
922
- case XMP_DATE_PROPERTIES[4]:
923
- result.XapModifyDate = date;
924
- break;
925
- case XMP_DATE_PROPERTIES[5]:
926
- result.XapMetadataDate = date;
927
- break;
928
- }
929
- }
930
- return result;
931
- }
932
- parseISODateString(isoDateString) {
933
- if (!isoDateString)
934
- return;
935
- const parsedDate = Date.parse(isoDateString);
936
- if (!Number.isNaN(parsedDate)) {
937
- return new Date(parsedDate);
938
- }
939
- return;
940
- }
941
- constructor(total) {
942
- this.total = total;
943
- }
944
- }
945
-
946
- // node_modules/pdf-parse/dist/pdf-parse/esm/ParseParameters.js
947
- function setDefaultParseParameters(params) {
948
- params.lineThreshold = params?.lineThreshold ?? 4.6;
949
- params.cellThreshold = params?.cellThreshold ?? 7;
950
- params.cellSeparator = params?.cellSeparator ?? "\t";
951
- params.lineEnforce = params?.lineEnforce ?? true;
952
- params.pageJoiner = params?.pageJoiner ?? `
953
- -- page_number of total_number --`;
954
- params.imageThreshold = params?.imageThreshold ?? 80;
955
- params.imageDataUrl = params?.imageDataUrl ?? true;
956
- params.imageBuffer = params?.imageBuffer ?? true;
957
- params.scale = params?.scale ?? 1;
958
- return params;
959
- }
960
-
961
- // node_modules/pdf-parse/dist/pdf-parse/esm/PathGeometry.js
962
- var PathGeometry;
963
- (function(PathGeometry2) {
964
- PathGeometry2[PathGeometry2["undefined"] = 0] = "undefined";
965
- PathGeometry2[PathGeometry2["hline"] = 1] = "hline";
966
- PathGeometry2[PathGeometry2["vline"] = 2] = "vline";
967
- PathGeometry2[PathGeometry2["rectangle"] = 3] = "rectangle";
968
- })(PathGeometry || (PathGeometry = {}));
969
- var DrawOPS;
970
- (function(DrawOPS2) {
971
- DrawOPS2[DrawOPS2["moveTo"] = 0] = "moveTo";
972
- DrawOPS2[DrawOPS2["lineTo"] = 1] = "lineTo";
973
- DrawOPS2[DrawOPS2["curveTo"] = 2] = "curveTo";
974
- DrawOPS2[DrawOPS2["closePath"] = 3] = "closePath";
975
- DrawOPS2[DrawOPS2["rectangle"] = 4] = "rectangle";
976
- })(DrawOPS || (DrawOPS = {}));
977
-
978
- // node_modules/pdf-parse/dist/pdf-parse/esm/ScreenshotResult.js
979
- class ScreenshotResult {
980
- pages = [];
981
- total = 0;
982
- constructor(total) {
983
- this.total = total;
984
- }
985
- }
986
-
987
- // node_modules/pdf-parse/dist/pdf-parse/esm/TableResult.js
988
- class TableResult {
989
- pages = [];
990
- mergedTables = [];
991
- total = 0;
992
- constructor(total) {
993
- this.total = total;
994
- }
995
- }
996
-
997
- // node_modules/pdf-parse/dist/pdf-parse/esm/TextResult.js
998
- class TextResult {
999
- pages = [];
1000
- text = "";
1001
- total = 0;
1002
- getPageText(num) {
1003
- for (const pageData of this.pages) {
1004
- if (pageData.num === num)
1005
- return pageData.text;
1006
- }
1007
- return "";
1008
- }
1009
- constructor(total) {
1010
- this.total = total;
1011
- }
1012
- }
1013
-
1014
- // node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js
1015
- class PDFParse {
1016
- options;
1017
- doc;
1018
- progress = { loaded: -1, total: 0 };
1019
- constructor(options) {
1020
- if (options.verbosity === undefined) {
1021
- options.verbosity = pdfjs2.VerbosityLevel.ERRORS;
1022
- }
1023
- if (typeof Buffer !== "undefined" && options.data instanceof Buffer) {
1024
- options.data = new Uint8Array(options.data);
1025
- }
1026
- this.options = options;
1027
- }
1028
- async destroy() {
1029
- if (this.doc) {
1030
- await this.doc.destroy();
1031
- this.doc = undefined;
1032
- }
1033
- }
1034
- static get isNodeJS() {
1035
- const isNodeJS = typeof process === "object" && `${process}` === "[object process]" && !process.versions.nw && !(process.versions.electron && typeof process.type !== "undefined" && process.type !== "browser");
1036
- return isNodeJS;
1037
- }
1038
- static setWorker(workerSrc) {
1039
- if (typeof globalThis.pdfjs === "undefined") {
1040
- globalThis.pdfjs = pdfjs2;
1041
- }
1042
- if (pdfjs2?.GlobalWorkerOptions === null)
1043
- return "";
1044
- if (workerSrc !== undefined) {
1045
- pdfjs2.GlobalWorkerOptions.workerSrc = workerSrc;
1046
- return pdfjs2.GlobalWorkerOptions.workerSrc;
1047
- }
1048
- return pdfjs2.GlobalWorkerOptions.workerSrc;
1049
- }
1050
- async getInfo(params = {}) {
1051
- const doc = await this.load();
1052
- const result = new InfoResult(doc.numPages);
1053
- const { info, metadata } = await doc.getMetadata();
1054
- result.info = info;
1055
- result.metadata = metadata;
1056
- result.fingerprints = doc.fingerprints;
1057
- result.outline = await doc.getOutline();
1058
- result.permission = await doc.getPermissions();
1059
- const pageLabels = await doc.getPageLabels();
1060
- if (params.parsePageInfo) {
1061
- for (let i = 1;i <= result.total; i++) {
1062
- if (this.shouldParse(i, result.total, params)) {
1063
- const page = await doc.getPage(i);
1064
- const pageLinkResult = await this.getPageLinks(page);
1065
- pageLinkResult.pageLabel = pageLabels?.[page.pageNumber];
1066
- result.pages.push(pageLinkResult);
1067
- page.cleanup();
1068
- }
1069
- }
1070
- }
1071
- return result;
1072
- }
1073
- async getPageLinks(page) {
1074
- const viewport = page.getViewport({ scale: 1 });
1075
- const result = {
1076
- pageNumber: page.pageNumber,
1077
- links: [],
1078
- width: viewport.width,
1079
- height: viewport.height
1080
- };
1081
- const annotations = await page.getAnnotations({ intent: "display" }) || [];
1082
- for (const i of annotations) {
1083
- if (i.subtype !== "Link")
1084
- continue;
1085
- const url = i.url ?? i.unsafeUrl;
1086
- if (!url)
1087
- continue;
1088
- const text = i.overlaidText || "";
1089
- result.links.push({ url, text });
1090
- }
1091
- return result;
1092
- }
1093
- async getText(params = {}) {
1094
- const doc = await this.load();
1095
- const result = new TextResult(doc.numPages);
1096
- for (let i = 1;i <= result.total; i++) {
1097
- if (this.shouldParse(i, result.total, params)) {
1098
- const page = await doc.getPage(i);
1099
- const text = await this.getPageText(page, params, result.total);
1100
- result.pages.push({
1101
- text,
1102
- num: i
1103
- });
1104
- page.cleanup();
1105
- }
1106
- }
1107
- for (const page of result.pages) {
1108
- if (params.pageJoiner) {
1109
- let pageNumber = params.pageJoiner.replace("page_number", `${page.num}`);
1110
- pageNumber = pageNumber.replace("total_number", `${result.total}`);
1111
- result.text += `${page.text}
1112
- ${pageNumber}
1113
-
1114
- `;
1115
- } else {
1116
- result.text += `${page.text}
1117
-
1118
- `;
1119
- }
1120
- }
1121
- return result;
1122
- }
1123
- async load() {
1124
- try {
1125
- if (this.doc === undefined) {
1126
- const loadingTask = pdfjs2.getDocument(this.options);
1127
- loadingTask.onProgress = (progress) => {
1128
- this.progress = progress;
1129
- };
1130
- this.doc = await loadingTask.promise;
1131
- }
1132
- return this.doc;
1133
- } catch (error) {
1134
- throw getException(error);
1135
- }
1136
- }
1137
- shouldParse(currentPage, totalPage, params) {
1138
- params.partial = params?.partial ?? [];
1139
- params.first = params?.first ?? 0;
1140
- params.last = params?.last ?? 0;
1141
- if (params.partial.length > 0) {
1142
- if (params.partial.includes(currentPage)) {
1143
- return true;
1144
- }
1145
- return false;
1146
- }
1147
- if (params.first > 0 && params.last > 0) {
1148
- if (currentPage >= params.first && currentPage <= params.last) {
1149
- return true;
1150
- }
1151
- return false;
1152
- }
1153
- if (params.first > 0) {
1154
- if (currentPage <= params.first) {
1155
- return true;
1156
- }
1157
- return false;
1158
- }
1159
- if (params.last > 0) {
1160
- if (currentPage > totalPage - params.last) {
1161
- return true;
1162
- }
1163
- return false;
1164
- }
1165
- return true;
1166
- }
1167
- async getPageText(page, parseParams, total) {
1168
- const viewport = page.getViewport({ scale: 1 });
1169
- const params = setDefaultParseParameters(parseParams);
1170
- const textContent = await page.getTextContent({
1171
- includeMarkedContent: !!params.includeMarkedContent,
1172
- disableNormalization: !!params.disableNormalization
1173
- });
1174
- let links = new Map;
1175
- if (params.parseHyperlinks) {
1176
- links = await this.getHyperlinks(page, viewport);
1177
- }
1178
- const strBuf = [];
1179
- let lastX;
1180
- let lastY;
1181
- let lineHeight = 0;
1182
- for (const item of textContent.items) {
1183
- if (!("str" in item))
1184
- continue;
1185
- const tm = item.transform ?? item.transform;
1186
- const [x, y] = viewport.convertToViewportPoint(tm[4], tm[5]);
1187
- if (params.parseHyperlinks) {
1188
- const posArr = links.get(item.str) || [];
1189
- const hit = posArr.find((l) => x >= l.rect.left && x <= l.rect.right && y >= l.rect.top && y <= l.rect.bottom);
1190
- if (hit) {
1191
- item.str = `[${item.str}](${hit.url})`;
1192
- }
1193
- }
1194
- if (params.lineEnforce) {
1195
- if (lastY !== undefined && Math.abs(lastY - y) > params.lineThreshold) {
1196
- const lastItem = strBuf.length ? strBuf[strBuf.length - 1] : undefined;
1197
- const isCurrentItemHasNewLine = item.str.startsWith(`
1198
- `) || item.str.trim() === "" && item.hasEOL;
1199
- if (lastItem?.endsWith(`
1200
- `) === false && !isCurrentItemHasNewLine) {
1201
- const ydiff = Math.abs(lastY - y);
1202
- if (ydiff - 1 > lineHeight) {
1203
- strBuf.push(`
1204
- `);
1205
- lineHeight = 0;
1206
- }
1207
- }
1208
- }
1209
- }
1210
- if (params.cellSeparator) {
1211
- if (lastY !== undefined && Math.abs(lastY - y) < params.lineThreshold) {
1212
- if (lastX !== undefined && Math.abs(lastX - x) > params.cellThreshold) {
1213
- item.str = `${params.cellSeparator}${item.str}`;
1214
- }
1215
- }
1216
- }
1217
- strBuf.push(item.str);
1218
- lastX = x + item.width;
1219
- lastY = y;
1220
- lineHeight = Math.max(lineHeight, item.height);
1221
- if (item.hasEOL) {
1222
- strBuf.push(`
1223
- `);
1224
- }
1225
- if (item.hasEOL || item.str.endsWith(`
1226
- `)) {
1227
- lineHeight = 0;
1228
- }
1229
- }
1230
- if (params.itemJoiner) {
1231
- return strBuf.join(params.itemJoiner);
1232
- }
1233
- return strBuf.join("");
1234
- }
1235
- async getHyperlinks(page, viewport) {
1236
- const result = new Map;
1237
- const annotations = await page.getAnnotations({ intent: "display" }) || [];
1238
- for (const i of annotations) {
1239
- if (i.subtype !== "Link")
1240
- continue;
1241
- const url = i.url ?? i.unsafeUrl;
1242
- if (!url)
1243
- continue;
1244
- const text = i.overlaidText;
1245
- if (!text)
1246
- continue;
1247
- const rectVp = viewport.convertToViewportRectangle(i.rect);
1248
- const left = Math.min(rectVp[0], rectVp[2]) - 0.5;
1249
- const top = Math.min(rectVp[1], rectVp[3]) - 0.5;
1250
- const right = Math.max(rectVp[0], rectVp[2]) + 0.5;
1251
- const bottom = Math.max(rectVp[1], rectVp[3]) + 0.5;
1252
- const pos = { rect: { left, top, right, bottom }, url, text, used: false };
1253
- const el = result.get(text);
1254
- if (el) {
1255
- el.push(pos);
1256
- } else {
1257
- result.set(text, [pos]);
1258
- }
1259
- }
1260
- return result;
1261
- }
1262
- async getImage(params = {}) {
1263
- const doc = await this.load();
1264
- const result = new ImageResult(doc.numPages);
1265
- setDefaultParseParameters(params);
1266
- for (let i = 1;i <= result.total; i++) {
1267
- if (this.shouldParse(i, result.total, params)) {
1268
- const page = await doc.getPage(i);
1269
- const ops = await page.getOperatorList();
1270
- const pageImages = { pageNumber: i, images: [] };
1271
- result.pages.push(pageImages);
1272
- for (let j = 0;j < ops.fnArray.length; j++) {
1273
- if (ops.fnArray[j] === pdfjs2.OPS.paintInlineImageXObject || ops.fnArray[j] === pdfjs2.OPS.paintImageXObject) {
1274
- const name = ops.argsArray[j][0];
1275
- const isCommon = page.commonObjs.has(name);
1276
- const imgPromise = isCommon ? this.resolveEmbeddedImage(page.commonObjs, name) : this.resolveEmbeddedImage(page.objs, name);
1277
- const { width, height, kind, data } = await imgPromise;
1278
- if (params.imageThreshold) {
1279
- if (params.imageThreshold >= width || params.imageThreshold >= height) {
1280
- continue;
1281
- }
1282
- }
1283
- const canvasFactory = doc.canvasFactory;
1284
- const canvasAndContext = canvasFactory.create(width, height);
1285
- const context = canvasAndContext.context;
1286
- let imgData = null;
1287
- if (kind === pdfjs2.ImageKind.RGBA_32BPP) {
1288
- imgData = context.createImageData(width, height);
1289
- imgData.data.set(data);
1290
- } else {
1291
- imgData = context.createImageData(width, height);
1292
- this.convertToRGBA({
1293
- src: data,
1294
- dest: new Uint32Array(imgData.data.buffer),
1295
- width,
1296
- height,
1297
- kind
1298
- });
1299
- }
1300
- context.putImageData(imgData, 0, 0);
1301
- let buffer = new Uint8Array;
1302
- let dataUrl = "";
1303
- if (typeof canvasAndContext.canvas.toBuffer === "function") {
1304
- let nodeBuffer;
1305
- if (params.imageBuffer) {
1306
- nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
1307
- buffer = new Uint8Array(nodeBuffer);
1308
- }
1309
- if (params.imageDataUrl) {
1310
- if (nodeBuffer) {
1311
- dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
1312
- } else {
1313
- nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
1314
- buffer = new Uint8Array(nodeBuffer);
1315
- dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
1316
- }
1317
- }
1318
- } else {
1319
- if (params.imageBuffer) {
1320
- const imageData = canvasAndContext.context.getImageData(0, 0, canvasAndContext.canvas.width, canvasAndContext.canvas.height);
1321
- buffer = new Uint8Array(imageData.data);
1322
- }
1323
- if (params.imageDataUrl) {
1324
- dataUrl = canvasAndContext.canvas.toDataURL("image/png");
1325
- }
1326
- }
1327
- pageImages.images.push({
1328
- data: buffer,
1329
- dataUrl,
1330
- name,
1331
- height,
1332
- width,
1333
- kind
1334
- });
1335
- }
1336
- }
1337
- }
1338
- }
1339
- return result;
1340
- }
1341
- convertToRGBA({ src, dest, width, height, kind }) {
1342
- if (kind === pdfjs2.ImageKind.RGB_24BPP) {
1343
- for (let i = 0, j = 0;i < src.length; i += 3, j++) {
1344
- const r = src[i];
1345
- const g = src[i + 1];
1346
- const b = src[i + 2];
1347
- dest[j] = 255 << 24 | b << 16 | g << 8 | r;
1348
- }
1349
- } else if (kind === pdfjs2.ImageKind.GRAYSCALE_1BPP) {
1350
- let pixelIndex = 0;
1351
- for (let i = 0;i < src.length; i++) {
1352
- const byte = src[i];
1353
- for (let bit = 7;bit >= 0; bit--) {
1354
- if (pixelIndex >= width * height)
1355
- break;
1356
- const isWhite = (byte >> bit & 1) === 1;
1357
- const gray = isWhite ? 255 : 0;
1358
- dest[pixelIndex++] = 255 << 24 | gray << 16 | gray << 8 | gray;
1359
- }
1360
- }
1361
- } else if (kind === undefined || kind === null) {
1362
- const bytesPerPixel = src.length / (width * height);
1363
- if (Math.abs(bytesPerPixel - 3) < 0.1) {
1364
- for (let i = 0, j = 0;i < src.length; i += 3, j++) {
1365
- const r = src[i];
1366
- const g = src[i + 1];
1367
- const b = src[i + 2];
1368
- dest[j] = 255 << 24 | b << 16 | g << 8 | r;
1369
- }
1370
- } else if (Math.abs(bytesPerPixel - 4) < 0.1) {
1371
- for (let i = 0, j = 0;i < src.length; i += 4, j++) {
1372
- const r = src[i];
1373
- const g = src[i + 1];
1374
- const b = src[i + 2];
1375
- const a = src[i + 3];
1376
- dest[j] = a << 24 | b << 16 | g << 8 | r;
1377
- }
1378
- } else if (Math.abs(bytesPerPixel - 1) < 0.1) {
1379
- for (let i = 0;i < src.length; i++) {
1380
- const gray = src[i];
1381
- dest[i] = 255 << 24 | gray << 16 | gray << 8 | gray;
1382
- }
1383
- } else {
1384
- throw new Error(`convertToRGBA: Cannot infer image format. kind: ${kind}, bytesPerPixel: ${bytesPerPixel}, width: ${width}, height: ${height}, dataLength: ${src.length}`);
1385
- }
1386
- } else {
1387
- throw new Error(`convertToRGBA: Unsupported image kind: ${kind}. Available kinds: GRAYSCALE_1BPP=${pdfjs2.ImageKind.GRAYSCALE_1BPP}, RGB_24BPP=${pdfjs2.ImageKind.RGB_24BPP}, RGBA_32BPP=${pdfjs2.ImageKind.RGBA_32BPP}`);
1388
- }
1389
- }
1390
- resolveEmbeddedImage(pdfObjects, name) {
1391
- return new Promise((resolve, reject) => {
1392
- pdfObjects.get(name, (imgData) => {
1393
- if (imgData) {
1394
- let dataBuff;
1395
- if (imgData.data instanceof Uint8Array) {
1396
- dataBuff = imgData.data;
1397
- } else if (imgData.data instanceof Uint8ClampedArray) {
1398
- dataBuff = new Uint8Array(imgData.data);
1399
- } else if (imgData.data?.buffer) {
1400
- dataBuff = new Uint8Array(imgData.data.buffer);
1401
- } else if (imgData.bitmap) {
1402
- const canvasFactory = this.doc.canvasFactory;
1403
- const canvasAndContext = canvasFactory.create(imgData.bitmap.width, imgData.bitmap.height);
1404
- canvasAndContext.context.drawImage(imgData.bitmap, 0, 0);
1405
- const imageData = canvasAndContext.context.getImageData(0, 0, imgData.bitmap.width, imgData.bitmap.height);
1406
- dataBuff = new Uint8Array(imageData.data.buffer);
1407
- } else if (ArrayBuffer.isView(imgData.data)) {
1408
- dataBuff = new Uint8Array(imgData.data.buffer, imgData.data.byteOffset, imgData.data.byteLength);
1409
- }
1410
- if (!dataBuff) {
1411
- reject(new Error(`Image object ${name}: data field is empty or invalid. Available fields: ${Object.keys(imgData).join(", ")}`));
1412
- return;
1413
- }
1414
- if (dataBuff.length === 0) {
1415
- reject(new Error(`Image object ${name}: data buffer is empty (length: 0)`));
1416
- return;
1417
- }
1418
- resolve({ width: imgData.width, height: imgData.height, kind: imgData.kind, data: dataBuff });
1419
- } else {
1420
- reject(new Error(`Image object ${name} not found`));
1421
- }
1422
- });
1423
- });
1424
- }
1425
- async getScreenshot(parseParams = {}) {
1426
- const params = setDefaultParseParameters(parseParams);
1427
- const doc = await this.load();
1428
- const result = new ScreenshotResult(doc.numPages);
1429
- if (this.doc === undefined) {
1430
- throw new Error("PDF document not loaded");
1431
- }
1432
- for (let i = 1;i <= result.total; i++) {
1433
- if (this.shouldParse(i, result.total, params)) {
1434
- const page = await this.doc.getPage(i);
1435
- let viewport = page.getViewport({ scale: params.scale });
1436
- if (params.desiredWidth) {
1437
- viewport = page.getViewport({ scale: 1 });
1438
- const scale = params.desiredWidth / viewport.width;
1439
- viewport = page.getViewport({ scale });
1440
- }
1441
- const canvasFactory = this.doc.canvasFactory;
1442
- const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
1443
- const renderContext = {
1444
- canvasContext: canvasAndContext.context,
1445
- viewport,
1446
- canvas: canvasAndContext.canvas
1447
- };
1448
- const renderTask = page.render(renderContext);
1449
- await renderTask.promise;
1450
- let data = new Uint8Array;
1451
- let dataUrl = "";
1452
- if (typeof canvasAndContext.canvas.toBuffer === "function") {
1453
- let nodeBuffer;
1454
- if (params.imageBuffer) {
1455
- nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
1456
- data = new Uint8Array(nodeBuffer);
1457
- }
1458
- if (params.imageDataUrl) {
1459
- if (nodeBuffer) {
1460
- dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
1461
- } else {
1462
- nodeBuffer = canvasAndContext.canvas.toBuffer("image/png");
1463
- data = new Uint8Array(nodeBuffer);
1464
- dataUrl = `data:image/png;base64,${nodeBuffer.toString("base64")}`;
1465
- }
1466
- }
1467
- } else {
1468
- if (params.imageBuffer) {
1469
- const imageData = canvasAndContext.context.getImageData(0, 0, canvasAndContext.canvas.width, canvasAndContext.canvas.height);
1470
- data = new Uint8Array(imageData.data);
1471
- }
1472
- if (params.imageDataUrl) {
1473
- dataUrl = canvasAndContext.canvas.toDataURL("image/png");
1474
- }
1475
- }
1476
- result.pages.push({
1477
- data,
1478
- dataUrl,
1479
- pageNumber: i,
1480
- width: viewport.width,
1481
- height: viewport.height,
1482
- scale: viewport.scale
1483
- });
1484
- page.cleanup();
1485
- }
1486
- }
1487
- return result;
1488
- }
1489
- async getTable(params = {}) {
1490
- const doc = await this.load();
1491
- const result = new TableResult(doc.numPages);
1492
- if (this.doc === undefined) {
1493
- throw new Error("PDF document not loaded");
1494
- }
1495
- for (let i = 1;i <= result.total; i++) {
1496
- if (this.shouldParse(i, result.total, params)) {
1497
- const page = await this.doc.getPage(i);
1498
- const store = await this.getPageTables(page);
1499
- store.normalize();
1500
- const tableDataArr = store.getTableData();
1501
- await this.fillPageTables(page, tableDataArr);
1502
- const pageTableResult = { num: i, tables: [] };
1503
- for (const table of tableDataArr) {
1504
- pageTableResult.tables.push(table.toArray());
1505
- }
1506
- result.pages.push(pageTableResult);
1507
- page.cleanup();
1508
- }
1509
- }
1510
- return result;
1511
- }
1512
- getPathGeometry(mm) {
1513
- const width = mm[2] - mm[0];
1514
- const height = mm[3] - mm[1];
1515
- if (mm[0] === Infinity) {
1516
- return PathGeometry.undefined;
1517
- }
1518
- if (width > 5 && height > 5) {
1519
- return PathGeometry.rectangle;
1520
- } else if (width > 5 && height === 0) {
1521
- return PathGeometry.hline;
1522
- } else if (width === 0 && height > 5) {
1523
- return PathGeometry.vline;
1524
- }
1525
- return PathGeometry.undefined;
1526
- }
1527
- async getPageTables(page) {
1528
- const lineStore = new LineStore;
1529
- const viewport = page.getViewport({ scale: 1 });
1530
- let transformMatrix = [1, 0, 0, 1, 0, 0];
1531
- const transformStack = [];
1532
- const opList = await page.getOperatorList();
1533
- for (let i = 0;i < opList.fnArray.length; i++) {
1534
- const fn = opList.fnArray[i];
1535
- const args = opList.argsArray[i];
1536
- const op = args?.[0] ?? 0;
1537
- const mm = args?.[2] ?? [Infinity, Infinity, -Infinity, -Infinity];
1538
- if (fn === pdfjs2.OPS.constructPath) {
1539
- if (op === pdfjs2.OPS.fill) {}
1540
- if (op !== pdfjs2.OPS.stroke) {
1541
- continue;
1542
- }
1543
- const pg = this.getPathGeometry(mm);
1544
- if (pg === PathGeometry.rectangle) {
1545
- const rect = new Rectangle(new Point(mm[0], mm[1]), mm[2] - mm[0], mm[3] - mm[1]);
1546
- rect.transform(transformMatrix);
1547
- rect.transform(viewport.transform);
1548
- lineStore.addRectangle(rect);
1549
- } else if (pg === PathGeometry.hline || pg === PathGeometry.vline) {
1550
- const from = new Point(mm[0], mm[1]);
1551
- const to = new Point(mm[2], mm[3]);
1552
- const line = new Line(from, to);
1553
- line.transform(transformMatrix);
1554
- line.transform(viewport.transform);
1555
- lineStore.add(line);
1556
- } else {}
1557
- } else if (fn === pdfjs2.OPS.setLineWidth) {} else if (fn === pdfjs2.OPS.save) {
1558
- transformStack.push(transformMatrix);
1559
- } else if (fn === pdfjs2.OPS.restore) {
1560
- const restoredMatrix = transformStack.pop();
1561
- if (restoredMatrix) {
1562
- transformMatrix = restoredMatrix;
1563
- }
1564
- } else if (fn === pdfjs2.OPS.transform) {
1565
- transformMatrix = pdfjs2.Util.transform(transformMatrix, args);
1566
- }
1567
- }
1568
- return lineStore;
1569
- }
1570
- async fillPageTables(page, pageTables) {
1571
- const viewport = page.getViewport({ scale: 1 });
1572
- const textContent = await page.getTextContent({
1573
- includeMarkedContent: false,
1574
- disableNormalization: false
1575
- });
1576
- for (const textItem of textContent.items) {
1577
- if (!("str" in textItem))
1578
- continue;
1579
- const tx = pdfjs2.Util.transform(pdfjs2.Util.transform(viewport.transform, textItem.transform), [1, 0, 0, -1, 0, 0]);
1580
- for (const pageTable of pageTables) {
1581
- const cell = pageTable.findCell(tx[4], tx[5]);
1582
- if (cell) {
1583
- cell.text.push(textItem.str);
1584
- if (textItem.hasEOL) {
1585
- cell.text.push(`
1586
- `);
1587
- }
1588
- break;
1589
- }
1590
- }
1591
- }
1592
- }
1593
- }
1594
-
1595
- // node_modules/pdf-parse/dist/pdf-parse/esm/index.js
1596
- import { VerbosityLevel as VerbosityLevel2 } from "pdfjs-dist/legacy/build/pdf.mjs";
1597
-
1598
- // services/pdf.ts
1599
- var pdfParse = undefined || exports_esm;
1600
-
1601
9
  class PdfService extends Service {
1602
10
  static serviceType = ServiceType.PDF;
1603
11
  capabilityDescription = "The agent is able to convert PDF files to text";
1604
- static async start(_runtime) {
1605
- return new PdfService;
12
+ static async start(runtime) {
13
+ const service = new PdfService(runtime);
14
+ return service;
1606
15
  }
1607
16
  static async stop(runtime) {
1608
- const service = await runtime.getService(ServiceType.PDF);
17
+ const service = runtime.getService(ServiceType.PDF);
1609
18
  if (service) {
1610
- await service.stop?.();
19
+ await service.stop();
1611
20
  }
1612
21
  }
1613
22
  async stop() {}
1614
23
  async convertPdfToText(pdfBuffer) {
1615
24
  try {
1616
- const data = await pdfParse(pdfBuffer);
1617
- return this.cleanUpContent(data.text);
25
+ const uint8Array = new Uint8Array(pdfBuffer);
26
+ const pdf = await getDocument({ data: uint8Array }).promise;
27
+ const numPages = pdf.numPages;
28
+ const textPages = [];
29
+ for (let pageNum = 1;pageNum <= numPages; pageNum++) {
30
+ const page = await pdf.getPage(pageNum);
31
+ const textContent = await page.getTextContent();
32
+ const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
33
+ textPages.push(pageText);
34
+ }
35
+ const rawText = textPages.join(`
36
+ `);
37
+ return this.cleanUpContent(rawText);
1618
38
  } catch (error) {
1619
- logger.error(`PdfService: Failed to convert PDF to text - error: ${error}`);
39
+ logger.error(`PdfService: Failed to convert PDF to text - error: ${error}, bufferSize: ${pdfBuffer.length}`);
1620
40
  throw error;
1621
41
  }
1622
42
  }
1623
43
  async convertPdfToTextWithOptions(pdfBuffer, options = {}) {
1624
44
  try {
1625
- const parseOptions = {};
1626
- if (options.endPage) {
1627
- parseOptions.max = options.endPage;
1628
- }
1629
- const data = await pdfParse(pdfBuffer, parseOptions);
1630
- let text = data.text;
45
+ const uint8Array = new Uint8Array(pdfBuffer);
46
+ const pdf = await getDocument({ data: uint8Array }).promise;
47
+ const numPages = pdf.numPages;
48
+ const startPage = Math.max(1, options.startPage || 1);
49
+ const endPage = Math.min(numPages, options.endPage || numPages);
50
+ const textPages = [];
51
+ for (let pageNum = startPage;pageNum <= endPage; pageNum++) {
52
+ const page = await pdf.getPage(pageNum);
53
+ const textContent = await page.getTextContent();
54
+ const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(options.preserveWhitespace ? "" : " ");
55
+ textPages.push(pageText);
56
+ }
57
+ let text = textPages.join(`
58
+ `);
1631
59
  if (options.cleanContent !== false) {
1632
60
  text = this.cleanUpContent(text);
1633
61
  }
1634
62
  return {
1635
63
  success: true,
1636
64
  text,
1637
- pageCount: data.numpages
65
+ pageCount: numPages
1638
66
  };
1639
67
  } catch (error) {
1640
68
  return {
@@ -1644,30 +72,54 @@ class PdfService extends Service {
1644
72
  }
1645
73
  }
1646
74
  async getDocumentInfo(pdfBuffer) {
1647
- try {
1648
- const data = await pdfParse(pdfBuffer);
1649
- return {
1650
- pageCount: data.numpages,
1651
- metadata: {
1652
- title: data.info?.Title,
1653
- author: data.info?.Author,
1654
- creator: data.info?.Creator,
1655
- producer: data.info?.Producer
1656
- },
1657
- text: this.cleanUpContent(data.text),
1658
- pages: []
1659
- };
1660
- } catch (error) {
1661
- logger.error(`PdfService: Failed to get document info - error: ${error}`);
1662
- throw error;
1663
- }
75
+ const uint8Array = new Uint8Array(pdfBuffer);
76
+ const pdf = await getDocument({ data: uint8Array }).promise;
77
+ const numPages = pdf.numPages;
78
+ const metadataResult = await pdf.getMetadata();
79
+ const info = metadataResult.info;
80
+ const metadata = {
81
+ title: info.Title,
82
+ author: info.Author,
83
+ subject: info.Subject,
84
+ keywords: info.Keywords,
85
+ creator: info.Creator,
86
+ producer: info.Producer,
87
+ creationDate: info.CreationDate ? new Date(info.CreationDate) : undefined,
88
+ modificationDate: info.ModDate ? new Date(info.ModDate) : undefined
89
+ };
90
+ const pages = [];
91
+ const allText = [];
92
+ for (let pageNum = 1;pageNum <= numPages; pageNum++) {
93
+ const page = await pdf.getPage(pageNum);
94
+ const viewport = page.getViewport({ scale: 1 });
95
+ const textContent = await page.getTextContent();
96
+ const pageText = textContent.items.filter(isTextItem).map((item) => item.str).join(" ");
97
+ pages.push({
98
+ pageNumber: pageNum,
99
+ width: viewport.width,
100
+ height: viewport.height,
101
+ text: this.cleanUpContent(pageText)
102
+ });
103
+ allText.push(pageText);
104
+ }
105
+ return {
106
+ pageCount: numPages,
107
+ metadata,
108
+ text: this.cleanUpContent(allText.join(`
109
+ `)),
110
+ pages
111
+ };
1664
112
  }
1665
113
  cleanUpContent(content) {
1666
114
  try {
1667
- const cleaned = content.replace(/[^\S\r\n]+/g, " ").replace(/[ \t]+(\r?\n)/g, "$1").trim();
115
+ const filtered = content.split("").filter((char) => {
116
+ const charCode = char.charCodeAt(0);
117
+ return !(charCode === 0 || charCode >= 1 && charCode <= 8 || charCode >= 11 && charCode <= 12 || charCode >= 14 && charCode <= 31 || charCode === 127);
118
+ }).join("");
119
+ const cleaned = filtered.replace(/[^\S\r\n]+/g, " ").replace(/[ \t]+(\r?\n)/g, "$1").trim();
1668
120
  return cleaned;
1669
121
  } catch (error) {
1670
- logger.error(`PdfService: Failed to clean up content - error: ${error}`);
122
+ logger.error(`PdfService: Failed to clean up content - error: ${error}, contentLength: ${content.length}`);
1671
123
  return content;
1672
124
  }
1673
125
  }
@@ -1686,4 +138,4 @@ export {
1686
138
  PdfService
1687
139
  };
1688
140
 
1689
- //# debugId=DD875B9990A9EAE664756E2164756E21
141
+ //# debugId=D3E0640C90E88EE064756E2164756E21