@syncfusion/ej2-pdf-data-extract 30.1.42 → 30.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/dist/ej2-pdf-data-extract.umd.min.js +1 -2
  2. package/dist/global/ej2-pdf-data-extract.min.js +1 -2
  3. package/dist/global/index.d.ts +1 -2
  4. package/package.json +14 -46
  5. package/dist/ts/index.d.ts +0 -20
  6. package/dist/ts/index.ts +0 -20
  7. package/dist/ts/pdf-data-extract/core/content-parser-helper.d.ts +0 -62
  8. package/dist/ts/pdf-data-extract/core/content-parser-helper.ts +0 -640
  9. package/dist/ts/pdf-data-extract/core/enum.d.ts +0 -6
  10. package/dist/ts/pdf-data-extract/core/enum.ts +0 -6
  11. package/dist/ts/pdf-data-extract/core/graphic-state.d.ts +0 -33
  12. package/dist/ts/pdf-data-extract/core/graphic-state.ts +0 -106
  13. package/dist/ts/pdf-data-extract/core/pdf-data-extractor.d.ts +0 -210
  14. package/dist/ts/pdf-data-extract/core/pdf-data-extractor.ts +0 -977
  15. package/dist/ts/pdf-data-extract/core/pdf-text-parser.d.ts +0 -67
  16. package/dist/ts/pdf-data-extract/core/pdf-text-parser.ts +0 -495
  17. package/dist/ts/pdf-data-extract/core/redaction/index.d.ts +0 -4
  18. package/dist/ts/pdf-data-extract/core/redaction/index.ts +0 -4
  19. package/dist/ts/pdf-data-extract/core/redaction/pdf-redaction-processor.d.ts +0 -55
  20. package/dist/ts/pdf-data-extract/core/redaction/pdf-redaction-processor.ts +0 -592
  21. package/dist/ts/pdf-data-extract/core/redaction/pdf-redaction-region.d.ts +0 -281
  22. package/dist/ts/pdf-data-extract/core/redaction/pdf-redaction-region.ts +0 -342
  23. package/dist/ts/pdf-data-extract/core/redaction/pdf-redactor.d.ts +0 -129
  24. package/dist/ts/pdf-data-extract/core/redaction/pdf-redactor.ts +0 -322
  25. package/dist/ts/pdf-data-extract/core/redaction/text-glyph-mapper.d.ts +0 -12
  26. package/dist/ts/pdf-data-extract/core/redaction/text-glyph-mapper.ts +0 -153
  27. package/dist/ts/pdf-data-extract/core/text-extraction/binary-cmap-reader.d.ts +0 -24
  28. package/dist/ts/pdf-data-extract/core/text-extraction/binary-cmap-reader.ts +0 -281
  29. package/dist/ts/pdf-data-extract/core/text-extraction/cmap.d.ts +0 -50
  30. package/dist/ts/pdf-data-extract/core/text-extraction/cmap.ts +0 -565
  31. package/dist/ts/pdf-data-extract/core/text-extraction/compact-font-parser.d.ts +0 -191
  32. package/dist/ts/pdf-data-extract/core/text-extraction/compact-font-parser.ts +0 -1928
  33. package/dist/ts/pdf-data-extract/core/text-extraction/encoding-utils.d.ts +0 -102
  34. package/dist/ts/pdf-data-extract/core/text-extraction/encoding-utils.ts +0 -5780
  35. package/dist/ts/pdf-data-extract/core/text-extraction/font-structure.d.ts +0 -167
  36. package/dist/ts/pdf-data-extract/core/text-extraction/font-structure.ts +0 -1842
  37. package/dist/ts/pdf-data-extract/core/text-extraction/font-tables.d.ts +0 -5
  38. package/dist/ts/pdf-data-extract/core/text-extraction/font-tables.ts +0 -16
  39. package/dist/ts/pdf-data-extract/core/text-extraction/font-utils.d.ts +0 -18
  40. package/dist/ts/pdf-data-extract/core/text-extraction/font-utils.ts +0 -630
  41. package/dist/ts/pdf-data-extract/core/text-extraction/glyph.d.ts +0 -93
  42. package/dist/ts/pdf-data-extract/core/text-extraction/glyph.ts +0 -622
  43. package/dist/ts/pdf-data-extract/core/text-extraction/index.d.ts +0 -10
  44. package/dist/ts/pdf-data-extract/core/text-extraction/index.ts +0 -10
  45. package/dist/ts/pdf-data-extract/core/text-extraction/matrix-helper.d.ts +0 -38
  46. package/dist/ts/pdf-data-extract/core/text-extraction/matrix-helper.ts +0 -150
  47. package/dist/ts/pdf-data-extract/core/text-extraction/metrics.d.ts +0 -16
  48. package/dist/ts/pdf-data-extract/core/text-extraction/metrics.ts +0 -2938
  49. package/dist/ts/pdf-data-extract/core/text-structure.d.ts +0 -628
  50. package/dist/ts/pdf-data-extract/core/text-structure.ts +0 -668
  51. package/dist/ts/pdf-data-extract/core/utils.d.ts +0 -99
  52. package/dist/ts/pdf-data-extract/core/utils.ts +0 -626
  53. package/dist/ts/pdf-data-extract/index.d.ts +0 -20
  54. package/dist/ts/pdf-data-extract/index.ts +0 -20
@@ -1,977 +0,0 @@
1
- import { _MatrixHelper, _TransformationStack } from './text-extraction/matrix-helper';
2
- import { TextGlyph, TextLine, TextWord } from './text-structure';
3
- import { _PdfContentParserHelper } from './content-parser-helper';
4
- import { _GraphicState } from './graphic-state';
5
- import { _FontStructure } from './text-extraction';
6
- import { _TextProcessingMode } from './enum';
7
- import { _addFontResources, _getXObject, _getXObjectResources, _ignoreEscapeSequence, _isArrayEqual, _parseEncodedText } from './utils';
8
- import { _PdfCrossReference, _PdfDictionary, _PdfRecord, PdfDocument, PdfFontStyle, PdfPage, PdfPath, PdfRotationAngle } from '@syncfusion/ej2-pdf';
9
- import { _PdfTextParser } from './pdf-text-parser';
10
-
11
- /**
12
- * Represents a utility for extracting data from a PDF document.
13
- * ```typescript
14
- * // Load an existing PDF document
15
- * let document: PdfDocument = new PdfDocument(data, password);
16
- * // Initialize a new instance of the `PdfDataExtractor` class
17
- * let extractor: PdfDataExtractor = new PdfDataExtractor(document);
18
- * // Extract `TextLine` from the PDF document.
19
- * let textLines: Array<TextLine> = extractor.extractTextLines({ startPageIndex: 0, endPageIndex: document.pageCount-1});
20
- * // Save the document
21
- * document.save('output.pdf');
22
- * // Destroy the document
23
- * document.destroy();
24
- * ```
25
- */
26
- export class PdfDataExtractor {
27
- _document: PdfDocument;
28
- _rotation: number = 0;
29
- _textMatrix: _MatrixHelper;
30
- _textLeading: number = 0;
31
- _textColor: number[] = [0, 0, 0];
32
- _textLineMatrix: _MatrixHelper;
33
- _extractedText: string = '';
34
- _hasLeading: boolean;
35
- _hasNoSpacing: boolean;
36
- _textLines: TextLine[] = [];
37
- _transformations: _TransformationStack;
38
- _identityMatrix: number[] = [1, 0, 0, 1, 0, 0];
39
- _currentLocation: number[] = [0, 0]
40
- _currentFont: string;
41
- _tempBoundingRectangle: { x: number, y: number, width: number, height: number };
42
- _boundingRectangle: { x: number, y: number, width: number, height: number } = {x: 0, y: 0, width: 0 , height: 0};
43
- _previousRect: { x: number, y: number, width: number, height: number } = {x: 0, y: 0, width: 0 , height: 0};
44
- _fontSize: number;
45
- _textHorizontalScaling: number = 100;
46
- _previousTextMatrix: _MatrixHelper = new _MatrixHelper(0, 0, 0, 0, 0, 0);
47
- _previousFontSize: number;
48
- _previousExtractText: string;
49
- _arise: number = 0;
50
- _isTextMatrix: boolean;
51
- _currentTextMatrix: _MatrixHelper = new _MatrixHelper(0, 0, 0, 0, 0, 0);
52
- _text: string = '';
53
- _hasTj: boolean;
54
- _hasTm: boolean;
55
- _hasET: boolean;
56
- _characterSpacing: number = 0;
57
- _wordSpacing: number = 0;
58
- _hasBeginMarkedContent: boolean;
59
- _differenceX: number;
60
- _textScale: number = 1;
61
- _textRise: number = 0;
62
- _width: number = 0;
63
- _height: number = 0;
64
- _crossReference: _PdfCrossReference;
65
- _resultantText: string = '';
66
- _currentExtractedText: string;
67
- _initialTransForm: _MatrixHelper;
68
- _textGlyph: TextGlyph[] = [];
69
- _textWord: TextWord[] = [];
70
- _textLine: TextLine[] = []
71
- _textExtraction: string[] = [];
72
- _fontCollection: Map<string, _FontStructure> = new Map<string, _FontStructure>();
73
- _ctm: _MatrixHelper = new _MatrixHelper(1, 0, 0, 1, 0, 0);
74
- _objects: _MatrixHelper[] = [];
75
- _isLayout: boolean = false;
76
- _isRotatePage: boolean = false;
77
- _isExtractTextLines: boolean;
78
- _contentParser: _PdfContentParserHelper;
79
- _parser: _PdfTextParser = new _PdfTextParser();
80
- /**
81
- * Initialize a new instance of the `PdfDataExtractor` class
82
- *
83
- * @param {PdfDocument} document PDF document
84
- * ```typescript
85
- * // Load an existing PDF document
86
- * let document: PdfDocument = new PdfDocument(data1);
87
- * // Initialize a new instance of the `PdfDataExtractor` class
88
- * let extractor: PdfDataExtractor = new PdfDataExtractor(document);
89
- * // Extracts text from the PDF Page based on its layout
90
- * let text: string = extractor.extractText({isLayout: true});
91
- * // Save the output PDF
92
- * document.save('Output.pdf');
93
- * // Destroy the documents
94
- * document.destroy();
95
- * ```
96
- */
97
constructor(document: PdfDocument) {
    // The transformation stack starts with the default CTM created by the field initializer.
    const initialTransform: _MatrixHelper = this._ctm;
    this._document = document;
    // Cache the cross-reference exposed by the document.
    this._crossReference = this._document._crossReference;
    this._objects.push(initialTransform);
}
102
- /**
103
- * Extract text from the PDF document
104
- *
105
- * @returns {string} The extracted text
106
- *
107
- * ```typescript
108
- * // Load an existing PDF document
109
- * let document: PdfDocument = new PdfDocument(data1);
110
- * // Initialize a new instance of the `PdfDataExtractor` class
111
- * let extractor: PdfDataExtractor = new PdfDataExtractor(document);
112
- * // Extract text content from the PDF document.
113
- * let text: string = extractor.extractText();
114
- * // Save the output PDF
115
- * document.save('Output.pdf');
116
- * // Destroy the documents
117
- * document.destroy();
118
- * ```
119
- */
120
- extractText(): string
121
- /**
122
- * Extract text from the page ranges specified by start and end page number
123
- *
124
- * @param {object} options Options to specify the page range to be selected and to extract the text.
125
- * @returns {string} The extracted text
126
- *
127
- * ```typescript
128
- * // Load an existing PDF document
129
- * let document: PdfDocument = new PdfDocument(data1);
130
- * // Initialize a new instance of the `PdfDataExtractor` class
131
- * let extractor: PdfDataExtractor = new PdfDataExtractor(document);
132
- * // Extract text content from the PDF document.
133
- * let text: string = extractor.extractText({ startPageIndex: 0, endPageIndex: document.pageCount - 1 });
134
- * // Save the output PDF
135
- * document.save('Output.pdf');
136
- * // Destroy the documents
137
- * document.destroy();
138
- * ```
139
- */
140
- extractText(options: { isLayout?: boolean; startPageIndex?: number; endPageIndex?: number }): string
141
/**
 * Extract text from the PDF document, optionally restricted to a page range
 * and optionally using the layout-based renderer.
 *
 * @param {object} options Optional settings: `isLayout` switches to layout-based extraction,
 * `startPageIndex` / `endPageIndex` restrict the processed page range. When omitted, every page
 * from index 0 to `pageCount - 1` is processed.
 * @returns {string} The extracted text
 */
extractText(options?: { isLayout?: boolean; startPageIndex?: number; endPageIndex?: number }): string {
    let startIndex: number = 0;
    let endIndex: number = this._document.pageCount - 1;
    this._resultantText = '';
    // Create the parser for every call, not only when `options` is supplied: the zero-argument
    // overload would otherwise run with an undefined parser, or one left over from an earlier call.
    this._contentParser = new _PdfContentParserHelper(_TextProcessingMode.textExtraction);
    if (options) {
        if (options.isLayout) {
            this._isLayout = true;
        }
        // `typeof ... === 'number'` already excludes both null and undefined.
        if (typeof options.startPageIndex === 'number') {
            startIndex = options.startPageIndex;
        }
        if (typeof options.endPageIndex === 'number') {
            endIndex = options.endPageIndex;
        }
    }
    try {
        this._processPages(startIndex, endIndex);
        if (this._resultantText.length) {
            this._resultantText = _ignoreEscapeSequence(this._resultantText);
        }
    } finally {
        // Reset the layout flag even when page processing throws, so a failed layout
        // extraction cannot leak into the next call on the same extractor.
        this._isLayout = false;
    }
    return this._resultantText;
}
164
/**
 * Walks the content-stream records of a page and appends the text they show to
 * `_resultantText`, inserting line breaks and spaces derived from the text/line matrices.
 *
 * @param {_PdfRecord[]} recordCollection Parsed content-stream records (operator + operands).
 * @param {PdfPage} page Page being processed; its height seeds the initial transform.
 * @param {Map<string, _FontStructure>} fontCollection Fonts available to the page, keyed by resource name.
 * @param {Map<string, any>} xObjectCollection XObjects available to the page, forwarded to `_getXObject` for `Do`.
 * @returns {any} Nothing is returned; results are accumulated on the instance.
 */
_renderTextAsLayOut(recordCollection: _PdfRecord[], page: PdfPage, fontCollection: Map<string, _FontStructure>, xObjectCollection: Map<string, any>): any { //eslint-disable-line
    let currentYLocation: number;
    // Operands of the most recent BDC record; inspected by `Tj` when deciding whether to add a space.
    let hexElement: string[];
    let spaceBetweenWord: boolean = false;
    let currentCmY: number = 0;
    let prevCmY: number = 0;
    this._hasTm = false;
    this._hasET = false;
    this._hasBeginMarkedContent = false;
    this._hasTj = false;
    let textlineMatrix: _MatrixHelper = new _MatrixHelper(1, 0, 0, 1, 0, 0);
    let prevYLocation: number;
    this._initialTransForm = new _MatrixHelper(1.3333333333333333, 0, 0, -1.3333333333333333, 0, page.size[1] * 1.3333333333333333);
    // Vertical distance between two baselines expressed in "lines": divided by 10 for
    // font sizes >= 10, otherwise by the font size itself. Always non-negative (or NaN
    // while no previous baseline is known, which makes the `>= 1` checks below false).
    const computeLineGap: (currentY: number, previousY: number) => number = (currentY: number, previousY: number): number => {
        const divisor: number = this._fontSize >= 10 ? 10 : this._fontSize;
        return Math.abs(Math.round((currentY - previousY) / divisor));
    };
    recordCollection.forEach((record: _PdfRecord) => {
        const token: string = record._operator;
        const element: string[] = record._operands;
        switch (token) {
        case 'q':
            // Save graphics state: duplicate the current CTM on top of the stack.
            this._hasET = false;
            this._objects.unshift(this._objects[0]);
            this._ctm = this._objects[0];
            break;
        case 'Q':
            // Restore graphics state: drop the top CTM.
            this._objects.splice(0, 1);
            this._ctm = this._objects[0];
            break;
        case 'Tc':
            this._characterSpacing = Number(element[0]);
            break;
        case 'Tw':
            this._wordSpacing = Number(element[0]);
            break;
        case 'Tm':
            this._hasTm = true;
            this._textMatrix = new _MatrixHelper(Number(element[0]), Number(element[1]), Number(element[2]),
                                                 Number(element[3]), Number(element[4]), Number(element[5]));
            this._textLineMatrix = this._textMatrix;
            this._currentLocation = [0, 0];
            this._isTextMatrix = true;
            textlineMatrix = this._textLineMatrix;
            // NOTE(review): both matrices are the same object at this point, so this branch
            // looks unreachable; kept unchanged pending confirmation.
            if (this._textMatrix._offsetY === this._textLineMatrix._offsetY &&
                this._textMatrix._offsetX !== this._textLineMatrix._offsetX) {
                this._textLineMatrix = this._textMatrix;
            }
            if (this._textLineMatrix._offsetY !== this._currentTextMatrix._offsetY ||
                ((this._textLineMatrix._offsetX !== this._currentTextMatrix._offsetX) && this._hasBeginMarkedContent && !this._hasTj)) {
                this._tempBoundingRectangle = { x: 0, y: 0, width: 0, height: 0 };
                this._hasBeginMarkedContent = false;
            }
            break;
        case 'Tf':
            this._renderFont(element);
            break;
        case 'TL':
            // TL takes a single operand; read it explicitly instead of coercing the operand array.
            this._textLeading = -Number(element[0]);
            break;
        case 'T*':
            this._moveToNextLine(0, this._textLeading, textlineMatrix);
            textlineMatrix = this._textLineMatrix;
            break;
        case 'BT':
            this._textMatrix = new _MatrixHelper(1.0, 0.0, 0.0, 1.0, 0.0, 0.0);
            this._textLineMatrix = textlineMatrix = new _MatrixHelper(1.0, 0.0, 0.0, 1.0, 0.0, 0.0);
            break;
        case 'ET': {
            this._hasET = true;
            // `_tempBoundingRectangle` has no initializer, so a text object that ends before any
            // operator assigned it would otherwise throw here; fall back to an empty rectangle.
            const lastRectangle: { x: number, y: number, width: number, height: number } =
                this._tempBoundingRectangle || { x: 0, y: 0, width: 0, height: 0 };
            const endTextPosition: number = (this._textLineMatrix._offsetX - (lastRectangle.width + lastRectangle.x)) / 10;
            if (this._isLayout && this._hasLeading && endTextPosition === 0 && this._hasNoSpacing) {
                this._resultantText += String.fromCharCode(32);
                this._tempBoundingRectangle = { x: 0, y: 0, width: 0, height: 0 };
                this._hasLeading = false;
            }
            this._currentLocation = [];
            if (this._isTextMatrix) {
                this._isTextMatrix = false;
            }
            this._characterSpacing = 0;
            this._wordSpacing = 0;
            break;
        }
        case 're':
            break;
        case 'cm': {
            const a: number = parseFloat(element[0]);
            const b: number = parseFloat(element[1]);
            const c: number = parseFloat(element[2]);
            const d: number = parseFloat(element[3]);
            const e: number = parseFloat(element[4]);
            const f: number = parseFloat(element[5]);
            this._hasET = false;
            currentCmY = Number(element[5]);
            this._ctm = new _MatrixHelper(a, b, c, d, e, f)._multiply(this._objects[0]);
            this._objects[0] = this._ctm;
            // A vertical jump of the CTM after a Tm is treated as a new line.
            const locationY: number = (currentCmY - prevCmY) / 10;
            if ((currentCmY !== prevCmY) && this._hasTm && (locationY < 0 || locationY >= 1)) {
                this._resultantText += '\r\n';
                this._hasTm = false;
            }
            prevCmY = currentCmY;
            break;
        }
        case 'BDC':
            this._hasBeginMarkedContent = true;
            this._hasET = true;
            hexElement = element;
            break;
        case 'TD':
            // TD also sets the leading; `_setTextLeading` negates its argument again.
            this._setTextLeading(-Number(element[1]));
            this._moveToNextLine(Number(element[0]), Number(element[1]), textlineMatrix);
            textlineMatrix = this._textLineMatrix;
            if (this._textLineMatrix._offsetY !== this._currentTextMatrix._offsetY ||
                (this._hasBeginMarkedContent && this._textLineMatrix._offsetX !== this._currentTextMatrix._offsetX && !this._hasTj)) {
                this._tempBoundingRectangle = { x: 0, y: 0, width: 0, height: 0 };
                this._hasBeginMarkedContent = false;
            }
            break;
        case 'Td':
            this._moveToNextLine(Number(element[0]), Number(element[1]), textlineMatrix);
            textlineMatrix = this._textLineMatrix;
            if (this._textLineMatrix._offsetY !== this._currentTextMatrix._offsetY ||
                (this._hasBeginMarkedContent && this._textLineMatrix._offsetX !== this._currentTextMatrix._offsetX)) {
                this._tempBoundingRectangle = { x: 0, y: 0, width: 0, height: 0 };
                this._hasBeginMarkedContent = false;
            }
            if (Math.abs(this._textLineMatrix._offsetX - this._currentTextMatrix._offsetX) > 0 && !spaceBetweenWord && this._hasTj) {
                this._differenceX = Math.abs(this._textLineMatrix._offsetX - this._currentTextMatrix._offsetX);
                spaceBetweenWord = true;
            }
            break;
        case 'Tz':
            this._textHorizontalScaling = Number(element[0]);
            break;
        case "'": // eslint-disable-line
        {
            // Move to the next line, then show the string.
            this._moveToNextLine(0, this._textLeading, textlineMatrix);
            textlineMatrix = this._textLineMatrix;
            currentYLocation = this._textMatrix._offsetY;
            this._hasNoSpacing = false;
            const lineGap: number = computeLineGap(currentYLocation, prevYLocation);
            this._hasLeading = true;
            if (prevYLocation !== 0 && lineGap >= 1) {
                this._resultantText += '\r\n';
            }
            const currentXPosition: number = Math.floor(this._textLineMatrix._offsetX);
            const prevXPosition: number = Math.floor(this._currentTextMatrix._offsetX);
            if ((prevXPosition - currentXPosition) > 0) {
                this._hasNoSpacing = true;
            }
            const backUpMatrix: _MatrixHelper = this._textLineMatrix;
            if (this._isRotatePage) {
                this._buildTextContentStream(element, page, fontCollection);
            } else {
                this._currentExtractedText = this._renderTextElementFromTJ(element, page, fontCollection);
            }
            prevYLocation = currentYLocation;
            if (this._isLayout) {
                this._resultantText += this._currentExtractedText;
            } else {
                this._textLineMatrix = backUpMatrix;
            }
            this._currentTextMatrix = this._textLineMatrix;
            this._textMatrix = this._textLineMatrix;
            break;
        }
        case 'TJ': {
            currentYLocation = this._textMatrix._offsetY;
            const lineGap: number = computeLineGap(currentYLocation, prevYLocation);
            spaceBetweenWord = false;
            this._hasTj = true;
            if (prevYLocation !== 0 && lineGap >= 1) {
                this._resultantText += '\r\n';
            }
            if (this._isRotatePage) {
                this._buildTextContentStream(element, page, fontCollection);
            } else {
                this._currentExtractedText = this._renderTextElementFromTJ(element, page, fontCollection);
            }
            prevYLocation = currentYLocation;
            if (this._isLayout) {
                this._resultantText += this._currentExtractedText;
            }
            this._currentTextMatrix = this._textLineMatrix;
            this._text += this._currentExtractedText;
            if (this._isLayout && this._textLineMatrix._m11 !== -1 && this._textLineMatrix._m22 !== 1) {
                this._resultantText += ' ';
            }
            this._textMatrix = this._textLineMatrix;
            this._hasET = false;
            this._hasBeginMarkedContent = true;
            break;
        }
        case 'Tj': {
            currentYLocation = this._textMatrix._offsetY;
            const lineGap: number = computeLineGap(currentYLocation, prevYLocation);
            if (spaceBetweenWord) {
                if (typeof (hexElement) !== 'undefined' && hexElement.length > 1) {
                    // Note: this strips the leading '<' in place on the stored BDC operands.
                    hexElement[1] = hexElement[1].replace(/^</, '');
                    const hex: string = hexElement[1].replace(/>$/, '');
                    const hexChar: string = element[0].replace(/^\(|\)$/g, '');
                    if (hex !== '' && hex.indexOf('<') !== -1 && hexChar.length === 1 && /^[a-zA-Z]$/.test(hexChar)) {
                        this._hasET = false;
                    }
                }
                if (this._hasET) {
                    this._resultantText += ' ';
                }
                this._hasET = false;
                spaceBetweenWord = false;
            }
            this._hasTj = true;
            if (prevYLocation !== 0 && lineGap >= 1) {
                this._resultantText += '\r\n';
            }
            if (this._isRotatePage) {
                this._buildTextContentStream(element, page, fontCollection);
            } else {
                this._currentExtractedText = this._renderTextElementFromTJ(element, page, fontCollection);
            }
            this._currentTextMatrix = this._textLineMatrix;
            prevYLocation = currentYLocation;
            this._previousExtractText = this._currentExtractedText;
            // Undo a just-emitted line break when the vertical positions and font sizes of the
            // previous and current runs satisfy the overlap test below.
            if (this._previousTextMatrix._offsetY !== 0 && this._currentTextMatrix._offsetY !== 0 &&
                this._previousTextMatrix._offsetY + this._previousFontSize > this._currentTextMatrix._offsetY + this._fontSize &&
                this._previousTextMatrix._offsetY < this._currentTextMatrix._offsetY) {
                if (this._resultantText.length >= 2 && this._resultantText.slice(-2) === '\r\n') {
                    this._resultantText = this._resultantText.slice(0, -2);
                }
            }
            this._previousFontSize = this._fontSize;
            if (this._isLayout) {
                this._resultantText += this._currentExtractedText;
            }
            this._textMatrix = this._textLineMatrix;
            this._previousTextMatrix = this._textLineMatrix;
            break;
        }
        case 'Do':
            _getXObject(element, page, xObjectCollection, this);
            break;
        case 'RG':
        case 'rg':
            // RGB operators carry three components.
            this._textColor = [Number(element[0]), Number(element[1]), Number(element[2])];
            break;
        case 'g': {
            // Gray operator carries one component; replicate it across the three channels.
            const gray: number = Number(element[0]);
            this._textColor = [gray, gray, gray];
            break;
        }
        case 'k': {
            // CMYK operator carries four components; convert with the simple
            // (1 - component) * (1 - black) approximation.
            const cyan: number = Number(element[0]);
            const magenta: number = Number(element[1]);
            const yellow: number = Number(element[2]);
            const black: number = Number(element[3]);
            this._textColor = [(1 - cyan) * (1 - black), (1 - magenta) * (1 - black), (1 - yellow) * (1 - black)];
            break;
        }
        }
    });
}
473
/**
 * Reads the page's content records and dispatches them to the layout renderer or to the
 * content parser, storing the parser result as text lines or as plain text.
 *
 * @param {PdfPage} page Page whose content is rendered.
 * @param {Map<string, _FontStructure>} fontCollection Fonts available to the page.
 * @param {Map<string, any>} xObjectCollection XObjects available to the page.
 * @param {_GraphicState} graphicState Graphic state forwarded to the content parser.
 * @returns {any} Nothing is returned; results are stored on the instance.
 */
_renderText(page: PdfPage, fontCollection: Map<string, _FontStructure>, xObjectCollection: Map<string, any>, graphicState: _GraphicState): any { // eslint-disable-line
    const records: _PdfRecord[] = this._contentParser._getPageRecordCollection(page);
    if (this._isLayout) {
        this._renderTextAsLayOut(records, page, fontCollection, xObjectCollection);
        return;
    }
    // Decide the destination before invoking the parser, as the original did.
    const storeAsLines: boolean = this._isExtractTextLines;
    const parsed: any = this._contentParser._processRecordCollection(records, page, fontCollection, xObjectCollection, graphicState); // eslint-disable-line
    if (storeAsLines) {
        this._textLine = parsed;
    } else {
        this._resultantText = parsed;
    }
}
486
/**
 * Stores the negated value of the supplied leading.
 *
 * @param {number} textLeading Value whose negation becomes `_textLeading`.
 * @returns {void}
 */
_setTextLeading(textLeading: number): void {
    const negatedLeading: number = -textLeading;
    this._textLeading = negatedLeading;
}
489
/**
 * Applies a (tx, ty) translation to the given line matrix and makes the result both the
 * current text matrix and the current text line matrix.
 *
 * @param {number} tx Horizontal offset.
 * @param {number} ty Vertical offset.
 * @param {_MatrixHelper} textLineMatrix Matrix the translation is multiplied with.
 * @returns {void}
 */
_moveToNextLine(tx: number, ty: number, textLineMatrix: _MatrixHelper): void {
    const translation: _MatrixHelper = new _MatrixHelper(1, 0, 0, 1, tx, ty);
    const moved: _MatrixHelper = translation._multiply(textLineMatrix);
    this._textMatrix = moved;
    this._textLineMatrix = moved;
}
493
/**
 * Shifts the current text line matrix in place by a TJ adjustment value.
 *
 * @param {number} tj Adjustment in thousandths of a text-space unit.
 * @returns {_MatrixHelper} The (mutated) current text line matrix.
 */
_updateTextMatrix(tj: number): _MatrixHelper {
    const shift: number = - (tj * 0.001 * this._fontSize * this._textHorizontalScaling / 100);
    const lineMatrix: _MatrixHelper = this._textLineMatrix;
    const origin: number[] = lineMatrix._transform(0.0, 0.0);
    const shifted: number[] = lineMatrix._transform(shift, 0.0);
    // When the shift does not move the x coordinate, the y offset is updated instead.
    if (origin[0] === shifted[0]) {
        lineMatrix._offsetY = shifted[1];
    } else {
        lineMatrix._offsetX = shifted[0];
    }
    return lineMatrix;
}
504
/**
 * Advances the text line matrix horizontally past a rendered character.
 *
 * @param {string} char Character just rendered; word spacing is added when it contains a space.
 * @param {number} width Character width, multiplied by the current font size.
 * @returns {void}
 */
_updateTextLineMatrix(char: string, width: number): void {
    const extraWordSpacing: number = char.indexOf(' ') !== -1 ? this._wordSpacing : 0;
    const advance: number = (width * this._fontSize + this._characterSpacing + extraWordSpacing) * (this._textHorizontalScaling / 100);
    const translation: _MatrixHelper = new _MatrixHelper(1.0, 0.0, 0.0, 1.0, advance, 0.0);
    this._textLineMatrix = translation._multiply(this._textLineMatrix);
}
514
/**
 * Decodes the operand of a text-showing operator and returns its text, updating the
 * text matrices and the per-glyph bounding rectangle as each character is processed.
 *
 * @param {string[]} elements Operands of the text-showing operator; only `elements[0]` is decoded.
 * @param {PdfPage} page Page being processed (not used by this method).
 * @param {Map<string, _FontStructure>} fontCollection Fonts available to the page; looked up by `_currentFont`.
 * @returns {string} The decoded text, with spaces inserted for large positional gaps.
 */
_renderTextElementFromTJ(elements: string[], page: PdfPage, fontCollection: Map<string, _FontStructure>): string {
    let extractedText: string = '';
    const currentFont: _FontStructure = fontCollection.get(this._currentFont);
    this._textWord = [];
    let charWidth: number = 0;
    // Index of the width row matching the current non-empty text segment.
    let widthRow: number = 0;
    // Snapshot of the text matrix on entry; used for the Type3 height computation.
    const entryTextMatrix: _MatrixHelper = new _MatrixHelper(this._textMatrix._m11, this._textMatrix._m12, this._textMatrix._m21,
                                                             this._textMatrix._m22, this._textMatrix._offsetX, this._textMatrix._offsetY);
    const decodedList: [string[], number[][]] = _parseEncodedText(elements[0], currentFont);
    const textValues: string[] = decodedList[0];
    const widthTable: number[][] = decodedList[1];
    this._previousRect = { x: 0, y: 0, width: 0, height: 0 };
    for (let j: number = 0; j < textValues.length; j++) {
        const word: string = textValues[Number.parseInt(j.toString(), 10)];
        const tj: number = Number(word);
        if (tj) {
            // Non-zero numeric entry: a positional adjustment between text segments.
            this._textLineMatrix = this._updateTextMatrix(tj);
            if (Math.round(this._textLineMatrix._offsetX - this._textMatrix._offsetX) > 1 && !this._hasBeginMarkedContent) {
                extractedText += String.fromCharCode(32);
            }
        } else {
            // Text entry: the last character of each decoded entry is dropped before use.
            let text: string = word.slice(0, -1);
            text = _ignoreEscapeSequence(text);
            for (let i: number = 0; i < text.length; i++) {
                const ch: string = text[Number.parseInt(i.toString(), 10)];
                this._textMatrix = this._getTextRenderingMatrix();
                let identity: _MatrixHelper = new _MatrixHelper(1, 0, 0, 1, 0, 0);
                identity = identity._scale(0.01, 0.01, 0.0, 0.0);
                identity = identity._translate(0.0, 1.0);
                this._transformations = new _TransformationStack(this._initialTransForm);
                this._transformations._pushTransform(identity._multiply(this._textMatrix));
                const matrix: _MatrixHelper = new _MatrixHelper(1, 0, 0, 1, 0, 0)._clone()
                    ._multiply(this._transformations._CurrentTransform);
                // Effective font size taken from the rendering matrix (rotated text uses |m12|).
                let tempFontSize: number = 0;
                if (this._textMatrix._m11 > 0) {
                    tempFontSize = this._textMatrix._m11;
                } else if (this._textMatrix._m12 !== 0 && this._textMatrix._m21 !== 0) {
                    tempFontSize = this._textMatrix._m12 < 0 ? -this._textMatrix._m12 : this._textMatrix._m12;
                }
                let height: number = 0;
                if (currentFont._isType3Font) {
                    height = this._getTextHeight(currentFont, entryTextMatrix);
                }
                const width: number = widthTable[Number.parseInt(widthRow.toString(), 10)][Number.parseInt(i.toString(), 10)];
                // Glyph-space scale: the font matrix when present, otherwise 1/1000.
                const scale: number = currentFont._fontMatrix ? currentFont._fontMatrix[0] : 0.001;
                // NOTE(review): 1.3333… mirrors the factor used to build `_initialTransForm`;
                // presumably a points-to-pixels conversion that is undone here - confirm.
                this._boundingRectangle.x = (matrix._offsetX / 1.3333333333333333) / 1.0;
                if (this._isLayout) {
                    charWidth = this._parser._getCharacterWidth((scale * width), currentFont);
                } else {
                    charWidth = scale * width;
                }
                this._boundingRectangle.width = charWidth * tempFontSize;
                if (!currentFont._isType3Font) {
                    this._boundingRectangle.y = ((matrix._offsetY / 1.3333333333333333) - ((tempFontSize * 1.0) /
                        1.3333333333333333)) / 1.0;
                    this._boundingRectangle.height = tempFontSize;
                } else {
                    this._boundingRectangle.y = ((matrix._offsetY / 1.3333333333333333) - ((height * 1.0) / 1.3333333333333333))
                        / 1.0;
                    this._boundingRectangle.height = height;
                }
                // Check the previous rectangle exists BEFORE reading it; the original read
                // `.x`/`.width` first and threw when the field had never been assigned.
                const previousRectangle: { x: number, y: number, width: number, height: number } = this._tempBoundingRectangle;
                if (previousRectangle) {
                    const right: number = previousRectangle.x + previousRectangle.width;
                    const boundDifference: number = Math.round((this._boundingRectangle.x - right) / 10);
                    if ((right !== 0 && this._boundingRectangle.x !== 0) && (boundDifference > 1)) {
                        extractedText += String.fromCharCode(32);
                    }
                }
                extractedText += ch;
                this._updateTextLineMatrix(ch, charWidth);
                this._transformations._popTransform();
                // NOTE(review): this stores a reference, not a copy, so from the next glyph on
                // `_tempBoundingRectangle` and `_boundingRectangle` are the same object and the
                // gap test above compares the current glyph with itself. Left unchanged because
                // copying would alter spacing output - confirm intended behaviour.
                this._tempBoundingRectangle = this._boundingRectangle;
                this._textMatrix = new _MatrixHelper(this._textLineMatrix._m11, this._textLineMatrix._m12, this._textLineMatrix._m21,
                                                     this._textLineMatrix._m22, this._textLineMatrix._offsetX,
                                                     this._textLineMatrix._offsetY);
            }
            if (text.length > 0) {
                widthRow++;
            }
        }
    }
    return extractedText;
}
612
/**
 * Computes a text height by combining the text-state matrix, the supplied text matrix and
 * the current CTM, then taking the length of the resulting vertical axis vector.
 *
 * @param {_FontStructure} font Font whose font matrix and bounding box may scale the height.
 * @param {_MatrixHelper} textMatrix Text matrix to combine with the text state.
 * @returns {number} The computed height.
 */
_getTextHeight(font: _FontStructure, textMatrix: _MatrixHelper): number {
    const textState: number[] = [this._fontSize * this._textHorizontalScaling / 100, 0, 0, this._fontSize, 0, this._arise];
    // For font sizes <= 1 with a non-default font matrix, scale by the glyph box height.
    if (this._fontSize <= 1) {
        if (!_isArrayEqual(font._fontMatrix, [0.001, 0, 0, 0.001, 0, 0])) {
            const boxHeight: number = font._boundingBox[3] - font._boundingBox[1];
            if (boxHeight > 0) {
                textState[3] = textState[3] * (boxHeight * font._fontMatrix[3]);
            }
        }
    }
    const textMatrixValues: number[] = [
        textMatrix._m11, textMatrix._m12, textMatrix._m21,
        textMatrix._m22, textMatrix._offsetX, textMatrix._offsetY
    ];
    const ctmValues: number[] = [
        this._ctm._m11, this._ctm._m12, this._ctm._m21,
        this._ctm._m22, this._ctm._offsetX, this._ctm._offsetY
    ];
    const textSpace: number[] = this._transform(textMatrixValues, textState);
    const deviceSpace: number[] = this._transform(ctmValues, textSpace);
    return Math.hypot(deviceSpace[2], deviceSpace[3]);
}
629
/**
 * Combines two six-element affine matrices given as `[a, b, c, d, e, f]` arrays.
 *
 * @param {number[]} m1 First matrix.
 * @param {number[]} m2 Second matrix.
 * @returns {number[]} The six-element product.
 */
_transform(m1: number[], m2: number[]): number[] {
    const [a1, b1, c1, d1, e1, f1] = m1;
    const [a2, b2, c2, d2, e2, f2] = m2;
    return [
        a1 * a2 + c1 * b2,
        b1 * a2 + d1 * b2,
        a1 * c2 + c1 * d2,
        b1 * c2 + d1 * d2,
        a1 * e2 + c1 * f2 + e1,
        b1 * e2 + d1 * f2 + f1
    ];
}
633
/**
 * Decodes the first operand of a text-showing operator, measures it glyph by glyph and
 * appends the result to the extractor state as one `TextLine`.
 *
 * Side effects: replaces `_textWord`, resets `_previousRect`, `_width`, `_textGlyph` and
 * `_extractedText`, and pushes onto `_textLine` and `_textExtraction`.
 *
 * @param {string[]} elements Operands of the operator; only `elements[0]` is read.
 * @param {PdfPage} page The page being processed.
 * @param {Map<string, _FontStructure>} fontCollection Fonts of the page keyed by resource name.
 * @returns {void} Nothing.
 */
_buildTextContentStream(elements: string[], page: PdfPage, fontCollection: Map<string, _FontStructure>): void {
    const font: _FontStructure = fontCollection.get(this._currentFont);
    this._textWord = [];
    let pendingWord: string = '';
    const decoded: [string[], number[][]] = _parseEncodedText(elements[0], font);
    const tokens: string[] = decoded[0];
    let joinWithHeld: boolean = false;
    let runText: string = '';
    let heldText: string = '';
    this._previousRect = {x: 0, y: 0, width: 0, height: 0};
    for (const token of tokens) {
        const numeric: number = Number(token);
        if (numeric) {
            // Non-zero number: measure the current run, scaling the adjustment by -0.001.
            pendingWord = this._getTextWidth(runText, numeric * -0.001, font, page, pendingWord);
        } else if (numeric === 0) {
            // Zero adjustment: remember the current run so the next string is appended to it.
            joinWithHeld = true;
            heldText = runText;
        } else {
            // Not a number (NaN): a text token whose last character is dropped before use.
            runText = token.slice(0, -1);
            if (joinWithHeld) {
                runText = heldText + runText;
                joinWithHeld = false;
            }
        }
    }
    // NOTE(review): `runText` is not cleared after being measured inside the loop, so when the
    // operand ends with a number the last run appears to be measured a second time here - confirm
    // against the token layout produced by `_parseEncodedText`.
    if (runText.length > 0) {
        pendingWord = this._getTextWidth(runText, 0, font, page, pendingWord);
    }
    if (pendingWord !== '') {
        const trailingWord: TextWord = new TextWord();
        const anchor: TextGlyph = this._textGlyph[0];
        trailingWord._text = pendingWord;
        trailingWord._glyphs = this._textGlyph;
        trailingWord._bounds = this._isRotatePage
            ? [anchor._bounds[0], anchor._bounds[1], anchor._bounds[2], this._height]
            : [anchor._bounds[0], anchor._bounds[1], this._width, anchor._bounds[2]];
        trailingWord._fontName = font._name;
        trailingWord._fontStyle = font._fontStyle;
        trailingWord._fontSize = this._fontSize;
        this._textWord.push(trailingWord);
        this._height = 0;
    }
    this._width = 0;
    this._textGlyph = [];
    const line: TextLine = new TextLine();
    line._text = this._extractedText;
    line._wordCollection = this._textWord;
    line._fontName = font._name;
    line._fontStyle = font._fontStyle;
    line._fontSize = this._fontSize;
    line._pageIndex = page._pageIndex;
    // The line bounds are the bounds of a path holding every word rectangle.
    const outline: PdfPath = new PdfPath();
    this._textWord.forEach((entry: TextWord) => {
        outline.addRectangle(entry._bounds[0], entry._bounds[1], entry._bounds[2], entry._bounds[3]);
    });
    line._bounds = outline._getBounds();
    this._textLine.push(line);
    this._textExtraction.push(this._extractedText);
    this._extractedText = '';
}
703
/**
 * Walks the glyphs of `text`, updating the bounding rectangle and the text matrix for each
 * glyph and feeding every glyph to `_splitWords`.
 *
 * @param {string} text The decoded text run to measure.
 * @param {number} extraSpacing Additional spacing added to the character spacing of the glyph
 * whose position equals `text.length - 1`.
 * @param {_FontStructure} currentFont The font used to map characters to glyphs.
 * @param {PdfPage} page The page being processed; forwarded to `_splitWords`.
 * @param {string} tempString The word text accumulated so far.
 * @returns {string} The accumulated word text after all glyphs were processed.
 */
_getTextWidth(text: string, extraSpacing: number, currentFont: _FontStructure, page: PdfPage, tempString: string): string {
    // Fall back to 0.001 when the font carries no font matrix.
    const scale: number = (currentFont._fontMatrix ? currentFont._fontMatrix[0] : 0.001) * this._fontSize;
    const glyphs: any = currentFont._charsToGlyphs(text); // eslint-disable-line
    for (let index: number = 0; index < glyphs.length; index++) {
        const entry: any = glyphs[index]; // eslint-disable-line
        const glyph: string = entry._unicode;
        const isBlank: boolean = glyph === ' ';
        let spacing: number = this._characterSpacing + (index + 1 === text.length ? extraSpacing : 0);
        // Size factor taken from the text matrix: `_m11` when positive, otherwise the
        // magnitude of `_m12` when both off-diagonal entries are non-zero, otherwise 0.
        let sizeFactor: number = 0;
        if (this._textMatrix._m11 > 0) {
            sizeFactor = this._textMatrix._m11;
        } else if (this._textMatrix._m12 !== 0 && this._textMatrix._m21 !== 0) {
            sizeFactor = this._textMatrix._m12 < 0 ? -this._textMatrix._m12 : this._textMatrix._m12;
        }
        const advance: number = entry._width;
        // Computed before the size factor is raised to the font size below.
        const glyphExtent: number = scale * advance * sizeFactor;
        if (sizeFactor < this._fontSize) {
            sizeFactor = this._fontSize;
        }
        const scaledAdvance: number = scale * advance * (this._textHorizontalScaling / 100);
        // The rectangle origin is read before the matrix is advanced for this glyph.
        this._boundingRectangle.x = this._textMatrix._offsetY;
        this._boundingRectangle.y = this._textMatrix._offsetX;
        let alongExtent: number = glyphExtent;
        if (isBlank) {
            alongExtent = glyphExtent + this._wordSpacing;
            spacing += scaledAdvance + this._wordSpacing;
            this._textMatrix = this._parser._translateTextMatrix(spacing * (this._textHorizontalScaling / 100), 0,
                                                                 this._textMatrix);
        }
        // Read from the (possibly just replaced) text matrix.
        const acrossExtent: number = this._textMatrix._m11 > 0 ? sizeFactor : -(sizeFactor);
        if (this._textMatrix._m11 <= 0 && this._textMatrix._m22 <= 0) {
            this._boundingRectangle.width = alongExtent;
            this._boundingRectangle.height = acrossExtent;
        } else {
            this._boundingRectangle.width = acrossExtent;
            this._boundingRectangle.height = alongExtent;
        }
        if (!isBlank) {
            this._textMatrix = this._parser._translateTextMatrix(scaledAdvance, 0, this._textMatrix);
        }
        this._extractedText += glyph;
        tempString = this._splitWords(glyph, tempString, currentFont._name, currentFont._fontStyle, page);
        // `_splitWords` may have set `_previousRect` to null; in that case start from an empty rectangle.
        this._previousRect = this._previousRect
            ? {x: this._boundingRectangle.x, y: this._boundingRectangle.y,
                width: this._boundingRectangle.width, height: this._boundingRectangle.height}
            : {x: 0, y: 0, width: 0, height: 0};
        // Blanks already advanced the matrix by their spacing above.
        if (!isBlank && spacing) {
            this._textMatrix = this._parser._translateTextMatrix(spacing * (this._textHorizontalScaling / 100), 0,
                                                                 this._textMatrix);
        }
    }
    return tempString;
}
775
/**
 * Adds one glyph to the word currently being assembled and closes the word when the glyph
 * is whitespace or when the gap to the previous glyph exceeds a spacing threshold.
 *
 * The glyph rectangle is taken from `_boundingRectangle`. Closed words are pushed onto
 * `_textWord`; `_textGlyph`, `_width`, `_height` and `_previousRect` are updated in place.
 *
 * @param {string} glyph The glyph text to add.
 * @param {string} tempString The text of the word assembled so far.
 * @param {string} fontName Font name stored on created glyphs and words.
 * @param {PdfFontStyle} fontStyle Font style stored on created glyphs and words.
 * @param {PdfPage} page The page being processed; its rotation selects the gap direction.
 * @param {number} rotation Optional text rotation; the value 90 is treated like a 270 degree page
 * when the gap is measured and selects height accumulation.
 * @param {number[]} textColor Optional colour stored on created glyphs.
 * @returns {string} The text of the word still being assembled (empty after a word was closed).
 */
_splitWords(glyph: string, tempString: string, fontName: string, fontStyle: PdfFontStyle, page: PdfPage,
            rotation?: number, textColor?: number[]): string {
    const isSpace: boolean = /\s/.test(glyph);
    const currentRect: any = this._boundingRectangle; //eslint-disable-line
    // Builds the glyph record for the current rectangle. The rotation flag now follows the
    // page rotation for every glyph; previously non-space glyphs were always forced to `false`
    // right after the flag had been computed, which left the computation dead.
    const createGlyph: () => TextGlyph = (): TextGlyph => {
        const textGlyph: TextGlyph = new TextGlyph();
        textGlyph._text = glyph;
        textGlyph._bounds = [currentRect.x, currentRect.y, currentRect.width, currentRect.height];
        textGlyph._fontName = fontName;
        textGlyph._fontStyle = fontStyle;
        textGlyph._fontSize = this._fontSize;
        textGlyph._color = textColor;
        textGlyph._isRotated = page.rotation !== PdfRotationAngle.angle0;
        return textGlyph;
    };
    // Closes a word: its bounds are the bounds of a path holding every glyph rectangle.
    const addTextWord: (text: string, glyphs: TextGlyph[]) => void = (text: string, glyphs: TextGlyph[]): void => {
        const textWord: TextWord = new TextWord();
        textWord._text = text;
        textWord._glyphs = glyphs;
        const pdfPath: PdfPath = new PdfPath();
        for (let i: number = 0; i < glyphs.length; i++) {
            const bounds: number[] = glyphs[i]._bounds;
            pdfPath.addRectangle(bounds[0], bounds[1], bounds[2], bounds[3]);
        }
        textWord._bounds = pdfPath._getBounds();
        textWord._fontName = fontName;
        textWord._fontStyle = fontStyle;
        textWord._fontSize = this._fontSize;
        this._textWord.push(textWord);
    };
    if (isSpace) {
        // Whitespace ends the pending word and is then recorded as a word of its own.
        if (tempString) {
            addTextWord(tempString, this._textGlyph);
            this._textGlyph = [];
            tempString = '';
        }
        this._textGlyph.push(createGlyph());
        addTextWord(glyph, this._textGlyph);
        this._width = 0;
        this._height = 0;
        this._textGlyph = [];
        // Null tells the caller that no previous glyph rectangle is available.
        this._previousRect = null;
    } else if (this._previousRect !== null && this._previousRect.width > 0) {
        // Threshold is 7% of the rectangle height, but never below 2.
        let spacingFactor: number = currentRect.height * 0.07;
        if (spacingFactor < 2) {
            spacingFactor = 2;
        }
        let difference: number;
        if (page.rotation === PdfRotationAngle.angle90) {
            difference = this._previousRect.y + this._previousRect.height - currentRect.y;
        } else if (page.rotation === PdfRotationAngle.angle270 || rotation === 90) {
            difference = currentRect.y + currentRect.height - this._previousRect.y;
        } else if (page.rotation === PdfRotationAngle.angle180) {
            difference = currentRect.x + currentRect.width - this._previousRect.x;
        } else {
            difference = this._previousRect.x + this._previousRect.width - currentRect.x;
        }
        // A positive difference gets a slightly larger minimum threshold.
        if (difference > 0 && spacingFactor === 2) {
            spacingFactor = 2.5;
        }
        if (Math.abs(difference) > spacingFactor) {
            addTextWord(tempString, this._textGlyph);
            this._width = 0;
            this._height = 0;
            this._textGlyph = [];
            tempString = '';
            this._previousRect = {x: 0, y: 0, width: 0, height: 0};
        }
    }
    if (!isSpace) {
        this._textGlyph.push(createGlyph());
        if (page.rotation === PdfRotationAngle.angle90 || page.rotation === PdfRotationAngle.angle270 || rotation === 90) {
            this._height += currentRect.height;
        } else {
            this._width += currentRect.width;
        }
        tempString += glyph;
    }
    return tempString;
}
890
/**
 * Builds the text rendering matrix from the font size, the rise, the text line matrix and
 * the current transformation matrix.
 *
 * @returns {_MatrixHelper} The font matrix multiplied by `_textLineMatrix` and then by `_ctm`.
 */
_getTextRenderingMatrix(): _MatrixHelper {
    // Vertical scale is negated and the origin is shifted by font size plus rise.
    const fontMatrix: _MatrixHelper = new _MatrixHelper(this._fontSize, 0, 0, -this._fontSize, 0, this._fontSize + this._arise);
    return fontMatrix._multiply(this._textLineMatrix)._multiply(this._ctm);
}
896
/**
 * Reads the font resource name and the font size from the operands of a font operator.
 *
 * The first operand containing `/` becomes `_currentFont` (with its first `/` removed);
 * the operand that follows it becomes `_fontSize`. When no operand contains `/`,
 * `_currentFont` is left untouched and `_fontSize` becomes `NaN`.
 *
 * @param {string[]} fontElements The operands of the font operator.
 * @returns {void} Nothing.
 */
_renderFont(fontElements: string[]): void {
    let nameIndex: number = 0;
    while (nameIndex < fontElements.length && fontElements[nameIndex].indexOf('/') === -1) {
        nameIndex++;
    }
    if (nameIndex < fontElements.length) {
        this._currentFont = fontElements[nameIndex].replace('/', '');
    }
    this._fontSize = Number(fontElements[nameIndex + 1]);
}
906
/**
 * Extract `TextLine` collection from the PDF document.
 *
 * @returns {TextLine[]} The extracted textLines
 *
 * ```typescript
 * // Load an existing PDF document
 * let document: PdfDocument = new PdfDocument(data1);
 * // Initialize a new instance of the `PdfDataExtractor` class
 * let extractor: PdfDataExtractor = new PdfDataExtractor(document);
 * // Extract `TextLine` from the PDF document.
 * let textCollection: TextLine[] = extractor.extractTextLines();
 * // Save the output PDF
 * document.save('Output.pdf');
 * // Destroy the document
 * document.destroy();
 * ```
 */
extractTextLines(): TextLine[];
/**
 * Extract `TextLine` from the PDF document.
 *
 * @param {object} options The options to specify the page range to be selected.
 * @returns {TextLine[]} The extracted textLines
 *
 * ```typescript
 * // Load an existing PDF document
 * let document: PdfDocument = new PdfDocument(data1);
 * // Initialize a new instance of the `PdfDataExtractor` class
 * let extractor: PdfDataExtractor = new PdfDataExtractor(document);
 * // Extract `TextLine` from the PDF document.
 * let textCollection: TextLine[] = extractor.extractTextLines({ startPageIndex: 0, endPageIndex: document.pageCount - 1});
 * // Save the output PDF
 * document.save('Output.pdf');
 * // Destroy the document
 * document.destroy();
 * ```
 */
extractTextLines(options: { startPageIndex?: number, endPageIndex?: number }): TextLine[];
extractTextLines(options?: { startPageIndex?: number, endPageIndex?: number }): TextLine[] {
    this._isExtractTextLines = true;
    try {
        this._contentParser = new _PdfContentParserHelper(_TextProcessingMode.textLineExtraction);
        // Default range is the whole document; only numeric option values override it.
        let startIndex: number = 0;
        if (options && typeof(options.startPageIndex) === 'number') {
            startIndex = options.startPageIndex;
        }
        let endIndex: number = this._document.pageCount - 1;
        if (options && typeof(options.endPageIndex) === 'number') {
            endIndex = options.endPageIndex;
        }
        this._textLine = [];
        this._processPages(startIndex, endIndex);
        return this._textLine;
    } finally {
        // Always leave text-line mode, even when a page fails to process, so that a later
        // extraction call on this instance does not start with a stale flag.
        this._isExtractTextLines = false;
    }
}
961
/**
 * Renders the text of every page in the inclusive range `startIndex..endIndex`.
 *
 * Pages without a `Resources` entry are skipped. While a rotated page is processed outside
 * layout mode, `_isRotatePage` is set; it is cleared again after each page.
 *
 * @param {number} startIndex Index of the first page to process.
 * @param {number} endIndex Index of the last page to process.
 * @returns {void} Nothing.
 */
_processPages(startIndex: number, endIndex: number): void {
    for (let pageIndex: number = startIndex; pageIndex <= endIndex; pageIndex++) {
        const page: PdfPage = this._document.getPage(pageIndex);
        if (page.rotation !== PdfRotationAngle.angle0 && !this._isLayout) {
            this._isRotatePage = true;
        }
        try {
            const graphicState: _GraphicState = new _GraphicState();
            const resource: _PdfDictionary = page._pageDictionary.get('Resources');
            if (resource !== null && typeof(resource) !== 'undefined') {
                const fontCollection: Map<string, _FontStructure> = _addFontResources(resource, this._crossReference);
                const xObjectCollection: Map<string, _FontStructure> = _getXObjectResources(resource, this._crossReference);
                this._renderText(page, fontCollection, xObjectCollection, graphicState);
            }
        } finally {
            // Clear the flag even when rendering throws; otherwise the next extraction call
            // would treat its first unrotated page as rotated.
            this._isRotatePage = false;
        }
    }
}
977
- }