@pdftron/pdfnet-node-samples 10.9.0 → 10.10.0-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. package/{samples/AddImageTest → AddImageTest}/AddImageTest.js +115 -115
  2. package/{samples/AdvancedImagingTest → AdvancedImagingTest}/AdvancedImagingTest.js +78 -78
  3. package/{samples/AnnotationTest → AnnotationTest}/AnnotationTest.js +641 -641
  4. package/{samples/BookmarkTest → BookmarkTest}/BookmarkTest.js +219 -219
  5. package/{samples/CAD2PDFTest → CAD2PDFTest}/CAD2PDFTest.js +79 -79
  6. package/{samples/ContentReplacerTest → ContentReplacerTest}/ContentReplacerTest.js +75 -75
  7. package/{samples/ConvertPrintTest → ConvertPrintTest}/ConvertPrintTest.js +153 -153
  8. package/{samples/ConvertTest → ConvertTest}/ConvertTest.js +203 -203
  9. package/{samples/DataExtractionTest → DataExtractionTest}/DataExtractionTest.js +214 -214
  10. package/{samples/DigitalSignaturesTest → DigitalSignaturesTest}/DigitalSignaturesTest.js +622 -527
  11. package/{samples/DocumentCreationTest → DocumentCreationTest}/DocumentCreationTest.js +409 -409
  12. package/{samples/ElementBuilderTest → ElementBuilderTest}/ElementBuilderTest.js +513 -513
  13. package/{samples/ElementEditTest → ElementEditTest}/ElementEditTest.js +110 -110
  14. package/{samples/ElementReaderAdvTest → ElementReaderAdvTest}/ElementReaderAdvTest.js +305 -305
  15. package/{samples/ElementReaderTest → ElementReaderTest}/ElementReaderTest.js +77 -77
  16. package/{samples/EncTest → EncTest}/EncTest.js +175 -175
  17. package/{samples/FDFTest → FDFTest}/FDFTest.js +218 -218
  18. package/{samples/HTML2PDFTest → HTML2PDFTest}/HTML2PDFTest.js +164 -164
  19. package/{samples/HighlightsTest → HighlightsTest}/HighlightsTest.js +97 -97
  20. package/{samples/ImageExtractTest → ImageExtractTest}/ImageExtractTest.js +129 -129
  21. package/{samples/ImpositionTest → ImpositionTest}/ImpositionTest.js +86 -86
  22. package/{samples/InteractiveFormsTest → InteractiveFormsTest}/InteractiveFormsTest.js +381 -381
  23. package/{samples/JBIG2Test → JBIG2Test}/JBIG2Test.js +88 -88
  24. package/{samples/LicenseKey → LicenseKey}/LicenseKey.js +11 -11
  25. package/{samples/LogicalStructureTest → LogicalStructureTest}/LogicalStructureTest.js +250 -250
  26. package/{samples/OCRTest → OCRTest}/OCRTest.js +235 -235
  27. package/{samples/OfficeTemplateTest → OfficeTemplateTest}/OfficeTemplateTest.js +79 -79
  28. package/{samples/OfficeToPDFTest → OfficeToPDFTest}/OfficeToPDFTest.js +125 -125
  29. package/{samples/OptimizerTest → OptimizerTest}/OptimizerTest.js +191 -191
  30. package/{samples/PDF2HtmlTest → PDF2HtmlTest}/PDF2HtmlTest.js +123 -123
  31. package/{samples/PDF2OfficeTest → PDF2OfficeTest}/PDF2OfficeTest.js +158 -158
  32. package/{samples/PDFATest → PDFATest}/PDFATest.js +85 -85
  33. package/{samples/PDFDocMemoryTest → PDFDocMemoryTest}/PDFDocMemoryTest.js +84 -84
  34. package/{samples/PDFDrawTest → PDFDrawTest}/PDFDrawTest.js +305 -305
  35. package/{samples/PDFLayersTest → PDFLayersTest}/PDFLayersTest.js +294 -294
  36. package/{samples/PDFPackageTest → PDFPackageTest}/PDFPackageTest.js +111 -111
  37. package/{samples/PDFPageTest → PDFPageTest}/PDFPageTest.js +189 -189
  38. package/{samples/PDFRedactTest → PDFRedactTest}/PDFRedactTest.js +74 -74
  39. package/{samples/PageLabelsTest → PageLabelsTest}/PageLabelsTest.js +138 -138
  40. package/{samples/PatternTest → PatternTest}/PatternTest.js +226 -226
  41. package/{samples/RectTest → RectTest}/RectTest.js +40 -40
  42. package/{samples/SDFTest → SDFTest}/SDFTest.js +87 -87
  43. package/{samples/StamperTest → StamperTest}/StamperTest.js +255 -255
  44. package/{samples/TestFiles → TestFiles}/Misc-Fixed.pfa +1166 -1166
  45. package/{samples/TestFiles → TestFiles}/SHA-2 Root USERTrust RSA CA Sectigo timestamping.crt +34 -34
  46. package/{samples/TestFiles → TestFiles}/form1_annots.xfdf +33 -33
  47. package/{samples/TestFiles → TestFiles}/form1_data.xfdf +139 -139
  48. package/{samples/TestFiles → TestFiles}/my_stream.txt +2310 -2310
  49. package/{samples/TestFiles → TestFiles}/tiger.svg +378 -378
  50. package/{samples/TextExtractTest → TextExtractTest}/TextExtractTest.js +286 -286
  51. package/{samples/TextSearchTest → TextSearchTest}/TextSearchTest.js +121 -121
  52. package/{samples/U3DTest → U3DTest}/U3DTest.js +104 -104
  53. package/{samples/UndoRedoTest → UndoRedoTest}/UndoRedoTest.js +101 -101
  54. package/{samples/UnicodeWriteTest → UnicodeWriteTest}/UnicodeWriteTest.js +173 -173
  55. package/{samples/WebViewerConvertTest → WebViewerConvertTest}/WebViewerConvertTest.js +135 -135
  56. package/legal.txt +632 -0
  57. package/license.pdf +0 -0
  58. package/package.json +20 -21
  59. package/readme.md +38 -13
  60. package/{samples/runall.bat → runall.bat} +12 -12
  61. package/{samples/runall.sh → runall.sh} +15 -15
  62. /package/{samples/TestFiles → TestFiles}/BusinessCardTemplate.pdf +0 -0
  63. /package/{samples/TestFiles → TestFiles}/Fishermen.docx +0 -0
  64. /package/{samples/TestFiles → TestFiles}/Font_licenses.txt +0 -0
  65. /package/{samples/TestFiles → TestFiles}/GlobalSignRootForTST.cer +0 -0
  66. /package/{samples/TestFiles → TestFiles}/License.txt +0 -0
  67. /package/{samples/TestFiles → TestFiles}/NotoSans_with_hindi.ttf +0 -0
  68. /package/{samples/TestFiles → TestFiles}/Output/empty +0 -0
  69. /package/{samples/TestFiles → TestFiles}/SYH_Letter.docx +0 -0
  70. /package/{samples/TestFiles → TestFiles}/TigerText.pdf +0 -0
  71. /package/{samples/TestFiles → TestFiles}/US061222892-a.pdf +0 -0
  72. /package/{samples/TestFiles → TestFiles}/butterfly.png +0 -0
  73. /package/{samples/TestFiles → TestFiles}/credit card numbers.pdf +0 -0
  74. /package/{samples/TestFiles → TestFiles}/dice.jpg +0 -0
  75. /package/{samples/TestFiles → TestFiles}/dice.u3d +0 -0
  76. /package/{samples/TestFiles → TestFiles}/doc_to_sign.pdf +0 -0
  77. /package/{samples/TestFiles → TestFiles}/factsheet_Arabic.docx +0 -0
  78. /package/{samples/TestFiles → TestFiles}/financial.pdf +0 -0
  79. /package/{samples/TestFiles → TestFiles}/fish.pdf +0 -0
  80. /package/{samples/TestFiles → TestFiles}/font.ttf +0 -0
  81. /package/{samples/TestFiles → TestFiles}/form1.pdf +0 -0
  82. /package/{samples/TestFiles → TestFiles}/form1_data.fdf +0 -0
  83. /package/{samples/TestFiles → TestFiles}/formfields-scanned-withfields.pdf +0 -0
  84. /package/{samples/TestFiles → TestFiles}/formfields-scanned.pdf +0 -0
  85. /package/{samples/TestFiles → TestFiles}/formfields.pdf +0 -0
  86. /package/{samples/TestFiles → TestFiles}/grayscale.tif +0 -0
  87. /package/{samples/TestFiles → TestFiles}/hindi_sample_utf16le.txt +0 -0
  88. /package/{samples/TestFiles → TestFiles}/imagemask.dat +0 -0
  89. /package/{samples/TestFiles → TestFiles}/logo_red.png +0 -0
  90. /package/{samples/TestFiles → TestFiles}/lorem_ipsum.pdf +0 -0
  91. /package/{samples/TestFiles → TestFiles}/multipage.tif +0 -0
  92. /package/{samples/TestFiles → TestFiles}/newsletter.pdf +0 -0
  93. /package/{samples/TestFiles → TestFiles}/newsletter.xod +0 -0
  94. /package/{samples/TestFiles → TestFiles}/numbered.pdf +0 -0
  95. /package/{samples/TestFiles → TestFiles}/op_blend_test.pdf +0 -0
  96. /package/{samples/TestFiles → TestFiles}/palm.jp2 +0 -0
  97. /package/{samples/TestFiles → TestFiles}/paragraphs_and_tables.pdf +0 -0
  98. /package/{samples/TestFiles → TestFiles}/pdfnet.gif +0 -0
  99. /package/{samples/TestFiles → TestFiles}/pdftron.bmp +0 -0
  100. /package/{samples/TestFiles → TestFiles}/pdftron.cer +0 -0
  101. /package/{samples/TestFiles → TestFiles}/pdftron.pfx +0 -0
  102. /package/{samples/TestFiles → TestFiles}/pdftron_smart_substitution.plugin +0 -0
  103. /package/{samples/TestFiles → TestFiles}/peppers.jpg +0 -0
  104. /package/{samples/TestFiles → TestFiles}/signature.jpg +0 -0
  105. /package/{samples/TestFiles → TestFiles}/simple-emf.emf +0 -0
  106. /package/{samples/TestFiles → TestFiles}/simple-excel_2007.xlsx +0 -0
  107. /package/{samples/TestFiles → TestFiles}/simple-outlook.msg +0 -0
  108. /package/{samples/TestFiles → TestFiles}/simple-powerpoint_2007.pptx +0 -0
  109. /package/{samples/TestFiles → TestFiles}/simple-publisher.pub +0 -0
  110. /package/{samples/TestFiles → TestFiles}/simple-rtf.rtf +0 -0
  111. /package/{samples/TestFiles → TestFiles}/simple-text.txt +0 -0
  112. /package/{samples/TestFiles → TestFiles}/simple-visio.vsd +0 -0
  113. /package/{samples/TestFiles → TestFiles}/simple-webpage.html +0 -0
  114. /package/{samples/TestFiles → TestFiles}/simple-webpage.mht +0 -0
  115. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/colorschememapping.xml +0 -0
  116. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/filelist.xml +0 -0
  117. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image001.gif +0 -0
  118. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image002.png +0 -0
  119. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image003.jpg +0 -0
  120. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image004.emz +0 -0
  121. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image005.gif +0 -0
  122. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image006.png +0 -0
  123. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/image007.gif +0 -0
  124. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/oledata.mso +0 -0
  125. /package/{samples/TestFiles → TestFiles}/simple-webpage_files/themedata.thmx +0 -0
  126. /package/{samples/TestFiles → TestFiles}/simple-word_2007.docx +0 -0
  127. /package/{samples/TestFiles → TestFiles}/simple-xps.xps +0 -0
  128. /package/{samples/TestFiles → TestFiles}/table.pdf +0 -0
  129. /package/{samples/TestFiles → TestFiles}/tagged.pdf +0 -0
  130. /package/{samples/TestFiles → TestFiles}/the_rime_of_the_ancient_mariner.docx +0 -0
  131. /package/{samples/TestFiles → TestFiles}/tiger.pdf +0 -0
  132. /package/{samples/TestFiles → TestFiles}/waiver.pdf +0 -0
  133. /package/{samples/TestFiles → TestFiles}/waiver_withApprovalField.pdf +0 -0
  134. /package/{samples/TestFiles → TestFiles}/waiver_withApprovalField_certified.pdf +0 -0
  135. /package/{samples/TestFiles → TestFiles}/waiver_withApprovalField_certified_approved.pdf +0 -0
@@ -1,287 +1,287 @@
1
- //---------------------------------------------------------------------------------------
2
- // Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3
- // Consult legal.txt regarding legal and license information.
4
- //---------------------------------------------------------------------------------------
5
-
6
-
7
- const { PDFNet } = require('@pdftron/pdfnet-node');
8
- const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
-
10
- ((exports) => {
11
-
12
- exports.runTextExtractTest = async () => {
13
- // A utility method used to dump all text content in the console window.
14
- const dumpAllText = async (reader) => {
15
- let element;
16
- let bbox;
17
- let arr;
18
- while ((element = await reader.next()) !== null) {
19
- switch (await element.getType()) {
20
- case PDFNet.Element.Type.e_text_begin:
21
- console.log('\n--> Text Block Begin');
22
- break;
23
- case PDFNet.Element.Type.e_text_end:
24
- console.log('\n--> Text Block End');
25
- break;
26
- case PDFNet.Element.Type.e_text:
27
- bbox = await element.getBBox();
28
- console.log('\n--> BBox: ' + bbox.x1.toFixed(2) + ', ' + bbox.y1.toFixed(2) + ', ' + bbox.x2.toFixed(2) + ', ' + bbox.y2.toFixed(2) + '\n');
29
- arr = await element.getTextString();
30
- console.log(arr);
31
- break;
32
- case PDFNet.Element.Type.e_text_new_line:
33
- console.log('\n--> New Line');
34
- break;
35
- case PDFNet.Element.Type.e_form:
36
- reader.formBegin();
37
- await dumpAllText(reader);
38
- reader.end();
39
- break;
40
- }
41
- }
42
- };
43
-
44
- // helper method for ReadTextFromRect
45
- const rectTextSearch = async (reader, pos, srchStr) => {
46
- let element;
47
- let arr;
48
- while ((element = await reader.next()) !== null) {
49
- let bbox;
50
- switch (await element.getType()) {
51
- case PDFNet.Element.Type.e_text:
52
- bbox = await element.getBBox();
53
- if (await bbox.intersectRect(bbox, pos)) {
54
- arr = await element.getTextString();
55
- srchStr += arr + '\n';
56
- }
57
- break;
58
- case PDFNet.Element.Type.e_text_new_line:
59
- break;
60
- case PDFNet.Element.Type.e_form:
61
- reader.formBegin();
62
- srchStr += await rectTextSearch(reader, pos, srchStr); // possibly need srchStr = ...
63
- reader.end();
64
- break;
65
- }
66
- }
67
- return srchStr;
68
- };
69
-
70
- const readTextFromRect = async (page, pos, reader) => {
71
- let srchStr = '';
72
- reader.beginOnPage(page); // uses default parameters.
73
- srchStr += await rectTextSearch(reader, pos, srchStr);
74
- reader.end();
75
- return srchStr;
76
- };
77
-
78
- const twoDigitHex = function (num) {
79
- const hexStr = num.toString(16).toUpperCase();
80
- return ('0' + hexStr).substr(-2);
81
- }
82
-
83
- const printStyle = async (s) => {
84
- const rgb = await s.getColor();
85
- const rColorVal = await rgb.get(0);
86
- const gColorVal = await rgb.get(1);
87
- const bColorVal = await rgb.get(2);
88
- const rgbHex = twoDigitHex(rColorVal) + twoDigitHex(gColorVal) + twoDigitHex(bColorVal)
89
- const fontName = await s.getFontName();
90
- const fontSize = await s.getFontSize();
91
- const serifOutput = ((await s.isSerif()) ? ' sans-serif; ' : ' ');
92
- const returnString = ' style="font-family:' + fontName + '; font-size:' + fontSize + ';' + serifOutput + 'color:#' + rgbHex + ';"';
93
- return returnString;
94
- };
95
-
96
- const main = async () => {
97
- // eslint-disable-next-line no-unused-vars
98
- let ret = 0;
99
-
100
- // Relative path to the folder containing test files.
101
- const inputPath = '../TestFiles/';
102
- const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf
103
-
104
- const example1Basic = false;
105
- const example2XML = false;
106
- const example3Wordlist = false;
107
- const example4Advanced = true;
108
- const example5LowLevel = false;
109
-
110
- try {
111
- await PDFNet.startDeallocateStack();
112
- const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
113
- doc.initSecurityHandler();
114
-
115
- const page = await doc.getPage(1);
116
-
117
- if (page.id === '0') {
118
- console.log('Page not found.');
119
- return 1;
120
- }
121
-
122
- const txt = await PDFNet.TextExtractor.create();
123
- txt.begin(page);
124
-
125
- let text;
126
- let line;
127
- let word;
128
-
129
- // Example 1. Get all text on the page in a single string.
130
- // Words will be separated with space or new line characters.
131
- if (example1Basic) {
132
- const wordCount = await txt.getWordCount();
133
- console.log('Word Count: ' + wordCount);
134
- text = await txt.getAsText();
135
- console.log('\n\n- GetAsText --------------------------');
136
- console.log(text);
137
- console.log('-----------------------------------------------------------');
138
- }
139
-
140
- // Example 2. Get XML logical structure for the page.
141
- if (example2XML) {
142
- text = await txt.getAsXML(PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info);
143
- console.log('\n\n- GetAsXML --------------------------\n' + text);
144
- console.log('-----------------------------------------------------------');
145
- }
146
-
147
- // Example 3. Extract words one by one.
148
- if (example3Wordlist) {
149
- line = await txt.getFirstLine();
150
- for (; (await line.isValid()); line = (await line.getNextLine())) {
151
- for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
152
- text = await word.getString();
153
- console.log(text);
154
- }
155
- }
156
- console.log('-----------------------------------------------------------');
157
- }
158
-
159
- // Example 4. A more advanced text extraction example.
160
- // The output is XML structure containing paragraphs, lines, words,
161
- // as well as style and positioning information.
162
- if (example4Advanced) {
163
- let b;
164
- let q;
165
- let curFlowID = -1;
166
- let curParaID = -1;
167
-
168
- console.log('<PDFText>');
169
-
170
- // For each line on the page...
171
- for (line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
172
- if ((await line.getNumWords()) === 0) {
173
- continue;
174
- }
175
- if (curFlowID !== await line.getFlowID()) {
176
- if (curFlowID !== -1) {
177
- if (curParaID !== -1) {
178
- curParaID = -1;
179
- console.log('</Para>');
180
- }
181
- console.log('</Flow>');
182
- }
183
- curFlowID = await line.getFlowID();
184
- console.log('<Flow id="' + curFlowID + '">');
185
- }
186
- if (curParaID !== await line.getParagraphID()) {
187
- if (curParaID !== -1) {
188
- console.log('</Para>');
189
- }
190
- curParaID = await line.getParagraphID();
191
- console.log('<Para id="' + curParaID + '">');
192
- }
193
- b = await line.getBBox();
194
- const lineStyle = await line.getStyle();
195
- let outputStringLineBox = '<Line box="' + b.x1.toFixed(2) + ', ' + b.y1.toFixed(2) + ', ' + b.x2.toFixed(2) + ', ' + b.y2.toFixed(2) + '"';
196
- outputStringLineBox += (await printStyle(lineStyle));
197
- const currentLineNum = await line.getCurrentNum();
198
- outputStringLineBox += ' cur_num="' + currentLineNum + '">';
199
- console.log(outputStringLineBox);
200
-
201
- // For each word in the line...
202
- for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
203
- // output bounding box for the word
204
- q = await word.getBBox();
205
- const currentNum = await word.getCurrentNum();
206
- let outputStringWord = '<Word box="' + q.x1.toFixed(2) + ', ' + q.y1.toFixed(2) + ', ' + q.x2.toFixed(2) + ', ' + q.y2.toFixed(2) + '" cur_num="' + currentNum + '"';
207
- const sz = await word.getStringLen();
208
- if (sz === 0) {
209
- continue;
210
- }
211
- // if the word style is different from the parent style, output the new style
212
- const sty = await word.getStyle();
213
- if (!(await sty.compare(lineStyle))) {
214
- outputStringWord += await printStyle(sty);
215
- }
216
- outputStringWord += '>' + (await word.getString()) + '</Word>';
217
- console.log(outputStringWord);
218
- }
219
- console.log('</Line>');
220
- }
221
- if (curFlowID !== -1) {
222
- if (curParaID !== -1) {
223
- curParaID = -1;
224
- console.log('</Para>');
225
- }
226
- console.log('</Flow>');
227
- }
228
- console.log('</PDFText>');
229
- }
230
- await PDFNet.endDeallocateStack();
231
- } catch (err) {
232
- console.log(err);
233
- console.log(err.stack);
234
- ret = 1;
235
- }
236
-
237
-
238
- if (example5LowLevel) {
239
- ret = 0;
240
- try {
241
- await PDFNet.startDeallocateStack();
242
- const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
243
- doc.initSecurityHandler();
244
-
245
- // Example 1. Extract all text content from the document
246
- const reader = await PDFNet.ElementReader.create();
247
- const itr = await doc.getPageIterator(1);
248
-
249
- // Read every page
250
- for (itr; await itr.hasNext(); itr.next()) {
251
- const page = await itr.current();
252
- reader.beginOnPage(page);
253
- await dumpAllText(reader);
254
- reader.end();
255
- }
256
- // Example 2. Extract text content based on the
257
- // selection rectangle.
258
- console.log('\n----------------------------------------------------');
259
- console.log('Extract text based on the selection rectangle.');
260
- console.log('----------------------------------------------------');
261
-
262
-
263
- const firstPage = await (await doc.getPageIterator()).current();
264
- let s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(27, 392, 563, 534)), reader);
265
- console.log('\nField 1: ' + s1);
266
-
267
- s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(28, 551, 106, 623)), reader);
268
- console.log('Field 2: ' + s1);
269
-
270
- s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(208, 550, 387, 621)), reader);
271
- console.log('Field 3: ' + s1);
272
-
273
- // ...
274
- console.log('Done');
275
- await PDFNet.endDeallocateStack();
276
- } catch (err) {
277
- console.log(err.stack);
278
- ret = 1;
279
- }
280
- }
281
- };
282
- PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) { console.log('Error: ' + JSON.stringify(error)); }).then(function () { return PDFNet.shutdown(); });
283
- };
284
- exports.runTextExtractTest();
285
- })(exports);
286
- // eslint-disable-next-line spaced-comment
1
+ //---------------------------------------------------------------------------------------
2
+ // Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3
+ // Consult legal.txt regarding legal and license information.
4
+ //---------------------------------------------------------------------------------------
5
+
6
+
7
+ const { PDFNet } = require('@pdftron/pdfnet-node');
8
+ const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
+
10
+ ((exports) => {
11
+
12
+ exports.runTextExtractTest = async () => {
13
+ // A utility method used to dump all text content in the console window.
14
+ const dumpAllText = async (reader) => {
15
+ let element;
16
+ let bbox;
17
+ let arr;
18
+ while ((element = await reader.next()) !== null) {
19
+ switch (await element.getType()) {
20
+ case PDFNet.Element.Type.e_text_begin:
21
+ console.log('\n--> Text Block Begin');
22
+ break;
23
+ case PDFNet.Element.Type.e_text_end:
24
+ console.log('\n--> Text Block End');
25
+ break;
26
+ case PDFNet.Element.Type.e_text:
27
+ bbox = await element.getBBox();
28
+ console.log('\n--> BBox: ' + bbox.x1.toFixed(2) + ', ' + bbox.y1.toFixed(2) + ', ' + bbox.x2.toFixed(2) + ', ' + bbox.y2.toFixed(2) + '\n');
29
+ arr = await element.getTextString();
30
+ console.log(arr);
31
+ break;
32
+ case PDFNet.Element.Type.e_text_new_line:
33
+ console.log('\n--> New Line');
34
+ break;
35
+ case PDFNet.Element.Type.e_form:
36
+ reader.formBegin();
37
+ await dumpAllText(reader);
38
+ reader.end();
39
+ break;
40
+ }
41
+ }
42
+ };
43
+
44
+ // helper method for ReadTextFromRect
45
+ const rectTextSearch = async (reader, pos, srchStr) => {
46
+ let element;
47
+ let arr;
48
+ while ((element = await reader.next()) !== null) {
49
+ let bbox;
50
+ switch (await element.getType()) {
51
+ case PDFNet.Element.Type.e_text:
52
+ bbox = await element.getBBox();
53
+ if (await bbox.intersectRect(bbox, pos)) {
54
+ arr = await element.getTextString();
55
+ srchStr += arr + '\n';
56
+ }
57
+ break;
58
+ case PDFNet.Element.Type.e_text_new_line:
59
+ break;
60
+ case PDFNet.Element.Type.e_form:
61
+ reader.formBegin();
62
+ srchStr += await rectTextSearch(reader, pos, srchStr); // possibly need srchStr = ...
63
+ reader.end();
64
+ break;
65
+ }
66
+ }
67
+ return srchStr;
68
+ };
69
+
70
+ const readTextFromRect = async (page, pos, reader) => {
71
+ let srchStr = '';
72
+ reader.beginOnPage(page); // uses default parameters.
73
+ srchStr += await rectTextSearch(reader, pos, srchStr);
74
+ reader.end();
75
+ return srchStr;
76
+ };
77
+
78
+ const twoDigitHex = function (num) {
79
+ const hexStr = num.toString(16).toUpperCase();
80
+ return ('0' + hexStr).substr(-2);
81
+ }
82
+
83
+ const printStyle = async (s) => {
84
+ const rgb = await s.getColor();
85
+ const rColorVal = await rgb.get(0);
86
+ const gColorVal = await rgb.get(1);
87
+ const bColorVal = await rgb.get(2);
88
+ const rgbHex = twoDigitHex(rColorVal) + twoDigitHex(gColorVal) + twoDigitHex(bColorVal)
89
+ const fontName = await s.getFontName();
90
+ const fontSize = await s.getFontSize();
91
+ const serifOutput = ((await s.isSerif()) ? ' sans-serif; ' : ' ');
92
+ const returnString = ' style="font-family:' + fontName + '; font-size:' + fontSize + ';' + serifOutput + 'color:#' + rgbHex + ';"';
93
+ return returnString;
94
+ };
95
+
96
+ const main = async () => {
97
+ // eslint-disable-next-line no-unused-vars
98
+ let ret = 0;
99
+
100
+ // Relative path to the folder containing test files.
101
+ const inputPath = '../TestFiles/';
102
+ const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf
103
+
104
+ const example1Basic = false;
105
+ const example2XML = false;
106
+ const example3Wordlist = false;
107
+ const example4Advanced = true;
108
+ const example5LowLevel = false;
109
+
110
+ try {
111
+ await PDFNet.startDeallocateStack();
112
+ const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
113
+ doc.initSecurityHandler();
114
+
115
+ const page = await doc.getPage(1);
116
+
117
+ if (page.id === '0') {
118
+ console.log('Page not found.');
119
+ return 1;
120
+ }
121
+
122
+ const txt = await PDFNet.TextExtractor.create();
123
+ txt.begin(page);
124
+
125
+ let text;
126
+ let line;
127
+ let word;
128
+
129
+ // Example 1. Get all text on the page in a single string.
130
+ // Words will be separated with space or new line characters.
131
+ if (example1Basic) {
132
+ const wordCount = await txt.getWordCount();
133
+ console.log('Word Count: ' + wordCount);
134
+ text = await txt.getAsText();
135
+ console.log('\n\n- GetAsText --------------------------');
136
+ console.log(text);
137
+ console.log('-----------------------------------------------------------');
138
+ }
139
+
140
+ // Example 2. Get XML logical structure for the page.
141
+ if (example2XML) {
142
+ text = await txt.getAsXML(PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info);
143
+ console.log('\n\n- GetAsXML --------------------------\n' + text);
144
+ console.log('-----------------------------------------------------------');
145
+ }
146
+
147
+ // Example 3. Extract words one by one.
148
+ if (example3Wordlist) {
149
+ line = await txt.getFirstLine();
150
+ for (; (await line.isValid()); line = (await line.getNextLine())) {
151
+ for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
152
+ text = await word.getString();
153
+ console.log(text);
154
+ }
155
+ }
156
+ console.log('-----------------------------------------------------------');
157
+ }
158
+
159
+ // Example 4. A more advanced text extraction example.
160
+ // The output is XML structure containing paragraphs, lines, words,
161
+ // as well as style and positioning information.
162
+ if (example4Advanced) {
163
+ let b;
164
+ let q;
165
+ let curFlowID = -1;
166
+ let curParaID = -1;
167
+
168
+ console.log('<PDFText>');
169
+
170
+ // For each line on the page...
171
+ for (line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
172
+ if ((await line.getNumWords()) === 0) {
173
+ continue;
174
+ }
175
+ if (curFlowID !== await line.getFlowID()) {
176
+ if (curFlowID !== -1) {
177
+ if (curParaID !== -1) {
178
+ curParaID = -1;
179
+ console.log('</Para>');
180
+ }
181
+ console.log('</Flow>');
182
+ }
183
+ curFlowID = await line.getFlowID();
184
+ console.log('<Flow id="' + curFlowID + '">');
185
+ }
186
+ if (curParaID !== await line.getParagraphID()) {
187
+ if (curParaID !== -1) {
188
+ console.log('</Para>');
189
+ }
190
+ curParaID = await line.getParagraphID();
191
+ console.log('<Para id="' + curParaID + '">');
192
+ }
193
+ b = await line.getBBox();
194
+ const lineStyle = await line.getStyle();
195
+ let outputStringLineBox = '<Line box="' + b.x1.toFixed(2) + ', ' + b.y1.toFixed(2) + ', ' + b.x2.toFixed(2) + ', ' + b.y2.toFixed(2) + '"';
196
+ outputStringLineBox += (await printStyle(lineStyle));
197
+ const currentLineNum = await line.getCurrentNum();
198
+ outputStringLineBox += ' cur_num="' + currentLineNum + '">';
199
+ console.log(outputStringLineBox);
200
+
201
+ // For each word in the line...
202
+ for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
203
+ // output bounding box for the word
204
+ q = await word.getBBox();
205
+ const currentNum = await word.getCurrentNum();
206
+ let outputStringWord = '<Word box="' + q.x1.toFixed(2) + ', ' + q.y1.toFixed(2) + ', ' + q.x2.toFixed(2) + ', ' + q.y2.toFixed(2) + '" cur_num="' + currentNum + '"';
207
+ const sz = await word.getStringLen();
208
+ if (sz === 0) {
209
+ continue;
210
+ }
211
+ // if the word style is different from the parent style, output the new style
212
+ const sty = await word.getStyle();
213
+ if (!(await sty.compare(lineStyle))) {
214
+ outputStringWord += await printStyle(sty);
215
+ }
216
+ outputStringWord += '>' + (await word.getString()) + '</Word>';
217
+ console.log(outputStringWord);
218
+ }
219
+ console.log('</Line>');
220
+ }
221
+ if (curFlowID !== -1) {
222
+ if (curParaID !== -1) {
223
+ curParaID = -1;
224
+ console.log('</Para>');
225
+ }
226
+ console.log('</Flow>');
227
+ }
228
+ console.log('</PDFText>');
229
+ }
230
+ await PDFNet.endDeallocateStack();
231
+ } catch (err) {
232
+ console.log(err);
233
+ console.log(err.stack);
234
+ ret = 1;
235
+ }
236
+
237
+
238
+ if (example5LowLevel) {
239
+ ret = 0;
240
+ try {
241
+ await PDFNet.startDeallocateStack();
242
+ const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
243
+ doc.initSecurityHandler();
244
+
245
+ // Example 1. Extract all text content from the document
246
+ const reader = await PDFNet.ElementReader.create();
247
+ const itr = await doc.getPageIterator(1);
248
+
249
+ // Read every page
250
+ for (itr; await itr.hasNext(); itr.next()) {
251
+ const page = await itr.current();
252
+ reader.beginOnPage(page);
253
+ await dumpAllText(reader);
254
+ reader.end();
255
+ }
256
+ // Example 2. Extract text content based on the
257
+ // selection rectangle.
258
+ console.log('\n----------------------------------------------------');
259
+ console.log('Extract text based on the selection rectangle.');
260
+ console.log('----------------------------------------------------');
261
+
262
+
263
+ const firstPage = await (await doc.getPageIterator()).current();
264
+ let s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(27, 392, 563, 534)), reader);
265
+ console.log('\nField 1: ' + s1);
266
+
267
+ s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(28, 551, 106, 623)), reader);
268
+ console.log('Field 2: ' + s1);
269
+
270
+ s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(208, 550, 387, 621)), reader);
271
+ console.log('Field 3: ' + s1);
272
+
273
+ // ...
274
+ console.log('Done');
275
+ await PDFNet.endDeallocateStack();
276
+ } catch (err) {
277
+ console.log(err.stack);
278
+ ret = 1;
279
+ }
280
+ }
281
+ };
282
+ PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) { console.log('Error: ' + JSON.stringify(error)); }).then(function () { return PDFNet.shutdown(); });
283
+ };
284
+ exports.runTextExtractTest();
285
+ })(exports);
286
+ // eslint-disable-next-line spaced-comment
287
287
  //# sourceURL=TextExtractTest.js