@pdftron/pdfnet-node-samples 9.2.0 → 9.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. package/package.json +2 -2
  2. package/samples/AddImageTest/AddImageTest.js +115 -115
  3. package/samples/AdvancedImagingTest/AdvancedImagingTest.js +78 -64
  4. package/samples/AnnotationTest/AnnotationTest.js +641 -641
  5. package/samples/BookmarkTest/BookmarkTest.js +219 -219
  6. package/samples/CAD2PDFTest/CAD2PDFTest.js +4 -4
  7. package/samples/ContentReplacerTest/ContentReplacerTest.js +75 -75
  8. package/samples/ConvertTest/ConvertTest.js +5 -5
  9. package/samples/DigitalSignaturesTest/DigitalSignaturesTest.js +1 -1
  10. package/samples/ElementBuilderTest/ElementBuilderTest.js +513 -513
  11. package/samples/ElementEditTest/ElementEditTest.js +110 -110
  12. package/samples/ElementReaderAdvTest/ElementReaderAdvTest.js +305 -305
  13. package/samples/ElementReaderTest/ElementReaderTest.js +77 -77
  14. package/samples/EncTest/EncTest.js +175 -175
  15. package/samples/FDFTest/FDFTest.js +1 -1
  16. package/samples/HTML2PDFTest/HTML2PDFTest.js +53 -43
  17. package/samples/HighlightsTest/HighlightsTest.js +97 -0
  18. package/samples/ImageExtractTest/ImageExtractTest.js +1 -1
  19. package/samples/ImpositionTest/ImpositionTest.js +1 -1
  20. package/samples/InteractiveFormsTest/InteractiveFormsTest.js +1 -1
  21. package/samples/JBIG2Test/JBIG2Test.js +1 -1
  22. package/samples/LicenseKey/LicenseKey.js +11 -11
  23. package/samples/LogicalStructureTest/LogicalStructureTest.js +250 -250
  24. package/samples/OCRTest/OCRTest.js +19 -34
  25. package/samples/OfficeTemplateTest/OfficeTemplateTest.js +77 -0
  26. package/samples/OfficeToPDFTest/OfficeToPDFTest.js +1 -1
  27. package/samples/OptimizerTest/OptimizerTest.js +1 -1
  28. package/samples/PDF2HtmlTest/PDF2HtmlTest.js +123 -123
  29. package/samples/PDF2OfficeTest/PDF2OfficeTest.js +158 -158
  30. package/samples/PDFATest/PDFATest.js +85 -85
  31. package/samples/PDFDocMemoryTest/PDFDocMemoryTest.js +1 -1
  32. package/samples/PDFDrawTest/PDFDrawTest.js +305 -305
  33. package/samples/PDFLayersTest/PDFLayersTest.js +294 -294
  34. package/samples/PDFPackageTest/PDFPackageTest.js +1 -1
  35. package/samples/PDFPageTest/PDFPageTest.js +189 -189
  36. package/samples/PDFRedactTest/PDFRedactTest.js +74 -74
  37. package/samples/PageLabelsTest/PageLabelsTest.js +1 -1
  38. package/samples/PatternTest/PatternTest.js +1 -1
  39. package/samples/RectTest/RectTest.js +40 -40
  40. package/samples/SDFTest/SDFTest.js +88 -88
  41. package/samples/StamperTest/StamperTest.js +255 -255
  42. package/samples/TestFiles/Misc-Fixed.pfa +1166 -1166
  43. package/samples/TestFiles/SHA-2 Root USERTrust RSA CA Sectigo timestamping.crt +34 -34
  44. package/samples/TestFiles/form1_annots.xfdf +33 -33
  45. package/samples/TestFiles/form1_data.xfdf +139 -139
  46. package/samples/TestFiles/my_stream.txt +2310 -2310
  47. package/samples/TextExtractTest/TextExtractTest.js +286 -286
  48. package/samples/TextSearchTest/TextSearchTest.js +121 -121
  49. package/samples/U3DTest/U3DTest.js +1 -1
  50. package/samples/UndoRedoTest/UndoRedoTest.js +101 -101
  51. package/samples/UnicodeWriteTest/UnicodeWriteTest.js +173 -173
  52. package/samples/WebViewerConvertTest/WebViewerConvertTest.js +1 -1
  53. package/samples/runall.bat +6 -7
  54. package/samples/runall.sh +15 -14
  55. package/samples/AddImageTest/RunTest.bat +0 -2
  56. package/samples/AddImageTest/RunTest.sh +0 -2
  57. package/samples/AdvancedImagingTest/RunTest.bat +0 -2
  58. package/samples/AdvancedImagingTest/RunTest.sh +0 -2
  59. package/samples/AnnotationTest/RunTest.bat +0 -2
  60. package/samples/AnnotationTest/RunTest.sh +0 -2
  61. package/samples/BookmarkTest/RunTest.bat +0 -2
  62. package/samples/BookmarkTest/RunTest.sh +0 -2
  63. package/samples/CAD2PDFTest/RunTest.bat +0 -2
  64. package/samples/CAD2PDFTest/RunTest.sh +0 -2
  65. package/samples/ContentReplacerTest/RunTest.bat +0 -2
  66. package/samples/ContentReplacerTest/RunTest.sh +0 -2
  67. package/samples/ConvertTest/RunTest.bat +0 -2
  68. package/samples/ConvertTest/RunTest.sh +0 -2
  69. package/samples/DigitalSignaturesTest/RunTest.bat +0 -2
  70. package/samples/DigitalSignaturesTest/RunTest.sh +0 -2
  71. package/samples/ElementBuilderTest/RunTest.bat +0 -2
  72. package/samples/ElementBuilderTest/RunTest.sh +0 -2
  73. package/samples/ElementEditTest/RunTest.bat +0 -2
  74. package/samples/ElementEditTest/RunTest.sh +0 -2
  75. package/samples/ElementReaderAdvTest/RunTest.bat +0 -2
  76. package/samples/ElementReaderAdvTest/RunTest.sh +0 -2
  77. package/samples/ElementReaderTest/RunTest.bat +0 -2
  78. package/samples/ElementReaderTest/RunTest.sh +0 -2
  79. package/samples/EncTest/RunTest.bat +0 -2
  80. package/samples/EncTest/RunTest.sh +0 -2
  81. package/samples/FDFTest/RunTest.bat +0 -2
  82. package/samples/FDFTest/RunTest.sh +0 -2
  83. package/samples/HTML2PDFTest/RunTest.bat +0 -2
  84. package/samples/HTML2PDFTest/RunTest.sh +0 -2
  85. package/samples/ImageExtractTest/RunTest.bat +0 -2
  86. package/samples/ImageExtractTest/RunTest.sh +0 -2
  87. package/samples/ImpositionTest/RunTest.bat +0 -2
  88. package/samples/ImpositionTest/RunTest.sh +0 -2
  89. package/samples/InteractiveFormsTest/RunTest.bat +0 -2
  90. package/samples/InteractiveFormsTest/RunTest.sh +0 -2
  91. package/samples/JBIG2Test/RunTest.bat +0 -2
  92. package/samples/JBIG2Test/RunTest.sh +0 -2
  93. package/samples/LogicalStructureTest/RunTest.bat +0 -2
  94. package/samples/LogicalStructureTest/RunTest.sh +0 -2
  95. package/samples/OCRTest/RunTest.bat +0 -2
  96. package/samples/OCRTest/RunTest.sh +0 -2
  97. package/samples/OfficeToPDFTest/RunTest.bat +0 -2
  98. package/samples/OfficeToPDFTest/RunTest.sh +0 -2
  99. package/samples/OptimizerTest/RunTest.bat +0 -2
  100. package/samples/OptimizerTest/RunTest.sh +0 -2
  101. package/samples/PDF2HtmlTest/RunTest.bat +0 -2
  102. package/samples/PDF2HtmlTest/RunTest.sh +0 -2
  103. package/samples/PDF2OfficeTest/RunTest.bat +0 -2
  104. package/samples/PDF2OfficeTest/RunTest.sh +0 -2
  105. package/samples/PDFATest/RunTest.bat +0 -2
  106. package/samples/PDFATest/RunTest.sh +0 -2
  107. package/samples/PDFDocMemoryTest/RunTest.bat +0 -2
  108. package/samples/PDFDocMemoryTest/RunTest.sh +0 -2
  109. package/samples/PDFDrawTest/RunTest.bat +0 -2
  110. package/samples/PDFDrawTest/RunTest.sh +0 -2
  111. package/samples/PDFLayersTest/RunTest.bat +0 -2
  112. package/samples/PDFLayersTest/RunTest.sh +0 -2
  113. package/samples/PDFPackageTest/RunTest.bat +0 -2
  114. package/samples/PDFPackageTest/RunTest.sh +0 -2
  115. package/samples/PDFPageTest/RunTest.bat +0 -2
  116. package/samples/PDFPageTest/RunTest.sh +0 -2
  117. package/samples/PDFRedactTest/RunTest.bat +0 -2
  118. package/samples/PDFRedactTest/RunTest.sh +0 -2
  119. package/samples/PageLabelsTest/RunTest.bat +0 -2
  120. package/samples/PageLabelsTest/RunTest.sh +0 -2
  121. package/samples/PatternTest/RunTest.bat +0 -2
  122. package/samples/PatternTest/RunTest.sh +0 -2
  123. package/samples/RectTest/RunTest.bat +0 -2
  124. package/samples/RectTest/RunTest.sh +0 -2
  125. package/samples/SDFTest/RunTest.bat +0 -2
  126. package/samples/SDFTest/RunTest.sh +0 -2
  127. package/samples/StamperTest/RunTest.bat +0 -2
  128. package/samples/StamperTest/RunTest.sh +0 -2
  129. package/samples/TextExtractTest/RunTest.bat +0 -2
  130. package/samples/TextExtractTest/RunTest.sh +0 -2
  131. package/samples/TextSearchTest/RunTest.bat +0 -2
  132. package/samples/TextSearchTest/RunTest.sh +0 -2
  133. package/samples/U3DTest/RunTest.bat +0 -2
  134. package/samples/U3DTest/RunTest.sh +0 -2
  135. package/samples/UndoRedoTest/RunTest.bat +0 -2
  136. package/samples/UndoRedoTest/RunTest.sh +0 -2
  137. package/samples/UnicodeWriteTest/RunTest.bat +0 -2
  138. package/samples/UnicodeWriteTest/RunTest.sh +0 -2
  139. package/samples/WebViewerConvertTest/RunTest.bat +0 -2
  140. package/samples/WebViewerConvertTest/RunTest.sh +0 -2
@@ -1,251 +1,251 @@
1
- //---------------------------------------------------------------------------------------
2
- // Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3
- // Consult legal.txt regarding legal and license information.
4
- //---------------------------------------------------------------------------------------
5
-
6
- //---------------------------------------------------------------------------------------
7
- // This sample explores the structure and content of a tagged PDF document and dumps
8
- // the structure information to the console window.
9
- //
10
- // In tagged PDF documents StructTree acts as a central repository for information
11
- // related to a PDF document's logical structure. The tree consists of StructElement-s
12
- // and ContentItem-s which are leaf nodes of the structure tree.
13
- //
14
- // The sample can be extended to access and extract the marked-content elements such
15
- // as text and images.
16
- //---------------------------------------------------------------------------------------
17
-
18
-
19
- const { PDFNet } = require('@pdftron/pdfnet-node');
20
- const PDFTronLicense = require('../LicenseKey/LicenseKey');
21
-
22
- ((exports) => {
23
-
24
- exports.runLogicalStructureTest = () => {
25
-
26
- const printAndIndent = (printState, indent) => {
27
- console.log(printState.str);
28
-
29
- let indentStr = '';
30
- for (let i = 0; i < indent; ++i) {
31
- indentStr += ' ';
32
- }
33
- printState.str = indentStr;
34
- };
35
-
36
- // Used in code snippet 1.
37
- const processStructElement = async(element, indent, printState) => {
38
- if (!(await element.isValid())) {
39
- return;
40
- }
41
-
42
-
43
- // Print out the type and title info, if any.
44
- printAndIndent(printState, indent++);
45
- printState.str += 'Type: ' + (await element.getType());
46
- if (await element.hasTitle()) {
47
- printState.str += '. Title: ' + (await element.getTitle());
48
- }
49
-
50
- const num = await element.getNumKids();
51
- for (let i = 0; i < num; ++i) {
52
- // Check is the kid is a leaf node (i.e. it is a ContentItem).
53
- if (await element.isContentItem(i)) {
54
- const cont = await element.getAsContentItem(i);
55
- const type = await cont.getType();
56
-
57
- const page = await cont.getPage();
58
-
59
- printAndIndent(printState, indent);
60
- printState.str += 'Content Item. Part of page #' + (await page.getIndex());
61
-
62
- printAndIndent(printState, indent);
63
- switch (type) {
64
- case PDFNet.ContentItem.Type.e_MCID:
65
- case PDFNet.ContentItem.Type.e_MCR:
66
- printState.str += 'MCID: ' + (await cont.getMCID());
67
- break;
68
- case PDFNet.ContentItem.Type.e_OBJR:
69
- {
70
- printState.str += 'OBJR ';
71
- const refObj = await cont.getRefObj();
72
- if (refObj) {
73
- printState.str += '- Referenced Object#: ' + refObj.getObjNum();
74
- }
75
- }
76
- break;
77
- default:
78
- break;
79
- }
80
- } else { // the kid is another StructElement node.
81
- await processStructElement(await element.getAsStructElem(i), indent, printState);
82
- }
83
- }
84
- };
85
-
86
- // Used in code snippet 2.
87
- const processElements = async(reader, printState) => {
88
- let element;
89
- while (element = await reader.next()) { // Read page contents
90
- // In this sample we process only paths & text, but the code can be
91
- // extended to handle any element type.
92
- const type = await element.getType();
93
- if (type === PDFNet.Element.Type.e_path || type === PDFNet.Element.Type.e_text || type === PDFNet.Element.Type.e_path) {
94
- switch (type) {
95
- case PDFNet.Element.Type.e_path: // Process path ...
96
- printState.str += '\nPATH: ';
97
- break;
98
- case PDFNet.Element.Type.e_text: // Process text ...
99
- printState.str += '\nTEXT: ' + (await element.getTextString()) + '\n';
100
- break;
101
- case PDFNet.Element.Type.e_form: // Process form XObjects
102
- printState.str += '\nFORM XObject: ';
103
- // reader.formBegin();
104
- // await ProcessElements(reader);
105
- // reader.end();
106
- break;
107
- }
108
-
109
- // Check if the element is associated with any structural element.
110
- // Content items are leaf nodes of the structure tree.
111
- const structParent = await element.getParentStructElement();
112
- if (await structParent.isValid()) {
113
- // Print out the parent structural element's type, title, and object number.
114
- printState.str += ' Type: ' + (await structParent.getType()) + ', MCID: ' + (await element.getStructMCID());
115
- if (await structParent.hasTitle()) {
116
- printState.str += '. Title: ' + (await structParent.getTitle());
117
- }
118
- printState.str += ', Obj#: ' + (await (await structParent.getSDFObj()).getObjNum());
119
- }
120
- }
121
- }
122
- };
123
-
124
- // Used in code snippet 3.
125
- const processElements2 = async(reader, mcidPageMap) => {
126
- let element;
127
- while (element = await reader.next()) { // Read page contents
128
- // In this sample we process only text, but the code can be extended
129
- // to handle paths, images, or any other Element type.
130
- const mcid = await element.getStructMCID();
131
- if (mcid >= 0 && (await element.getType()) === PDFNet.Element.Type.e_text) {
132
- const val = await element.getTextString();
133
- if (mcid in mcidPageMap) {
134
- mcidPageMap[mcid] += val;
135
- } else {
136
- mcidPageMap[mcid] = val;
137
- }
138
- }
139
- }
140
- };
141
-
142
- // Used in code snippet 3.
143
- const processStructElement2 = async(element, mcidDocMap, indent, printState) => {
144
- if (!(await element.isValid())) {
145
- return;
146
- }
147
-
148
- // Print out the type and title info, if any.
149
- printAndIndent(printState, indent);
150
- printState.str += '<' + (await element.getType());
151
- if (await element.hasTitle()) {
152
- printState.str += ' title="' + (await element.getTitle()) + '"';
153
- }
154
- printState.str += '>';
155
-
156
- const num = await element.getNumKids();
157
- for (let i = 0; i < num; ++i) {
158
- if (await element.isContentItem(i)) {
159
- const cont = await element.getAsContentItem(i);
160
- if ((await cont.getType()) === PDFNet.ContentItem.Type.e_MCID) {
161
- const pageNum = await (await cont.getPage()).getIndex();
162
- const mcidPageMap = mcidDocMap[pageNum];
163
- if (mcidPageMap) {
164
- const mcid = await cont.getMCID();
165
- if (mcid in mcidPageMap) {
166
- printState.str += mcidPageMap[mcid];
167
- }
168
- }
169
- }
170
- } else { // the kid is another StructElement node.
171
- await processStructElement2(await element.getAsStructElem(i), mcidDocMap, indent + 1, printState);
172
- }
173
- }
174
-
175
- printAndIndent(printState, indent);
176
- printState.str += '</' + (await element.getType()) + '>';
177
- };
178
-
179
- const main = async() => {
180
- // Relative path to the folder containing test files.
181
- const inputPath = '../TestFiles/';
182
- const printState = { str: '' };
183
- try { // Extract logical structure from a PDF document
184
- const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'tagged.pdf');
185
- doc.initSecurityHandler();
186
-
187
- let reader = null;
188
- let tree = null;
189
-
190
- console.log('____________________________________________________________');
191
- console.log('Sample 1 - Traverse logical structure tree...');
192
- tree = await doc.getStructTree();
193
- if (await tree.isValid()) {
194
- console.log('Document has a StructTree root.');
195
- for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
196
- // Recursively get structure info for all child elements.
197
- await processStructElement(await tree.getKid(i), 0, printState);
198
- }
199
- } else {
200
- console.log('This document does not contain any logical structure.');
201
- }
202
- printAndIndent(printState, 0);
203
- console.log('Done 1.');
204
-
205
- console.log('____________________________________________________________');
206
- console.log('Sample 2 - Get parent logical structure elements from');
207
- console.log('layout elements.');
208
- reader = await PDFNet.ElementReader.create();
209
- for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
210
- reader.beginOnPage(await itr.current());
211
- await processElements(reader, printState);
212
- reader.end();
213
- }
214
- printAndIndent(printState, 0);
215
- console.log('Done 2.');
216
-
217
- console.log('____________________________________________________________');
218
- console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
219
- {
220
- const mcidDocMap = {};
221
- for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
222
- const page = await itr.current();
223
- reader.beginOnPage(page);
224
- const pageNum = await page.getIndex();
225
- const pageMcidMap = {};
226
- mcidDocMap[pageNum] = pageMcidMap;
227
- await processElements2(reader, pageMcidMap);
228
- reader.end();
229
- }
230
-
231
- tree = await doc.getStructTree();
232
- if (await tree.isValid()) {
233
- for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
234
- await processStructElement2(await tree.getKid(i), mcidDocMap, 0, printState);
235
- }
236
- }
237
- }
238
- printAndIndent(printState, 0);
239
- console.log('Done 3.');
240
- await doc.save(inputPath + 'Output/LogicalStructure.pdf', 0);
241
- } catch (err) {
242
- console.log(err);
243
- }
244
- };
245
-
246
- PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error){console.log('Error: ' + JSON.stringify(error));}).then(function(){return PDFNet.shutdown();});
247
- };
248
- exports.runLogicalStructureTest();
249
- })(exports);
250
- // eslint-disable-next-line spaced-comment
1
+ //---------------------------------------------------------------------------------------
2
+ // Copyright (c) 2001-2022 by PDFTron Systems Inc. All Rights Reserved.
3
+ // Consult legal.txt regarding legal and license information.
4
+ //---------------------------------------------------------------------------------------
5
+
6
+ //---------------------------------------------------------------------------------------
7
+ // This sample explores the structure and content of a tagged PDF document and dumps
8
+ // the structure information to the console window.
9
+ //
10
+ // In tagged PDF documents StructTree acts as a central repository for information
11
+ // related to a PDF document's logical structure. The tree consists of StructElement-s
12
+ // and ContentItem-s which are leaf nodes of the structure tree.
13
+ //
14
+ // The sample can be extended to access and extract the marked-content elements such
15
+ // as text and images.
16
+ //---------------------------------------------------------------------------------------
17
+
18
+
19
+ const { PDFNet } = require('@pdftron/pdfnet-node');
20
+ const PDFTronLicense = require('../LicenseKey/LicenseKey');
21
+
22
+ ((exports) => {
23
+
24
+ exports.runLogicalStructureTest = () => {
25
+
26
+ const printAndIndent = (printState, indent) => {
27
+ console.log(printState.str);
28
+
29
+ let indentStr = '';
30
+ for (let i = 0; i < indent; ++i) {
31
+ indentStr += ' ';
32
+ }
33
+ printState.str = indentStr;
34
+ };
35
+
36
+ // Used in code snippet 1.
37
+ const processStructElement = async(element, indent, printState) => {
38
+ if (!(await element.isValid())) {
39
+ return;
40
+ }
41
+
42
+
43
+ // Print out the type and title info, if any.
44
+ printAndIndent(printState, indent++);
45
+ printState.str += 'Type: ' + (await element.getType());
46
+ if (await element.hasTitle()) {
47
+ printState.str += '. Title: ' + (await element.getTitle());
48
+ }
49
+
50
+ const num = await element.getNumKids();
51
+ for (let i = 0; i < num; ++i) {
52
+ // Check is the kid is a leaf node (i.e. it is a ContentItem).
53
+ if (await element.isContentItem(i)) {
54
+ const cont = await element.getAsContentItem(i);
55
+ const type = await cont.getType();
56
+
57
+ const page = await cont.getPage();
58
+
59
+ printAndIndent(printState, indent);
60
+ printState.str += 'Content Item. Part of page #' + (await page.getIndex());
61
+
62
+ printAndIndent(printState, indent);
63
+ switch (type) {
64
+ case PDFNet.ContentItem.Type.e_MCID:
65
+ case PDFNet.ContentItem.Type.e_MCR:
66
+ printState.str += 'MCID: ' + (await cont.getMCID());
67
+ break;
68
+ case PDFNet.ContentItem.Type.e_OBJR:
69
+ {
70
+ printState.str += 'OBJR ';
71
+ const refObj = await cont.getRefObj();
72
+ if (refObj) {
73
+ printState.str += '- Referenced Object#: ' + refObj.getObjNum();
74
+ }
75
+ }
76
+ break;
77
+ default:
78
+ break;
79
+ }
80
+ } else { // the kid is another StructElement node.
81
+ await processStructElement(await element.getAsStructElem(i), indent, printState);
82
+ }
83
+ }
84
+ };
85
+
86
+ // Used in code snippet 2.
87
+ const processElements = async(reader, printState) => {
88
+ let element;
89
+ while (element = await reader.next()) { // Read page contents
90
+ // In this sample we process only paths & text, but the code can be
91
+ // extended to handle any element type.
92
+ const type = await element.getType();
93
+ if (type === PDFNet.Element.Type.e_path || type === PDFNet.Element.Type.e_text || type === PDFNet.Element.Type.e_path) {
94
+ switch (type) {
95
+ case PDFNet.Element.Type.e_path: // Process path ...
96
+ printState.str += '\nPATH: ';
97
+ break;
98
+ case PDFNet.Element.Type.e_text: // Process text ...
99
+ printState.str += '\nTEXT: ' + (await element.getTextString()) + '\n';
100
+ break;
101
+ case PDFNet.Element.Type.e_form: // Process form XObjects
102
+ printState.str += '\nFORM XObject: ';
103
+ // reader.formBegin();
104
+ // await ProcessElements(reader);
105
+ // reader.end();
106
+ break;
107
+ }
108
+
109
+ // Check if the element is associated with any structural element.
110
+ // Content items are leaf nodes of the structure tree.
111
+ const structParent = await element.getParentStructElement();
112
+ if (await structParent.isValid()) {
113
+ // Print out the parent structural element's type, title, and object number.
114
+ printState.str += ' Type: ' + (await structParent.getType()) + ', MCID: ' + (await element.getStructMCID());
115
+ if (await structParent.hasTitle()) {
116
+ printState.str += '. Title: ' + (await structParent.getTitle());
117
+ }
118
+ printState.str += ', Obj#: ' + (await (await structParent.getSDFObj()).getObjNum());
119
+ }
120
+ }
121
+ }
122
+ };
123
+
124
+ // Used in code snippet 3.
125
+ const processElements2 = async(reader, mcidPageMap) => {
126
+ let element;
127
+ while (element = await reader.next()) { // Read page contents
128
+ // In this sample we process only text, but the code can be extended
129
+ // to handle paths, images, or any other Element type.
130
+ const mcid = await element.getStructMCID();
131
+ if (mcid >= 0 && (await element.getType()) === PDFNet.Element.Type.e_text) {
132
+ const val = await element.getTextString();
133
+ if (mcid in mcidPageMap) {
134
+ mcidPageMap[mcid] += val;
135
+ } else {
136
+ mcidPageMap[mcid] = val;
137
+ }
138
+ }
139
+ }
140
+ };
141
+
142
+ // Used in code snippet 3.
143
+ const processStructElement2 = async(element, mcidDocMap, indent, printState) => {
144
+ if (!(await element.isValid())) {
145
+ return;
146
+ }
147
+
148
+ // Print out the type and title info, if any.
149
+ printAndIndent(printState, indent);
150
+ printState.str += '<' + (await element.getType());
151
+ if (await element.hasTitle()) {
152
+ printState.str += ' title="' + (await element.getTitle()) + '"';
153
+ }
154
+ printState.str += '>';
155
+
156
+ const num = await element.getNumKids();
157
+ for (let i = 0; i < num; ++i) {
158
+ if (await element.isContentItem(i)) {
159
+ const cont = await element.getAsContentItem(i);
160
+ if ((await cont.getType()) === PDFNet.ContentItem.Type.e_MCID) {
161
+ const pageNum = await (await cont.getPage()).getIndex();
162
+ const mcidPageMap = mcidDocMap[pageNum];
163
+ if (mcidPageMap) {
164
+ const mcid = await cont.getMCID();
165
+ if (mcid in mcidPageMap) {
166
+ printState.str += mcidPageMap[mcid];
167
+ }
168
+ }
169
+ }
170
+ } else { // the kid is another StructElement node.
171
+ await processStructElement2(await element.getAsStructElem(i), mcidDocMap, indent + 1, printState);
172
+ }
173
+ }
174
+
175
+ printAndIndent(printState, indent);
176
+ printState.str += '</' + (await element.getType()) + '>';
177
+ };
178
+
179
+ const main = async() => {
180
+ // Relative path to the folder containing test files.
181
+ const inputPath = '../TestFiles/';
182
+ const printState = { str: '' };
183
+ try { // Extract logical structure from a PDF document
184
+ const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'tagged.pdf');
185
+ doc.initSecurityHandler();
186
+
187
+ let reader = null;
188
+ let tree = null;
189
+
190
+ console.log('____________________________________________________________');
191
+ console.log('Sample 1 - Traverse logical structure tree...');
192
+ tree = await doc.getStructTree();
193
+ if (await tree.isValid()) {
194
+ console.log('Document has a StructTree root.');
195
+ for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
196
+ // Recursively get structure info for all child elements.
197
+ await processStructElement(await tree.getKid(i), 0, printState);
198
+ }
199
+ } else {
200
+ console.log('This document does not contain any logical structure.');
201
+ }
202
+ printAndIndent(printState, 0);
203
+ console.log('Done 1.');
204
+
205
+ console.log('____________________________________________________________');
206
+ console.log('Sample 2 - Get parent logical structure elements from');
207
+ console.log('layout elements.');
208
+ reader = await PDFNet.ElementReader.create();
209
+ for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
210
+ reader.beginOnPage(await itr.current());
211
+ await processElements(reader, printState);
212
+ reader.end();
213
+ }
214
+ printAndIndent(printState, 0);
215
+ console.log('Done 2.');
216
+
217
+ console.log('____________________________________________________________');
218
+ console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
219
+ {
220
+ const mcidDocMap = {};
221
+ for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
222
+ const page = await itr.current();
223
+ reader.beginOnPage(page);
224
+ const pageNum = await page.getIndex();
225
+ const pageMcidMap = {};
226
+ mcidDocMap[pageNum] = pageMcidMap;
227
+ await processElements2(reader, pageMcidMap);
228
+ reader.end();
229
+ }
230
+
231
+ tree = await doc.getStructTree();
232
+ if (await tree.isValid()) {
233
+ for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
234
+ await processStructElement2(await tree.getKid(i), mcidDocMap, 0, printState);
235
+ }
236
+ }
237
+ }
238
+ printAndIndent(printState, 0);
239
+ console.log('Done 3.');
240
+ await doc.save(inputPath + 'Output/LogicalStructure.pdf', 0);
241
+ } catch (err) {
242
+ console.log(err);
243
+ }
244
+ };
245
+
246
+ PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error){console.log('Error: ' + JSON.stringify(error));}).then(function(){return PDFNet.shutdown();});
247
+ };
248
+ exports.runLogicalStructureTest();
249
+ })(exports);
250
+ // eslint-disable-next-line spaced-comment
251
251
  //# sourceURL=LogicalStructureTest.js
@@ -1,5 +1,5 @@
1
1
  //---------------------------------------------------------------------------------------
2
- // Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
2
+ // Copyright (c) 2001-2022 by PDFTron Systems Inc. All Rights Reserved.
3
3
  // Consult legal.txt regarding legal and license information.
4
4
  //---------------------------------------------------------------------------------------
5
5
 
@@ -18,6 +18,8 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
18
18
  try {
19
19
 
20
20
  PDFNet.addResourceSearchPath('../../lib/');
21
+
22
+ const useIRIS = await PDFNet.OCRModule.isIRISModuleAvailable();
21
23
  if (!(await PDFNet.OCRModule.isModuleAvailable())) {
22
24
  console.log('\nUnable to run OCRTest: PDFTron SDK OCR module not available.');
23
25
  console.log('---------------------------------------------------------------');
@@ -39,10 +41,14 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
39
41
 
40
42
  // A) Setup empty destination doc
41
43
  const doc = await PDFNet.PDFDoc.create();
44
+
42
45
  await doc.initSecurityHandler();
43
46
 
47
+ const opts = new PDFNet.OCRModule.OCROptions();
48
+ if(useIRIS) opts.setOCREngine('iris');
49
+
44
50
  // B) Run OCR on the .png with options
45
- await PDFNet.OCRModule.imageToPDF(doc, input_path + 'psychomachia_excerpt.png');
51
+ await PDFNet.OCRModule.imageToPDF(doc, input_path + 'psychomachia_excerpt.png', opts);
46
52
 
47
53
  // C) check the result
48
54
  await doc.save(output_path + 'psychomachia_excerpt.pdf', 0);
@@ -62,6 +68,7 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
62
68
 
63
69
  // B) Setup options with multiple target languages, English will always be considered as secondary language
64
70
  const opts = new PDFNet.OCRModule.OCROptions();
71
+ if(useIRIS) opts.setOCREngine('iris');
65
72
  opts.addLang('rus');
66
73
  opts.addLang('deu');
67
74
 
@@ -85,6 +92,7 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
85
92
 
86
93
  // B) Setup options with a single language and an ignore zone
87
94
  const opts = new PDFNet.OCRModule.OCROptions();
95
+ if(useIRIS) opts.setOCREngine('iris');
88
96
  opts.addLang('deu');
89
97
 
90
98
  const ignore_zones = [];
@@ -111,6 +119,7 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
111
119
 
112
120
  // B) Setup options with a single language plus text/ignore zones
113
121
  const opts = new PDFNet.OCRModule.OCROptions();
122
+ if(useIRIS) opts.setOCREngine('iris');
114
123
  opts.addLang('eng');
115
124
 
116
125
  var ignore_zones = [];
@@ -163,8 +172,11 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
163
172
  const doc = await PDFNet.PDFDoc.createFromFilePath(input_path + 'zero_value_test_no_text.pdf');
164
173
  await doc.initSecurityHandler();
165
174
 
175
+ const opts = new PDFNet.OCRModule.OCROptions();
176
+ if(useIRIS) opts.setOCREngine('iris');
177
+
166
178
  // B) Run OCR on the .pdf with default English language
167
- const json = await PDFNet.OCRModule.getOCRJsonFromPDF(doc);
179
+ const json = await PDFNet.OCRModule.getOCRJsonFromPDF(doc, opts);
168
180
 
169
181
  // C) Post-processing step (whatever it might be)
170
182
  console.log('Have OCR result JSON, re-applying to PDF ');
@@ -188,9 +200,12 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
188
200
  const doc = await PDFNet.PDFDoc.create();
189
201
  await doc.initSecurityHandler();
190
202
 
203
+ const opts = new PDFNet.OCRModule.OCROptions();
204
+ if(useIRIS) opts.setOCREngine('iris');
205
+
191
206
  // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
192
207
  // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
193
- const xml = await PDFNet.OCRModule.getOCRXmlFromImage(doc, input_path + 'physics.tif');
208
+ const xml = await PDFNet.OCRModule.getOCRXmlFromImage(doc, input_path + 'physics.tif', opts);
194
209
 
195
210
  // C) Post-processing step (whatever it might be)
196
211
  console.log('Have OCR result XML, re-applying to PDF');
@@ -205,36 +220,6 @@ const PDFTronLicense = require('../LicenseKey/LicenseKey');
205
220
  } catch (err) {
206
221
  console.log(err);
207
222
  }
208
-
209
-
210
- //--------------------------------------------------------------------------------
211
- // Example 7) Resolution can be manually set, when DPI missing from metadata or is wrong
212
- try {
213
- // A) Setup empty destination doc
214
- const doc = await PDFNet.PDFDoc.create();
215
- await doc.initSecurityHandler();
216
-
217
- // B) Setup options with a text zone
218
- const opts = new PDFNet.OCRModule.OCROptions();
219
-
220
- const text_zones = [];
221
- text_zones.push(new PDFNet.Rect(140, 870, 310, 920));
222
- opts.addTextZonesForPage(text_zones, 1);
223
-
224
- // C) Manually override DPI
225
- opts.addDPI(100)
226
-
227
- // D) Run OCR on the .jpg with options
228
- await PDFNet.OCRModule.imageToPDF(doc, input_path + 'corrupted_dpi.jpg', opts);
229
-
230
- // E) check the result
231
- await doc.save(output_path + 'corrupted_dpi.pdf', 0);
232
-
233
- console.log('Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text');
234
- } catch (err) {
235
- console.log(err);
236
- }
237
-
238
223
  console.log('Done.');
239
224
  } catch (err) {
240
225
  console.log(err);