npm - @pdftron/data-extraction - Versions diffs - 10.1.1 → 10.2.0 - Mend

@pdftron/data-extraction 10.1.1 → 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/package.json +5 -4
package/readme.md +50 -21

package/package.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "@pdftron/data-extraction",
-  "version": "10.1.1",
+  "version": "10.2.0",
   "main": "./lib/main.js",
   "binary": {
     "module_name": "ApryseIDP",
     "module_path": "./lib",
     "remote_path": "./downloads/PDFNetNode/Data-Extraction",
-    "package_name": "data-extraction-{platform}-{arch}.tar.gz",
+    "package_name": "data-extraction-{version}-{platform}-{arch}.tar.gz",
     "host": "https://www.pdftron.com"
   },
   "scripts": {
@@ -17,10 +17,10 @@
   },
   "description": "The Apryse SDK Data Extraction Module.",
   "author": "Apryse Software Inc.",
-  "license": "SEE LICENSE IN license.pdf",
+  "license": "Commercial",
   "homepage": "https://www.apryse.com",
   "dependencies": {
-    "@pdftron/pdfnet-node": "10.1.1",
+    "@pdftron/pdfnet-node": "~10.2.0",
     "@mapbox/node-pre-gyp": "^1.0.3"
   },
   "keywords": [
@@ -30,3 +30,4 @@
     "Data Extraction"
   ]
 }

package/readme.md CHANGED Viewed

@@ -1,38 +1,67 @@
-## @pdftron/pdfnet-node
+## @pdftron/data-extraction
-This package leverages the full power of PDFTron's native SDK for maximal performance and accuracy. In order to maintain consistency across platforms the Javascript API is used in the same manner as the PDFNet API available in PDFTron's Web platform. Since access to the filesystem is included in Node.js/Electron some additional APIs requiring filesystem access have also been included.
+This package is meant to be used in conjunction with @pdftron/pdfnet-node to support IDP data extraction from Apryse. Follow this guide for more info on usage.
+https://docs.apryse.com/documentation/core/guides/intelligent-data-extraction/
+For further reading, check out our blog post on the project.
+https://apryse.com/blog/introducing-automated-data-extraction-pdf-idp
 #### Supported platform, Node.js, and Electron versions
 This package depends on unmanaged add-on binaries, and the add-on binaries are not cross-platform. At the moment we have support for
-  * **OS**: Linux (excluding Alpine), Windows(x64), Mac
+  * **OS**: Linux (excluding Alpine), Windows(x64)
   * **Node.js version**: 8 - 18
   * **Electron version**: 6 - 19
 Installation will fail if your OS, Node.js or Electron version is not supported.
-To install for Electron, *runtime* and *target* options are needed. For example, For Electron 6, we need to run  *npm i @pdftron/pdfnet-node --runtime=electron --target=6.0.0*. Note that we need to use *6.0.0* for all Electron 6 versions.
 #### Usage
-Here is a code snippet to demonstrate how to use this package.
+Add the `@pdftron/data-extraction` package as a dependency in your `package.json`.
+In your @pdftron/pdfnet-node code, after initialization, include the following line:
+```javascript
+await PDFNet.addResourceSearchPath("./node_modules/@pdftron/data-extraction/lib")
+```
+Here is an example of data extraction being used with this line.
 ```javascript
-const { PDFNet } = require('@pdftron/pdfnet-node');  // you may need to set up NODE_PATH environment variable to make this work.
-const main = async() => {
-  const doc = await PDFNet.PDFDoc.create();
-  const page = await doc.pageCreate();
-  doc.pagePushBack(page);
-  doc.save('blank.pdf', PDFNet.SDFDoc.SaveOptions.e_linearized);
-};
-// add your own license key as the second parameter, e.g. in place of 'YOUR_LICENSE_KEY'.
-PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY').catch(function(error) {
-  console.log('Error: ' + JSON.stringify(error));
-}).then(function(){ return PDFNet.shutdown(); });
+const { PDFNet } = require('@pdftron/pdfnet-node');
+const licenseKey = "Insert license key here"
+const inputFile = "Insert input file location here"
+async function main() {
+        // This is where we import data-extraction
+        await PDFNet.addResourceSearchPath("./node_modules/@pdftron/data-extraction/lib")
+        // Extract document structure as a JSON file
+        console.log('Extract document structure as a JSON file');
+        let outputFile = 'out/paragraphs_and_tables.json';
+        await PDFNet.DataExtractionModule.extractData(inputFile, outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
+        console.log('Result saved in ' + outputFile);
+        ///////////////////////////////////////////////////////
+        // Extract document structure as a JSON string
+        console.log('Extract document structure as a JSON string');
+        outputFile = 'out/tagged.json';
+        const json = await PDFNet.DataExtractionModule.extractDataAsString(inputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
+        fs.writeFileSync(outputFile, json);
+}
+PDFNet.runWithCleanup(main, licenseKey).catch(function (error) {
+    console.log('Error: ' + JSON.stringify(error));
+}).then(function () { return PDFNet.shutdown(); });;
 ```
-There are some code samples in the [@pdftron/pdfnet-node-samples](https://www.npmjs.com/package/@pdftron/pdfnet-node-samples) package.
+A larger code sample can be found [here](https://docs.apryse.com/documentation/samples/node/js/DataExtractionTest/).
 To get started please see the documentation at https://www.pdftron.com/documentation/nodejs/get-started/integration.
 #### Licensing
-Please go to https://www.pdftron.com/pws/get-key to obtain a demo license or https://www.pdftron.com/form/contact-sales to obtain a production key. For further information, please visit https://www.pdftron.com/licensing.
+Please go to https://docs.apryse.com/documentation/core/info/license/ to obtain a demo or production license.