@pdftron/data-extraction 10.1.1 → 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -4
- package/readme.md +50 -21
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pdftron/data-extraction",
|
|
3
|
-
"version": "10.1.1",
|
|
3
|
+
"version": "10.2.0",
|
|
4
4
|
"main": "./lib/main.js",
|
|
5
5
|
"binary": {
|
|
6
6
|
"module_name": "ApryseIDP",
|
|
7
7
|
"module_path": "./lib",
|
|
8
8
|
"remote_path": "./downloads/PDFNetNode/Data-Extraction",
|
|
9
|
-
"package_name": "data-extraction-{platform}-{arch}.tar.gz",
|
|
9
|
+
"package_name": "data-extraction-{version}-{platform}-{arch}.tar.gz",
|
|
10
10
|
"host": "https://www.pdftron.com"
|
|
11
11
|
},
|
|
12
12
|
"scripts": {
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
},
|
|
18
18
|
"description": "The Apryse SDK Data Extraction Module.",
|
|
19
19
|
"author": "Apryse Software Inc.",
|
|
20
|
-
"license": "
|
|
20
|
+
"license": "Commercial",
|
|
21
21
|
"homepage": "https://www.apryse.com",
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@pdftron/pdfnet-node": "10.
|
|
23
|
+
"@pdftron/pdfnet-node": "~10.2.0",
|
|
24
24
|
"@mapbox/node-pre-gyp": "^1.0.3"
|
|
25
25
|
},
|
|
26
26
|
"keywords": [
|
|
@@ -30,3 +30,4 @@
|
|
|
30
30
|
"Data Extraction"
|
|
31
31
|
]
|
|
32
32
|
}
|
|
33
|
+
|
package/readme.md
CHANGED
|
@@ -1,38 +1,67 @@
|
|
|
1
|
-
## @pdftron/
|
|
1
|
+
## @pdftron/data-extraction
|
|
2
2
|
|
|
3
|
-
This package
|
|
3
|
+
This package is meant to be used in conjunction with @pdftron/pdfnet-node to support IDP data extraction from Apryse. Follow this guide for more info on usage.
|
|
4
|
+
https://docs.apryse.com/documentation/core/guides/intelligent-data-extraction/
|
|
5
|
+
|
|
6
|
+
For further reading, check out our blog post on the project.
|
|
7
|
+
https://apryse.com/blog/introducing-automated-data-extraction-pdf-idp
|
|
4
8
|
|
|
5
9
|
#### Supported platform, Node.js, and Electron versions
|
|
6
10
|
This package depends on unmanaged add-on binaries, and the add-on binaries are not cross-platform. At the moment we have support for
|
|
7
|
-
* **OS**: Linux (excluding Alpine), Windows(x64)
|
|
11
|
+
* **OS**: Linux (excluding Alpine), Windows(x64)
|
|
8
12
|
* **Node.js version**: 8 - 18
|
|
9
13
|
* **Electron version**: 6 - 19
|
|
10
14
|
|
|
11
15
|
Installation will fail if your OS, Node.js or Electron version is not supported.
|
|
12
16
|
|
|
13
|
-
To install for Electron, *runtime* and *target* options are needed. For example, For Electron 6, we need to run *npm i @pdftron/pdfnet-node --runtime=electron --target=6.0.0*. Note that we need to use *6.0.0* for all Electron 6 versions.
|
|
14
|
-
|
|
15
17
|
#### Usage
|
|
16
|
-
|
|
18
|
+
|
|
19
|
+
Add the `@pdftron/data-extraction` package as a dependency in your `package.json`.
|
|
20
|
+
|
|
21
|
+
Inside your @pdftron/pdfnet-node code, after initialization, include the following line:
|
|
22
|
+
|
|
23
|
+
```javascript
|
|
24
|
+
await PDFNet.addResourceSearchPath("./node_modules/@pdftron/data-extraction/lib")
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Here is an example of data extraction being used with this line.
|
|
28
|
+
|
|
17
29
|
```javascript
|
|
18
|
-
const { PDFNet } = require('@pdftron/pdfnet-node');
|
|
19
|
-
|
|
20
|
-
const
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
30
|
+
const { PDFNet } = require('@pdftron/pdfnet-node');
|
|
31
|
+
const licenseKey = "Insert license key here"
|
|
32
|
+
const inputFile = "Insert input file location here"
|
|
33
|
+
|
|
34
|
+
async function main() {
|
|
35
|
+
// This is where we import data-extraction
|
|
36
|
+
await PDFNet.addResourceSearchPath("./node_modules/@pdftron/data-extraction/lib")
|
|
37
|
+
|
|
38
|
+
// Extract document structure as a JSON file
|
|
39
|
+
console.log('Extract document structure as a JSON file');
|
|
40
|
+
|
|
41
|
+
let outputFile = 'out/paragraphs_and_tables.json';
|
|
42
|
+
await PDFNet.DataExtractionModule.extractData(inputFile, outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
|
|
43
|
+
|
|
44
|
+
console.log('Result saved in ' + outputFile);
|
|
45
|
+
|
|
46
|
+
///////////////////////////////////////////////////////
|
|
47
|
+
// Extract document structure as a JSON string
|
|
48
|
+
console.log('Extract document structure as a JSON string');
|
|
49
|
+
|
|
50
|
+
outputFile = 'out/tagged.json';
|
|
51
|
+
const json = await PDFNet.DataExtractionModule.extractDataAsString(inputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
|
|
52
|
+
|
|
53
|
+
fs.writeFileSync(outputFile, json);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
PDFNet.runWithCleanup(main, licenseKey).catch(function (error) {
|
|
57
|
+
console.log('Error: ' + JSON.stringify(error));
|
|
58
|
+
}).then(function () { return PDFNet.shutdown(); });;
|
|
59
|
+
|
|
31
60
|
```
|
|
32
61
|
|
|
33
|
-
|
|
62
|
+
A larger code sample can be found [here](https://docs.apryse.com/documentation/samples/node/js/DataExtractionTest/).
|
|
34
63
|
|
|
35
64
|
To get started please see the documentation at https://www.pdftron.com/documentation/nodejs/get-started/integration.
|
|
36
65
|
|
|
37
66
|
#### Licensing
|
|
38
|
-
Please go to https://
|
|
67
|
+
Please go to https://docs.apryse.com/documentation/core/info/license/ to obtain a demo or production license.
|