n8n-nodes-lite-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -0
- package/dist/index.js +6 -0
- package/package.json +39 -0
- package/src/index.ts +1 -0
- package/src/nodes/LiteParse/LiteParse.node.ts +275 -0
- package/src/nodes/LiteParse/liteparse.svg +7 -0
- package/tsconfig.json +20 -0
package/README.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# n8n-nodes-lite-parser
|
|
2
|
+
|
|
3
|
+
An n8n community node for [LiteParse](https://github.com/run-llama/liteparse) — fast, local document parsing with Markdown and JSON output.
|
|
4
|
+
|
|
5
|
+
## What It Does
|
|
6
|
+
|
|
7
|
+
Parse PDFs, Office documents (DOCX, XLSX, PPTX), and images into structured Markdown, plain text, or JSON — all locally, no API keys, no cloud.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Markdown output** — headings, tables, lists, links preserved
|
|
12
|
+
- **JSON output** — structured data with bounding boxes and spatial coordinates
|
|
13
|
+
- **OCR** — built-in Tesseract for scanned documents (100+ languages)
|
|
14
|
+
- **Encrypted PDFs** — password-protected PDF support
|
|
15
|
+
- **Page targeting** — parse specific pages only
|
|
16
|
+
- **Lightweight** — Rust core, no GPU, no PyTorch, minimal system load
|
|
17
|
+
|
|
18
|
+
## Supported Formats
|
|
19
|
+
|
|
20
|
+
**Native:** PDF
|
|
21
|
+
**Via LibreOffice:** DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, RTF, Pages, Numbers, Keynote
|
|
22
|
+
**Via ImageMagick:** JPG, PNG, GIF, BMP, TIFF, WebP, SVG
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
In n8n:
|
|
27
|
+
1. Go to **Settings** > **Community Nodes**
|
|
28
|
+
2. Enter `n8n-nodes-lite-parser`
|
|
29
|
+
3. Click **Install**
|
|
30
|
+
|
|
31
|
+
Or manually:
|
|
32
|
+
```bash
|
|
33
|
+
cd ~/.n8n/nodes
|
|
34
|
+
npm install n8n-nodes-lite-parser
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Node Parameters
|
|
38
|
+
|
|
39
|
+
| Parameter | Default | Description |
|
|
40
|
+
|---|---|---|
|
|
41
|
+
| Input Binary Field | `data` | Binary property containing the document |
|
|
42
|
+
| Output Format | `markdown` | `markdown` / `text` / `json` |
|
|
43
|
+
| OCR Enabled | `true` | Run OCR on scanned docs |
|
|
44
|
+
| OCR Language | `eng` | Tesseract language code |
|
|
45
|
+
| Password | — | For encrypted PDFs |
|
|
46
|
+
| Target Pages | — | e.g. `1-5,10` |
|
|
47
|
+
| DPI | `150` | Rendering resolution |
|
|
48
|
+
|
|
49
|
+
## Example Workflow
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
[Read Binary File / HTTP Request] → [LiteParse] → [Code Node / AI Agent]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## License
|
|
56
|
+
|
|
57
|
+
MIT
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.LiteParseNode = void 0;
|
|
4
|
+
var LiteParse_node_1 = require("./nodes/LiteParse/LiteParse.node");
|
|
5
|
+
Object.defineProperty(exports, "LiteParseNode", { enumerable: true, get: function () { return LiteParse_node_1.LiteParseNode; } });
|
|
6
|
+
//# sourceMappingURL=index.js.map
|
package/package.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "n8n-nodes-lite-parser",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "n8n community node for LiteParse — fast, local document parsing (PDF, Office, images) with Markdown/JSON output",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"scripts": {
|
|
7
|
+
"build": "tsc",
|
|
8
|
+
"dev": "tsc --watch",
|
|
9
|
+
"prepublishOnly": "npm run build"
|
|
10
|
+
},
|
|
11
|
+
"keywords": [
|
|
12
|
+
"n8n-community-node-package",
|
|
13
|
+
"n8n",
|
|
14
|
+
"pdf",
|
|
15
|
+
"parser",
|
|
16
|
+
"ocr",
|
|
17
|
+
"markdown",
|
|
18
|
+
"liteparse",
|
|
19
|
+
"document-parsing"
|
|
20
|
+
],
|
|
21
|
+
"author": "",
|
|
22
|
+
"license": "MIT",
|
|
23
|
+
"devDependencies": {
|
|
24
|
+
"@types/node": "^20.0.0",
|
|
25
|
+
"typescript": "^5.0.0"
|
|
26
|
+
},
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"@llamaindex/liteparse": "^2.1.2"
|
|
29
|
+
},
|
|
30
|
+
"peerDependencies": {
|
|
31
|
+
"n8n-workflow": "*"
|
|
32
|
+
},
|
|
33
|
+
"n8n": {
|
|
34
|
+
"n8nNodesApiVersion": 1,
|
|
35
|
+
"nodes": [
|
|
36
|
+
"dist/nodes/LiteParse/LiteParse.node.js"
|
|
37
|
+
]
|
|
38
|
+
}
|
|
39
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Source of the package entry point (compiled to dist/index.js, the "main" in package.json):
// re-exports the node class so it can also be imported from the package root.
export { LiteParseNode } from './nodes/LiteParse/LiteParse.node';
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
import {
|
|
2
|
+
IExecuteFunctions,
|
|
3
|
+
INodeExecutionData,
|
|
4
|
+
INodeType,
|
|
5
|
+
INodeTypeDescription,
|
|
6
|
+
NodeConnectionTypes,
|
|
7
|
+
} from 'n8n-workflow';
|
|
8
|
+
|
|
9
|
+
/** Optional settings exposed under the node's "Options" collection. */
interface LiteParseNodeOptions {
  imageMode?: string;
  extractLinks?: boolean;
  maxPages?: number;
  numWorkers?: number;
  quiet?: boolean;
}

/** Per-item parameter values read from the node UI. */
interface LiteParseNodeSettings {
  outputFormat: 'markdown' | 'text' | 'json';
  ocrEnabled: boolean;
  ocrLanguage: string;
  password: string;
  targetPages: string;
  dpi: number;
  options: LiteParseNodeOptions;
}

/**
 * Builds the configuration object handed to the LiteParse constructor.
 *
 * `outputFormat`, `ocrEnabled` and `dpi` are always set. Every other key is
 * only added when the user supplied a meaningful value, so that the parser's
 * own defaults apply otherwise. Markdown-only settings (`imageMode`,
 * `extractLinks`) are dropped for the other output formats.
 *
 * @param settings - Parameter values collected for one input item.
 * @returns A plain object with the parser options.
 */
// The value type stays loose (`any`) so the object can be passed straight to
// the dynamically imported LiteParse constructor, as the original code did.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function buildParserConfig(settings: LiteParseNodeSettings): Record<string, any> {
  const { outputFormat, ocrEnabled, ocrLanguage, password, targetPages, dpi, options } = settings;
  const isMarkdown = outputFormat === 'markdown';

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const config: Record<string, any> = { outputFormat, ocrEnabled, dpi };

  if (ocrEnabled && ocrLanguage) {
    config.ocrLanguage = ocrLanguage;
  }
  if (password) {
    config.password = password;
  }
  if (targetPages) {
    config.targetPages = targetPages;
  }
  if (isMarkdown && options.imageMode) {
    config.imageMode = options.imageMode;
  }
  if (isMarkdown && options.extractLinks !== undefined) {
    config.extractLinks = options.extractLinks;
  }
  // Truthiness checks are intentional: 0 is not forwarded for either setting.
  if (options.maxPages) {
    config.maxPages = options.maxPages;
  }
  if (options.numWorkers) {
    config.numWorkers = options.numWorkers;
  }
  if (options.quiet !== undefined) {
    config.quiet = options.quiet;
  }

  return config;
}

/**
 * Extracts a printable message from anything that may have been thrown.
 *
 * @param error - The caught value; not guaranteed to be an `Error`.
 * @returns The `message` of error-like values, otherwise `String(error)`.
 */
function toErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'object' && error !== null) {
    const { message } = error as { message?: unknown };
    if (typeof message === 'string') {
      return message;
    }
  }
  return String(error);
}

/**
 * n8n node that parses a binary document with LiteParse and returns the
 * extracted text (plus per-page data for the JSON format).
 *
 * The class is named `LiteParse` to match its file name `LiteParse.node.ts`:
 * n8n's loader is understood to look up the export named after the file
 * (NOTE(review): confirm against the targeted n8n version). The previous
 * name, `LiteParseNode`, is still exported as an alias below.
 */
export class LiteParse implements INodeType {
  description: INodeTypeDescription = {
    displayName: 'LiteParse',
    name: 'liteParse',
    icon: 'file:liteparse.svg',
    group: ['transform'],
    version: 1,
    subtitle: '={{ $parameter["outputFormat"] }}',
    description: 'Parse PDFs, Office docs, and images into Markdown, JSON, or text — locally, no API keys',
    defaults: {
      name: 'LiteParse',
    },
    inputs: [NodeConnectionTypes.Main],
    outputs: [NodeConnectionTypes.Main],
    properties: [
      {
        displayName: 'Input Binary Field',
        name: 'binaryPropertyName',
        type: 'string',
        default: 'data',
        required: true,
        description: 'Name of the binary property containing the document to parse',
      },
      {
        displayName: 'Output Format',
        name: 'outputFormat',
        type: 'options',
        options: [
          {
            name: 'Markdown',
            value: 'markdown',
            description: 'Structured Markdown with headings, tables, lists, and links',
          },
          {
            name: 'Text',
            value: 'text',
            description: 'Plain text with layout preserved',
          },
          {
            name: 'JSON',
            value: 'json',
            description: 'Structured JSON with bounding boxes and spatial data',
          },
        ],
        default: 'markdown',
        description: 'Format of the parsed output',
      },
      {
        displayName: 'OCR Enabled',
        name: 'ocrEnabled',
        type: 'boolean',
        default: true,
        description: 'Whether to run OCR on scanned documents and images',
      },
      {
        displayName: 'OCR Language',
        name: 'ocrLanguage',
        type: 'string',
        default: 'eng',
        // Only shown while OCR is switched on.
        displayOptions: {
          show: {
            ocrEnabled: [true],
          },
        },
        description: 'Tesseract language code (e.g. eng, fra, deu, ara, ara+eng)',
      },
      {
        displayName: 'Password',
        name: 'password',
        type: 'string',
        typeOptions: { password: true },
        default: '',
        description: 'Password for encrypted/protected PDFs (leave empty if not encrypted)',
      },
      {
        displayName: 'Target Pages',
        name: 'targetPages',
        type: 'string',
        default: '',
        placeholder: '1-5,10,15-20',
        description: 'Specific pages to parse (leave empty for all pages)',
      },
      {
        displayName: 'DPI',
        name: 'dpi',
        type: 'number',
        default: 150,
        typeOptions: {
          minValue: 72,
          maxValue: 600,
        },
        description: 'Rendering DPI for OCR and screenshots (higher = better quality, slower)',
      },
      {
        displayName: 'Options',
        name: 'options',
        type: 'collection',
        placeholder: 'Add Option',
        default: {},
        options: [
          {
            displayName: 'Image Mode',
            name: 'imageMode',
            type: 'options',
            options: [
              {
                name: 'Placeholder',
                value: 'placeholder',
                description: 'Reference images with placeholder links',
              },
              {
                name: 'Off',
                value: 'off',
                description: 'Strip all images from output',
              },
              {
                name: 'Embed',
                value: 'embed',
                description: 'Write image PNGs to disk and reference them',
              },
            ],
            default: 'placeholder',
            description: 'How to handle images in Markdown output',
            // Markdown-only setting; the leading "/" addresses the top-level parameter.
            displayOptions: {
              show: {
                '/outputFormat': ['markdown'],
              },
            },
          },
          {
            displayName: 'Extract Links',
            name: 'extractLinks',
            type: 'boolean',
            default: true,
            description: 'Whether to render hyperlinks as [text](url) in Markdown output',
            displayOptions: {
              show: {
                '/outputFormat': ['markdown'],
              },
            },
          },
          {
            displayName: 'Max Pages',
            name: 'maxPages',
            type: 'number',
            default: 1000,
            description: 'Maximum number of pages to parse',
          },
          {
            displayName: 'OCR Workers',
            name: 'numWorkers',
            type: 'number',
            default: 4,
            description: 'Number of concurrent OCR workers',
          },
          {
            displayName: 'Quiet Mode',
            name: 'quiet',
            type: 'boolean',
            default: true,
            description: 'Suppress progress output from LiteParse',
          },
        ],
      },
    ],
  };

  /**
   * Parses the binary document attached to every input item.
   *
   * For each item the method reads the node parameters, loads the binary
   * buffer, runs LiteParse and emits `{ text, pages, format }` (plus
   * `pageData` for the JSON format) while passing the input binary through.
   *
   * @returns A single output branch with one result item per input item.
   * @throws Error when the configured binary property is missing or parsing
   *   fails, unless "continue on fail" is active, in which case the item is
   *   replaced by `{ error: <message> }`.
   */
  async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
    // Loaded lazily via dynamic import; aliased so it does not shadow this class.
    const { LiteParse: LiteParseParser } = await import('@llamaindex/liteparse');

    const items = this.getInputData();
    const returnData: INodeExecutionData[] = [];

    for (let i = 0; i < items.length; i++) {
      try {
        const binaryPropertyName = this.getNodeParameter('binaryPropertyName', i) as string;
        const settings: LiteParseNodeSettings = {
          outputFormat: this.getNodeParameter('outputFormat', i) as 'markdown' | 'text' | 'json',
          ocrEnabled: this.getNodeParameter('ocrEnabled', i) as boolean,
          ocrLanguage: this.getNodeParameter('ocrLanguage', i) as string,
          password: this.getNodeParameter('password', i) as string,
          targetPages: this.getNodeParameter('targetPages', i) as string,
          dpi: this.getNodeParameter('dpi', i) as number,
          options: this.getNodeParameter('options', i) as LiteParseNodeOptions,
        };

        // Fail early with a clear message if the item carries no such binary property.
        const binaryMetadata = items[i].binary?.[binaryPropertyName];
        if (!binaryMetadata) {
          throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
        }

        const buffer = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);

        const parser = new LiteParseParser(buildParserConfig(settings));
        const result = await parser.parse(buffer);

        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const outputJson: Record<string, any> = {
          text: result.text,
          pages: result.pages?.length ?? 0,
          format: settings.outputFormat,
        };

        // Structured per-page data is only attached for the JSON format.
        if (settings.outputFormat === 'json' && result.pages) {
          outputJson.pageData = result.pages;
        }

        returnData.push({
          json: outputJson,
          binary: items[i].binary,
          pairedItem: { item: i },
        });
      } catch (error) {
        if (this.continueOnFail()) {
          returnData.push({
            json: { error: toErrorMessage(error) },
            pairedItem: { item: i },
          });
          continue;
        }
        throw error;
      }
    }

    return [returnData];
  }
}

// Backward-compatible alias: src/index.ts and existing consumers import `LiteParseNode`.
export { LiteParse as LiteParseNode };
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
|
2
|
+
<path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/>
|
|
3
|
+
<polyline points="14 2 14 8 20 8"/>
|
|
4
|
+
<line x1="16" y1="13" x2="8" y2="13"/>
|
|
5
|
+
<line x1="16" y1="17" x2="8" y2="17"/>
|
|
6
|
+
<polyline points="10 9 9 9 8 9"/>
|
|
7
|
+
</svg>
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"strict": true,
|
|
4
|
+
"module": "commonjs",
|
|
5
|
+
"target": "es2020",
|
|
6
|
+
"lib": ["es2020"],
|
|
7
|
+
"moduleResolution": "node",
|
|
8
|
+
"esModuleInterop": true,
|
|
9
|
+
"skipLibCheck": true,
|
|
10
|
+
"forceConsistentCasingInFileNames": true,
|
|
11
|
+
"resolveJsonModule": true,
|
|
12
|
+
"declaration": true,
|
|
13
|
+
"declarationMap": true,
|
|
14
|
+
"sourceMap": true,
|
|
15
|
+
"outDir": "./dist",
|
|
16
|
+
"rootDir": "./src"
|
|
17
|
+
},
|
|
18
|
+
"include": ["src/**/*.ts"],
|
|
19
|
+
"exclude": ["node_modules", "dist"]
|
|
20
|
+
}
|