n8n-nodes-lite-parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,57 @@
1
+ # n8n-nodes-lite-parser
2
+
3
+ An n8n community node for [LiteParse](https://github.com/run-llama/liteparse) — fast, local document parsing with Markdown and JSON output.
4
+
5
+ ## What It Does
6
+
7
+ Parse PDFs, Office documents (DOCX, XLSX, PPTX), and images into structured Markdown, plain text, or JSON — all locally, no API keys, no cloud.
8
+
9
+ ## Features
10
+
11
+ - **Markdown output** — headings, tables, lists, links preserved
12
+ - **JSON output** — structured data with bounding boxes and spatial coordinates
13
+ - **OCR** — built-in Tesseract for scanned documents (100+ languages)
14
+ - **Encrypted PDFs** — password-protected PDF support
15
+ - **Page targeting** — parse specific pages only
16
+ - **Lightweight** — Rust core, no GPU, no PyTorch, minimal system load
17
+
18
+ ## Supported Formats
19
+
20
+ - **Native:** PDF
21
+ - **Via LibreOffice:** DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, RTF, Pages, Numbers, Keynote
22
+ - **Via ImageMagick:** JPG, PNG, GIF, BMP, TIFF, WebP, SVG
23
+
24
+ ## Installation
25
+
26
+ In n8n:
27
+ 1. Go to **Settings** > **Community Nodes**
28
+ 2. Enter `n8n-nodes-lite-parser`
29
+ 3. Click **Install**
30
+
31
+ Or manually:
32
+ ```bash
33
+ cd ~/.n8n/nodes
34
+ npm install n8n-nodes-lite-parser
35
+ ```
36
+
37
+ ## Node Parameters
38
+
39
+ | Parameter | Default | Description |
40
+ |---|---|---|
41
+ | Input Binary Field | `data` | Binary property containing the document |
42
+ | Output Format | `markdown` | `markdown` / `text` / `json` |
43
+ | OCR Enabled | `true` | Run OCR on scanned docs |
44
+ | OCR Language | `eng` | Tesseract language code |
45
+ | Password | — | For encrypted PDFs |
46
+ | Target Pages | — | e.g. `1-5,10` |
47
+ | DPI | `150` | Rendering resolution |
48
+
49
+ ## Example Workflow
50
+
51
+ ```
52
+ [Read Binary File / HTTP Request] → [LiteParse] → [Code Node / AI Agent]
53
+ ```
54
+
55
+ ## License
56
+
57
+ MIT
package/dist/index.js ADDED
@@ -0,0 +1,6 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.LiteParseNode = void 0;
4
+ var LiteParse_node_1 = require("./nodes/LiteParse/LiteParse.node");
5
+ Object.defineProperty(exports, "LiteParseNode", { enumerable: true, get: function () { return LiteParse_node_1.LiteParseNode; } });
6
+ //# sourceMappingURL=index.js.map
package/package.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "name": "n8n-nodes-lite-parser",
3
+ "version": "1.0.0",
4
+ "description": "n8n community node for LiteParse — fast, local document parsing (PDF, Office, images) with Markdown/JSON output",
5
+ "main": "dist/index.js",
6
+ "scripts": {
7
+ "build": "tsc",
8
+ "dev": "tsc --watch",
9
+ "prepublishOnly": "npm run build"
10
+ },
11
+ "keywords": [
12
+ "n8n-community-node-package",
13
+ "n8n",
14
+ "pdf",
15
+ "parser",
16
+ "ocr",
17
+ "markdown",
18
+ "liteparse",
19
+ "document-parsing"
20
+ ],
21
+ "author": "",
22
+ "license": "MIT",
23
+ "devDependencies": {
24
+ "@types/node": "^20.0.0",
25
+ "typescript": "^5.0.0"
26
+ },
27
+ "dependencies": {
28
+ "@llamaindex/liteparse": "^2.1.2"
29
+ },
30
+ "peerDependencies": {
31
+ "n8n-workflow": "*"
32
+ },
33
+ "n8n": {
34
+ "n8nNodesApiVersion": 1,
35
+ "nodes": [
36
+ "dist/nodes/LiteParse/LiteParse.node.js"
37
+ ]
38
+ }
39
+ }
package/src/index.ts ADDED
@@ -0,0 +1 @@
1
+ export { LiteParseNode } from './nodes/LiteParse/LiteParse.node';
@@ -0,0 +1,275 @@
1
+ import {
2
+ IExecuteFunctions,
3
+ INodeExecutionData,
4
+ INodeType,
5
+ INodeTypeDescription,
6
+ NodeConnectionTypes,
7
+ } from 'n8n-workflow';
8
+
9
/**
 * n8n node that parses a document held in a binary property of each input
 * item with LiteParse and emits the extracted text, the page count and the
 * requested output format (plus per-page data when the format is JSON).
 *
 * NOTE(review): n8n appears to derive the expected class name from the node
 * file name (`LiteParse.node.js` -> `LiteParse`); confirm that this class is
 * actually discovered under the name `LiteParseNode`.
 */
export class LiteParseNode implements INodeType {
  /** Static node metadata and the parameter definitions rendered in the n8n editor. */
  description: INodeTypeDescription = {
    displayName: 'LiteParse',
    name: 'liteParse',
    icon: 'file:liteparse.svg',
    group: ['transform'],
    version: 1,
    subtitle: '={{ $parameter["outputFormat"] }}',
    description: 'Parse PDFs, Office docs, and images into Markdown, JSON, or text — locally, no API keys',
    defaults: {
      name: 'LiteParse',
    },
    inputs: [NodeConnectionTypes.Main],
    outputs: [NodeConnectionTypes.Main],
    properties: [
      {
        displayName: 'Input Binary Field',
        name: 'binaryPropertyName',
        type: 'string',
        default: 'data',
        required: true,
        description: 'Name of the binary property containing the document to parse',
      },
      {
        displayName: 'Output Format',
        name: 'outputFormat',
        type: 'options',
        options: [
          {
            name: 'Markdown',
            value: 'markdown',
            description: 'Structured Markdown with headings, tables, lists, and links',
          },
          {
            name: 'Text',
            value: 'text',
            description: 'Plain text with layout preserved',
          },
          {
            name: 'JSON',
            value: 'json',
            description: 'Structured JSON with bounding boxes and spatial data',
          },
        ],
        default: 'markdown',
        description: 'Format of the parsed output',
      },
      {
        displayName: 'OCR Enabled',
        name: 'ocrEnabled',
        type: 'boolean',
        default: true,
        description: 'Whether to run OCR on scanned documents and images',
      },
      {
        displayName: 'OCR Language',
        name: 'ocrLanguage',
        type: 'string',
        default: 'eng',
        // Only shown (and therefore only resolvable without a fallback) while OCR is enabled.
        displayOptions: {
          show: {
            ocrEnabled: [true],
          },
        },
        description: 'Tesseract language code (e.g. eng, fra, deu, ara, ara+eng)',
      },
      {
        displayName: 'Password',
        name: 'password',
        type: 'string',
        typeOptions: { password: true },
        default: '',
        description: 'Password for encrypted/protected PDFs (leave empty if not encrypted)',
      },
      {
        displayName: 'Target Pages',
        name: 'targetPages',
        type: 'string',
        default: '',
        placeholder: '1-5,10,15-20',
        description: 'Specific pages to parse (leave empty for all pages)',
      },
      {
        displayName: 'DPI',
        name: 'dpi',
        type: 'number',
        default: 150,
        typeOptions: {
          minValue: 72,
          maxValue: 600,
        },
        description: 'Rendering DPI for OCR and screenshots (higher = better quality, slower)',
      },
      {
        displayName: 'Options',
        name: 'options',
        type: 'collection',
        placeholder: 'Add Option',
        default: {},
        options: [
          {
            displayName: 'Image Mode',
            name: 'imageMode',
            type: 'options',
            options: [
              {
                name: 'Placeholder',
                value: 'placeholder',
                description: 'Reference images with placeholder links',
              },
              {
                name: 'Off',
                value: 'off',
                description: 'Strip all images from output',
              },
              {
                name: 'Embed',
                value: 'embed',
                description: 'Write image PNGs to disk and reference them',
              },
            ],
            default: 'placeholder',
            description: 'How to handle images in Markdown output',
            displayOptions: {
              show: {
                '/outputFormat': ['markdown'],
              },
            },
          },
          {
            displayName: 'Extract Links',
            name: 'extractLinks',
            type: 'boolean',
            default: true,
            description: 'Whether to render hyperlinks as [text](url) in Markdown output',
            displayOptions: {
              show: {
                '/outputFormat': ['markdown'],
              },
            },
          },
          {
            displayName: 'Max Pages',
            name: 'maxPages',
            type: 'number',
            default: 1000,
            description: 'Maximum number of pages to parse',
          },
          {
            displayName: 'OCR Workers',
            name: 'numWorkers',
            type: 'number',
            default: 4,
            description: 'Number of concurrent OCR workers',
          },
          {
            displayName: 'Quiet Mode',
            name: 'quiet',
            type: 'boolean',
            default: true,
            description: 'Suppress progress output from LiteParse',
          },
        ],
      },
    ],
  };

  /**
   * Parses the configured binary property of every input item.
   *
   * For each item the node parameters are read, a LiteParse configuration is
   * assembled from the values that are set, and the binary buffer is parsed.
   * The output item carries `text`, `pages` (page count), `format`, and, for
   * the `json` format, `pageData`; the input item's binary data is passed
   * through unchanged.
   *
   * @returns A single output branch containing one result item per input item.
   * @throws Error when the named binary property is missing on an item, or any
   *   error raised while reading parameters or parsing, unless "continue on
   *   fail" is active, in which case an `{ error }` item is emitted instead.
   */
  async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
    // Loaded on first execution rather than at module load time.
    const { LiteParse } = await import('@llamaindex/liteparse');

    const items = this.getInputData();
    const returnData: INodeExecutionData[] = [];

    for (let i = 0; i < items.length; i++) {
      try {
        const binaryPropertyName = this.getNodeParameter('binaryPropertyName', i) as string;
        const outputFormat = this.getNodeParameter('outputFormat', i) as 'markdown' | 'text' | 'json';
        const ocrEnabled = this.getNodeParameter('ocrEnabled', i) as boolean;
        // `ocrLanguage` is hidden while OCR is disabled; a fallback keeps the
        // lookup from failing in that case. The value is only used when
        // `ocrEnabled` is true, so the fallback never reaches the parser.
        const ocrLanguage = this.getNodeParameter('ocrLanguage', i, '') as string;
        const password = this.getNodeParameter('password', i) as string;
        const targetPages = this.getNodeParameter('targetPages', i) as string;
        const dpi = this.getNodeParameter('dpi', i) as number;
        const options = this.getNodeParameter('options', i, {}) as {
          imageMode?: string;
          extractLinks?: boolean;
          maxPages?: number;
          numWorkers?: number;
          quiet?: boolean;
        };

        // Fail early with a clear message when the item carries no such binary property.
        const binaryMetadata = items[i].binary?.[binaryPropertyName];
        if (!binaryMetadata) {
          throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
        }

        const buffer = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);

        // Only explicitly set values are forwarded, so LiteParse applies its
        // own defaults for everything else.
        const parserConfig: Record<string, any> = {
          outputFormat,
          ocrEnabled,
          dpi,
        };

        if (ocrEnabled && ocrLanguage) {
          parserConfig.ocrLanguage = ocrLanguage;
        }
        if (password) {
          parserConfig.password = password;
        }
        if (targetPages) {
          parserConfig.targetPages = targetPages;
        }
        // Image and link handling are Markdown-only settings.
        if (options.imageMode && outputFormat === 'markdown') {
          parserConfig.imageMode = options.imageMode;
        }
        if (options.extractLinks !== undefined && outputFormat === 'markdown') {
          parserConfig.extractLinks = options.extractLinks;
        }
        if (options.maxPages) {
          parserConfig.maxPages = options.maxPages;
        }
        if (options.numWorkers) {
          parserConfig.numWorkers = options.numWorkers;
        }
        if (options.quiet !== undefined) {
          parserConfig.quiet = options.quiet;
        }

        const parser = new LiteParse(parserConfig);
        const result = await parser.parse(buffer);

        const outputJson: Record<string, any> = {
          text: result.text,
          pages: result.pages?.length ?? 0,
          format: outputFormat,
        };

        // Per-page structured data is only attached for the JSON format.
        if (outputFormat === 'json' && result.pages) {
          outputJson.pageData = result.pages;
        }

        returnData.push({
          json: outputJson,
          binary: items[i].binary,
          pairedItem: { item: i },
        });
      } catch (error) {
        if (this.continueOnFail()) {
          // Non-Error throwables have no `.message`; stringify them instead of emitting `undefined`.
          const message = error instanceof Error ? error.message : String(error);
          returnData.push({
            json: { error: message },
            pairedItem: { item: i },
          });
          continue;
        }
        throw error;
      }
    }

    return [returnData];
  }
}
@@ -0,0 +1,7 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
2
+ <path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/>
3
+ <polyline points="14 2 14 8 20 8"/>
4
+ <line x1="16" y1="13" x2="8" y2="13"/>
5
+ <line x1="16" y1="17" x2="8" y2="17"/>
6
+ <polyline points="10 9 9 9 8 9"/>
7
+ </svg>
package/tsconfig.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "compilerOptions": {
3
+ "strict": true,
4
+ "module": "commonjs",
5
+ "target": "es2020",
6
+ "lib": ["es2020"],
7
+ "moduleResolution": "node",
8
+ "esModuleInterop": true,
9
+ "skipLibCheck": true,
10
+ "forceConsistentCasingInFileNames": true,
11
+ "resolveJsonModule": true,
12
+ "declaration": true,
13
+ "declarationMap": true,
14
+ "sourceMap": true,
15
+ "outDir": "./dist",
16
+ "rootDir": "./src"
17
+ },
18
+ "include": ["src/**/*.ts"],
19
+ "exclude": ["node_modules", "dist"]
20
+ }