@n8n/ai-workflow-builder 0.31.2 → 0.32.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai-workflow-builder-agent.service.d.ts +6 -2
- package/dist/ai-workflow-builder-agent.service.js +45 -3
- package/dist/ai-workflow-builder-agent.service.js.map +1 -1
- package/dist/build.tsbuildinfo +1 -1
- package/dist/tools/best-practices/data-analysis.d.ts +7 -0
- package/dist/tools/best-practices/data-analysis.js +367 -0
- package/dist/tools/best-practices/data-analysis.js.map +1 -0
- package/dist/tools/best-practices/data-extraction.js +7 -0
- package/dist/tools/best-practices/data-extraction.js.map +1 -1
- package/dist/tools/best-practices/data-transformation.d.ts +7 -0
- package/dist/tools/best-practices/data-transformation.js +181 -0
- package/dist/tools/best-practices/data-transformation.js.map +1 -0
- package/dist/tools/best-practices/document-processing.d.ts +7 -0
- package/dist/tools/best-practices/document-processing.js +324 -0
- package/dist/tools/best-practices/document-processing.js.map +1 -0
- package/dist/tools/best-practices/enrichment.d.ts +7 -0
- package/dist/tools/best-practices/enrichment.js +271 -0
- package/dist/tools/best-practices/enrichment.js.map +1 -0
- package/dist/tools/best-practices/human-in-the-loop.d.ts +7 -0
- package/dist/tools/best-practices/human-in-the-loop.js +268 -0
- package/dist/tools/best-practices/human-in-the-loop.js.map +1 -0
- package/dist/tools/best-practices/index.js +7 -6
- package/dist/tools/best-practices/index.js.map +1 -1
- package/dist/tools/best-practices/knowledge-base.d.ts +7 -0
- package/dist/tools/best-practices/knowledge-base.js +268 -0
- package/dist/tools/best-practices/knowledge-base.js.map +1 -0
- package/dist/tools/best-practices/monitoring.d.ts +7 -0
- package/dist/tools/best-practices/monitoring.js +178 -0
- package/dist/tools/best-practices/monitoring.js.map +1 -0
- package/dist/tools/best-practices/notification.d.ts +7 -0
- package/dist/tools/best-practices/notification.js +229 -0
- package/dist/tools/best-practices/notification.js.map +1 -0
- package/dist/tools/best-practices/scheduling.d.ts +7 -0
- package/dist/tools/best-practices/scheduling.js +281 -0
- package/dist/tools/best-practices/scheduling.js.map +1 -0
- package/dist/tools/best-practices/triage.d.ts +7 -0
- package/dist/tools/best-practices/triage.js +211 -0
- package/dist/tools/best-practices/triage.js.map +1 -0
- package/dist/tools/categorize-prompt.tool.js +1 -0
- package/dist/tools/categorize-prompt.tool.js.map +1 -1
- package/dist/tools/helpers/response.js +2 -0
- package/dist/tools/helpers/response.js.map +1 -1
- package/dist/tools/prompts/main-agent.prompt.js +9 -1
- package/dist/tools/prompts/main-agent.prompt.js.map +1 -1
- package/dist/tools/validate-workflow.tool.js +12 -0
- package/dist/tools/validate-workflow.tool.js.map +1 -1
- package/dist/utils/tool-executor.js +19 -0
- package/dist/utils/tool-executor.js.map +1 -1
- package/dist/validation/checks/agent-prompt.js +2 -0
- package/dist/validation/checks/agent-prompt.js.map +1 -1
- package/dist/validation/checks/connections.js +8 -0
- package/dist/validation/checks/connections.js.map +1 -1
- package/dist/validation/checks/from-ai.js +1 -0
- package/dist/validation/checks/from-ai.js.map +1 -1
- package/dist/validation/checks/tools.js +2 -0
- package/dist/validation/checks/tools.js.map +1 -1
- package/dist/validation/checks/trigger.js +2 -0
- package/dist/validation/checks/trigger.js.map +1 -1
- package/dist/validation/types.d.ts +4 -0
- package/dist/validation/types.js +18 -0
- package/dist/validation/types.js.map +1 -1
- package/dist/workflow-builder-agent.d.ts +5 -2
- package/dist/workflow-builder-agent.js +4 -3
- package/dist/workflow-builder-agent.js.map +1 -1
- package/dist/workflow-state.d.ts +3 -1
- package/dist/workflow-state.js +8 -0
- package/dist/workflow-state.js.map +1 -1
- package/package.json +11 -7
package/dist/tools/best-practices/document-processing.js
@@ -0,0 +1,324 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.DocumentProcessingBestPractices = void 0;
+const categorization_1 = require("../../types/categorization");
+class DocumentProcessingBestPractices {
+    technique = categorization_1.WorkflowTechnique.DOCUMENT_PROCESSING;
+    version = '1.0.0';
+    documentation = `# Best Practices: Document Processing Workflows
+
+## Workflow Design
+
+Document processing workflows extract and act on content from files like PDFs, images, Word documents, and spreadsheets. Design your workflow following these core patterns:
+
+### Core Architecture Pattern
+Trigger → Capture Binary → Extract Text → Parse/Transform → Route to Destination → Notify
+
+### Common Flow Patterns
+
+**Simple Document Processing:**
+- Gmail Trigger → Check file type → Extract from File → DataTable → Slack notification
+- Best for: Basic text-based PDFs with straightforward data extraction
+
+**Complex Document Processing with AI:**
+- Webhook → File Type Check → OCR (if image) → AI Extract → Validate → CRM Update → Multiple notifications
+- Best for: Varied document formats requiring intelligent parsing
+
+**Batch Document Processing:**
+- Main workflow: Schedule Trigger → Fetch Files → Split In Batches → Sub-workflow → Merge Results → Bulk Update
+- Sub-workflow: When Executed by Another Workflow → Process result
+- Best for: High-volume processing with API rate limits
+
+**Multi-Source Document Aggregation:**
+- Multiple Triggers (Email + Drive + Webhook) → Set common fields → Standardize → Process → Store
+- Best for: Documents from various channels needing unified processing
+
+### Branching Strategy
+
+Always branch early based on document characteristics:
+- **File Type Branching**: Use IF/Switch nodes immediately after ingestion to route PDFs vs images vs spreadsheets
+- **Provider Branching**: Route documents to provider-specific processing (e.g., different invoice formats)
+- **Quality Branching**: Separate high-confidence extractions from those needing manual review
+
+## Binary Data Management
+Documents in n8n are handled as binary data that must be carefully preserved throughout the workflow.
+
+### Referencing Binary Data from Other Nodes
+When you need to reference binary data from a previous node, use this syntax:
+- Expression: '{{ $('Node Name').item.binary.property_name }}', or {{ $binary.property_name }} when referencing the previous item
+- Example for Gmail attachments: '{{ $('Gmail Trigger').item.binary.attachment_0 }}', or {{ $binary.attachment_0 }} when referencing the previous item
+- Example for webhook data: '{{ $('Webhook').item.binary.data }}', or {{ $binary.data }} when referencing the previous item
+- Important: The property name depends on how the previous node names the binary data
+
+### Preserving Binary Data
+- Many nodes (Code, Edit Fields, AI nodes) output JSON and drop binary data by default
+- Use parallel branches: one for processing, one to preserve the original file
+- Rejoin branches with Merge node in pass-through mode
+- Alternative: Configure nodes to keep binary (e.g., the Edit Fields node's "Include Other Input Fields" option ON), or re-attach it in a Code node as sketched below
+
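The re-attach alternative in the last bullet can be made concrete with a minimal Code node sketch, assuming the current Code node API (`$input.all()`) in "Run Once for All Items" mode; the `processedAt` field is purely illustrative:

```js
// n8n Code node: transform the JSON payload while explicitly
// re-attaching each item's binary property, since returning
// only `json` would drop the file.
const out = [];
for (const item of $input.all()) {
  out.push({
    json: {
      ...item.json,
      processedAt: new Date().toISOString(), // illustrative added field
    },
    binary: item.binary, // carry attachment_0, data, etc. forward unchanged
  });
}
return out;
```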
+### Memory Optimization
+For high-volume processing:
+- Process files sequentially or in small batches
+- Drop unnecessary binary data after extraction to free memory
+
+## Text Extraction Strategy
+
+Choose extraction method based on document type and content:
+
+### Critical: File Type Detection
+**ALWAYS check the file type before using Extract from File node** (unless the file type is already known):
+- Use an IF node to check the file extension or MIME type first (see the routing sketch after this list)
+- The Extract from File node has multiple operations, each for a specific file type:
+  - "Extract from PDF" for PDF files
+  - "Extract from MS Excel" for Excel files (.xlsx, .xls)
+  - "Extract from MS Word" for Word documents (.docx, .doc)
+  - "Extract from CSV" for CSV files
+  - "Extract from HTML" for HTML files
+  - "Extract from RTF" for Rich Text Format files
+  - "Extract from Text File" for plain text files
+- Using the wrong operation will result in errors or empty output
+
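One way to implement the routing check is a Code node that maps the binary MIME type to a `route` field for a following Switch node. A sketch, assuming the binary property is named `data` (it may differ, as noted above):

```js
// Map MIME types to the matching Extract from File operation.
const typeMap = {
  'application/pdf': 'pdf',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
  'text/csv': 'csv',
  'text/html': 'html',
  'text/plain': 'text',
};

return $input.all().map((item) => {
  const mime = item.binary?.data?.mimeType ?? '';
  return {
    json: { ...item.json, route: typeMap[mime] ?? 'unknown' }, // Switch on `route`
    binary: item.binary, // keep the file for the extraction step
  };
});
```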
+### Decision Tree for Extraction
+1. **Check file type** → Route to appropriate extraction method
+2. **Scanned image/PDF?** → Use OCR service (OCR.space, AWS Textract, Google Vision)
+3. **Structured invoice/receipt?** → Use specialized parser (Mindee) or AI extraction
+4. **Text-based document?** → Use Extract from File with the correct operation for that file type
+
+### Fallback Strategy
+Always implement fallback for extraction failures:
+- Check if text extraction returns empty
+- If empty, automatically route to OCR
+- If OCR fails, send to manual review queue
+
+## Data Parsing & Classification
+
+### AI-Powered Extraction Pattern
+For varied or complex documents:
+
+Option 1 - Using Document Loader (Recommended for binary files):
+1. Pass binary data directly to Document Loader node (set Data Source to "Binary")
+2. Connect to AI Agent or LLM Chain for processing
+3. Use Structured Output Parser to ensure consistent JSON
+4. Validate extracted fields before processing
+
+Option 2 - Using text extraction:
+1. Extract raw text using Extract from File or OCR
+2. Pass to AI Agent or LLM Chain with structured prompt
+3. Use Structured Output Parser to ensure consistent JSON
+4. Validate extracted fields before processing
+
+Example system prompt structure:
+"Extract the following fields from the document: [field list]. Return as JSON with this schema: [schema example]"
+
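To make the prompt-plus-schema pairing concrete, here is an illustrative invoice example. The field names are assumptions, and the schema object is the kind of JSON Schema you might paste into the Structured Output Parser node:

```js
// Hypothetical invoice fields; adapt to the document type at hand.
const extractionSchema = {
  type: 'object',
  properties: {
    vendor: { type: 'string' },
    invoiceNumber: { type: 'string' },
    issueDate: { type: 'string', description: 'ISO 8601 date' },
    total: { type: 'number' },
    currency: { type: 'string' },
  },
  required: ['vendor', 'invoiceNumber', 'total'],
};

const systemPrompt =
  'Extract the following fields from the document: vendor, invoice number, ' +
  'issue date, total, currency. Return ONLY JSON matching this schema: ' +
  JSON.stringify(extractionSchema);
```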
+### Document Classification Flow
+Classify before processing for better accuracy:
+1. Initial AI classification (Invoice vs Receipt vs Contract)
+2. Route to specialized sub-workflow based on type
+3. Use type-specific prompts and validation rules
+4. This reduces errors and improves extraction quality
+
+## Error Handling Strategy
+
+Build resilience at every step:
+
+### Validation Checkpoints
+- After extraction: Verify text is not empty
+- After AI parsing: Validate JSON schema
+- Before database insert: Check required fields
+- After API calls: Verify success response
+
+## Performance Optimization
+
+### Batch Processing Strategy
+- Use Split In Batches node: process 5-10 files at a time
+- Implement delays between batches for rate-limited APIs
+- Monitor memory usage and adjust batch size accordingly
+
+## Recommended Nodes
+
+### Triggers & Input
+
+**Gmail Trigger (n8n-nodes-base.gmailTrigger)**
+Purpose: Monitor Gmail for emails with attachments (Recommended over IMAP)
+Advantages: Real-time processing, simpler authentication, better integration with Google Workspace
+Critical Configuration for Attachments:
+- **MUST set "Simplify" to FALSE** - otherwise attachments won't be available
+- **MUST set "Download Attachments" to TRUE** to retrieve files
+- Set appropriate label filters
+- Set "Property Prefix Name" (e.g., "attachment_") - attachments will be named with this prefix plus index
+- Important: When referencing its binary data, it will be referenced as "attachment_0", "attachment_1", etc., NOT "data"
+
+**Email Read (IMAP) (n8n-nodes-base.emailReadImap)**
+Purpose: Alternative email fetching if there's no specialized node for the email provider
+Configuration:
+- Enable "Download Attachments" to retrieve files
+- Set "Property Prefix Name" (e.g., "attachment_") - attachments will be named with this prefix plus index
+- Important: When referencing binary data, it will be referenced as "attachment_0", "attachment_1", etc., NOT "data"
+
+**HTTP Webhook (n8n-nodes-base.webhook)**
+Purpose: Receive file uploads from web forms
+Configuration: Enable "Raw Body" for binary data
+
+**Google Drive Trigger (n8n-nodes-base.googleDriveTrigger)**
+Purpose: Monitor folders for new documents
+Configuration: Set appropriate folder and file type filters
+
+### Text Extraction
+
+**Extract from File (n8n-nodes-base.extractFromFile)**
+Purpose: Extract text from various file formats using format-specific operations
+Critical: ALWAYS check the file type first with an IF or Switch node, then select the correct operation (Extract from PDF, Extract from MS Excel, etc.)
+Output: Extracted text is returned under the "text" key in JSON (e.g., access with {{ $json.text }})
+Pitfalls: Returns empty for scanned documents - always check and fall back to OCR; using the wrong operation causes errors
+
+**AWS Textract (n8n-nodes-base.awsTextract)**
+Purpose: Advanced OCR with table and form detection
+Best for: Structured documents like invoices and forms
+
+**Mindee (n8n-nodes-base.mindee)**
+Purpose: Specialized invoice and receipt parsing
+Returns: Structured JSON with line items, totals, dates
+
+### Data Processing
+
+**AI Agent (@n8n/n8n-nodes-langchain.agent)**
+Purpose: Intelligent document parsing and decision making
+Configuration: Include structured output tools for consistent results
+
+**LLM Chain (@n8n/n8n-nodes-langchain.chainLlm)**
+Purpose: Document classification and data extraction
+Use with: Structured Output Parser for JSON consistency
+
+**Document Loader (@n8n/n8n-nodes-langchain.documentLoader)**
+Purpose: Load and process documents directly from binary data for AI processing
+Critical: Use the "Binary" data source option to handle binary files directly - no need to convert to JSON first
+Configuration: Select "Binary" as Data Source, specify the binary property name (by default "data" unless renamed in a previous node)
+Best for: Direct document processing in AI workflows without intermediate extraction steps
+
+**Structured Output Parser (@n8n/n8n-nodes-langchain.outputParserStructured)**
+Purpose: Ensure AI outputs match expected JSON schema
+Critical for: Database inserts and API calls
+
+### Vector Storage (for RAG/Semantic Search)
+**Simple Vector Store (@n8n/n8n-nodes-langchain.vectorStore) - RECOMMENDED**
+Purpose: Easy-to-set-up vector storage for document embeddings
+Advantages:
+- No external dependencies or API keys required
+- Works out of the box with local storage
+- Perfect for prototyping and small to medium datasets
+Configuration: Just connect and use - no complex setup needed
+Best for: Most document processing workflows that need semantic search
+
+### Flow Control
+
+**Split In Batches (n8n-nodes-base.splitInBatches)**
+Purpose: Process multiple documents in controlled batches
+Configuration: Set batch size based on API limits and memory
+Outputs (in order):
+- Output 0 "done": Executes after all batches are processed - use for final aggregation or notifications
+- Output 1 "loop": Connect processing nodes here - executes for each batch
+Important: Connect processing logic to the second output (loop), completion logic to the first output (done)
+
+**Merge (n8n-nodes-base.merge)**
+Purpose: Combine data from multiple branches that need to execute together
+Critical: Merge node waits for ALL input branches to complete - do NOT use for independent/optional branches
+Modes: Use "Pass Through" to preserve binary from one branch
+
+**Edit Fields (Set) (n8n-nodes-base.set)**
+Purpose: Better choice for combining data from separate/independent branches
+Use for: Adding fields from different sources, preserving binary while adding processed data
+Configuration: Set common fields and turn "Include Other Input Fields" ON to preserve existing data including binary
+
+**Execute Workflow Trigger (n8n-nodes-base.executeWorkflowTrigger)**
+Purpose: Start point for sub-workflows that are called by other workflows
+Configuration: Automatically receives data from the calling workflow including binary data
+Best practice: Use for modular workflow design, heavy processing tasks, or reusable workflow components
+Pairing: Must be used with Execute Workflow node in the parent workflow
+
+**Execute Workflow (n8n-nodes-base.executeWorkflow)**
+Purpose: Call and execute another workflow from within the current workflow
+Critical configurations:
+- Workflow ID: Use expression "{{ $workflow.id }}" to reference sub-workflows in the same workflow
+- Choose execution mode: "Run Once for All Items" or "Run Once for Each Item"
+- Binary data is automatically passed to the sub-workflow
+Best practice: Use for delegating heavy processing, creating reusable modules, or managing memory in large batch operations
+
+### Data Destinations
+
+**DataTable (n8n-nodes-base.dataTable)**
+Purpose: Store extracted data in n8n's built-in data tables
+Operations: Insert, Update, Select rows without external dependencies
+Best for: Self-contained workflows that don't require external storage
+
+**Google Sheets (n8n-nodes-base.googleSheets)**
+Purpose: Log extracted data in an external spreadsheet
+Operations: Use "Append" for new rows, "Update" with a key column for existing ones
+Best for: Collaborative review and manual data validation
+
+**Database Nodes**
+- Postgres (n8n-nodes-base.postgres)
+- MySQL (n8n-nodes-base.mySql)
+- MongoDB (n8n-nodes-base.mongoDb)
+Purpose: Store structured extraction results in production databases
+Best Practice: Validate data schema before insert
+
+### Utilities
+
+**IF/Switch (n8n-nodes-base.if, n8n-nodes-base.switch)**
+Purpose: Route based on file type, extraction quality, or classification
+
+**Function/Code (n8n-nodes-base.function, n8n-nodes-base.code)**
+Purpose: Custom validation, data transformation, or regex extraction
+
+**HTTP Request (n8n-nodes-base.httpRequest)**
+Purpose: Call external OCR APIs (OCR.space, Google Vision, Mistral OCR)
+Configuration: Set "Response Format: File" for downloads
+Critical: NEVER set API keys directly in the request - the user can set credentials from the UI for secure API key management
+
+## Common Pitfalls to Avoid
+
+### Binary Data Loss
+
+**Problem**: Binary file "disappears" after processing nodes
+**Solution**:
+- Use Merge node to reattach binary after processing
+- Or configure nodes to explicitly keep binary data
+- In Code nodes: copy items[0].binary to output
+
+### Incorrect OCR Fallback
+
+**Problem**: Not detecting when text extraction fails
+**Solution**:
+- Always check if extraction result is empty
+- Implement automatic OCR fallback for scanned documents
+- Don't assume all PDFs have extractable text
+
+### API Format Mismatches
+
+**Problem**: Sending files in wrong format to APIs
+**Solution**:
+- Check if API needs multipart/form-data vs Base64
+- Use "Extract from File" and "Convert to File" for format conversion
+
+### Memory Overload
+
+**Problem**: Workflow crashes with large or multiple files
+**Solution**:
+- Process files sequentially or in small batches
+- Enable filesystem mode for binary data storage
+- Drop unnecessary data after extraction
+- Create a sub-workflow in the same workflow using "When Executed by Another Workflow" and "Execute Workflow". Delegate the heavy part of the workflow to the sub-workflow.
+
+### Duplicate Processing
+
+**Problem**: Same documents processed repeatedly
+**Solution**:
+- Configure email triggers to mark as read
+- Use "unseen" filters for email fetching
+- Implement deduplication logic based on file hash or name (see the hash sketch after this file)`;
+    getDocumentation() {
+        return this.documentation;
+    }
+}
+exports.DocumentProcessingBestPractices = DocumentProcessingBestPractices;
+//# sourceMappingURL=document-processing.js.map
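The deduplication bullet that closes the pitfalls list above can be sketched as a Code node that hashes each file's bytes. This assumes binary data is held in memory (base64 on `item.binary.data.data`, with the property named `data`) and that the `crypto` built-in is allowed in the Code node sandbox; persisting seen hashes in a DataTable or database would make it durable across runs:

```js
const crypto = require('crypto'); // built-in; must be allowed via NODE_FUNCTION_ALLOW_BUILTIN

const seen = new Set();
const unique = [];
for (const item of $input.all()) {
  // Assumes in-memory binary mode; with filesystem mode, use n8n's
  // binary-data helpers to obtain the buffer instead.
  const buffer = Buffer.from(item.binary?.data?.data ?? '', 'base64');
  const hash = crypto.createHash('sha256').update(buffer).digest('hex');
  if (seen.has(hash)) continue; // duplicate within this run, skip it
  seen.add(hash);
  unique.push({ json: { ...item.json, fileHash: hash }, binary: item.binary });
}
return unique;
```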
package/dist/tools/best-practices/document-processing.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"document-processing.js","sourceRoot":"","sources":["../../../src/tools/best-practices/document-processing.ts"],"names":[],"mappings":";;;AACA,2DAA2D;AAE3D,MAAa,+BAA+B;IAClC,SAAS,GAAG,kCAAiB,CAAC,mBAAmB,CAAC;IAClD,OAAO,GAAG,OAAO,CAAC;IAEV,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;2DAsTyB,CAAC;IAE3D,gBAAgB;QACf,OAAO,IAAI,CAAC,aAAa,CAAC;IAC3B,CAAC;CACD;AA/TD,0EA+TC"}

package/dist/tools/best-practices/enrichment.d.ts
@@ -0,0 +1,7 @@
+import type { BestPracticesDocument } from '../../types/best-practices';
+export declare class EnrichmentBestPractices implements BestPracticesDocument {
+    readonly technique: "enrichment";
+    readonly version = "1.0.0";
+    private readonly documentation;
+    getDocumentation(): string;
+}
package/dist/tools/best-practices/enrichment.js
@@ -0,0 +1,271 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.EnrichmentBestPractices = void 0;
+const categorization_1 = require("../../types/categorization");
+class EnrichmentBestPractices {
+    technique = categorization_1.WorkflowTechnique.ENRICHMENT;
+    version = '1.0.0';
+    documentation = `# Best Practices: Data Enrichment Workflows
+
+## Workflow Design
+
+### Core Principles
+- Start with data retrieval and validation before enrichment
+- Process data incrementally to avoid overwhelming APIs
+- Always include error handling for failed enrichments
+- Design for reusability with sub-workflows where appropriate
+
+### Architecture Pattern
+1. **Input Stage**: Validate and prepare incoming data
+2. **Enrichment Stage**: Parallel or sequential API calls based on dependencies
+3. **Transformation Stage**: Normalize and merge enriched data
+4. **Output Stage**: Format and deliver enriched results
+
+## Data Enrichment Guidelines
+
+### 1. Input Validation
+**Always validate incoming data before enrichment** (see the sketch after this list)
+- Use IF node (n8n-nodes-base.if) to check for required fields
+- Implement Set node (n8n-nodes-base.set) to standardize data format
+- Add Code node (n8n-nodes-base.code) for complex validation logic
+
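A minimal validation sketch for the Code node option, assuming hypothetical required fields `email` and `company`; an IF node downstream can route on the `valid` flag:

```js
const REQUIRED = ['email', 'company']; // assumed field list, adjust per workflow

return $input.all().map((item) => {
  const missing = REQUIRED.filter(
    (field) => item.json[field] == null || item.json[field] === '',
  );
  return {
    json: {
      ...item.json,
      valid: missing.length === 0,
      missingFields: missing, // route on this with an IF node
    },
  };
});
```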
+### 2. API Rate Limiting
+**Respect external service limits**
+- Implement Wait node (n8n-nodes-base.wait) between batch requests
+- Use SplitInBatches node (n8n-nodes-base.splitInBatches) for large datasets
+- Set batch size: 10-50 items depending on API limits
+- Add delay: 1-2 seconds between batches
+
+### 3. Error Handling
+**Build resilient enrichment flows**
+- Wrap API calls in a Try/Catch pattern using the Error Trigger node
+- Use StopAndError node (n8n-nodes-base.stopAndError) for critical failures
+- Implement fallback enrichment sources with Switch node (n8n-nodes-base.switch)
+- Log failures to a database or file for later retry
+
+### 4. Data Merging
+**Combine enriched data effectively** (see the sketch after this list)
+- Use Merge node (n8n-nodes-base.merge) with "Merge By Key" mode
+- Specify unique identifiers for accurate matching
+- Handle missing enrichment data with default values
+- Preserve original data alongside enrichments
+
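A sketch of the preserve-plus-default idea in a Code node, assuming a previous node stored the enrichment response on `item.json.enrichment` (the field names are illustrative):

```js
return $input.all().map((item) => {
  const enrichment = item.json.enrichment ?? {};
  return {
    json: {
      ...item.json, // original fields stay intact
      enriched: {
        companySize: enrichment.companySize ?? 'unknown', // defaults for gaps
        industry: enrichment.industry ?? 'unknown',
        enrichedAt: new Date().toISOString(),
      },
    },
  };
});
```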
+### 5. Caching Strategy
+**Minimize redundant API calls**
+- Check cache before making external requests
+- Use Redis node (n8n-nodes-base.redis) or a database for caching
+- Set appropriate TTL values:
+  - Static data: 7-30 days
+  - Dynamic data: 1-24 hours
+  - Real-time data: No caching
+
+### 6. Field Mapping
+**Standardize enriched data structure**
+- Use Set node to rename fields consistently
+- Remove unnecessary fields with unset operations
+- Apply data transformations in Code node for complex mappings
+- Document field mappings in the workflow description
+
+### 7. Quality Scoring
+**Assess enrichment quality**
+- Add confidence scores to enriched fields
+- Track enrichment source for each field
+- Implement validation rules for enriched data
+- Flag incomplete or suspicious enrichments
+
+## Recommended Nodes
+
+### Essential Nodes
+
+**HTTP Request** (n8n-nodes-base.httpRequest):
+- Purpose: Primary enrichment API calls
+- Use cases: Call external APIs for data enrichment
+- Best practices: Configure proper authentication, handle timeouts
+
+**Merge** (n8n-nodes-base.merge):
+- Purpose: Combine original and enriched data
+- Modes: Merge By Key, Merge By Index, Append
+- Best practices: Use unique identifiers for matching, handle missing data
+
+**Set** (n8n-nodes-base.set):
+- Purpose: Transform and standardize data
+- Use cases: Rename fields, remove unnecessary data, add metadata
+- Best practices: Use "Keep Only Set" carefully, document transformations
+
+**IF** (n8n-nodes-base.if):
+- Purpose: Conditional enrichment logic
+- Use cases: Validate required fields, route based on data quality
+- Best practices: Check for null values, validate data types
+
+**SplitInBatches** (n8n-nodes-base.splitInBatches):
+- Purpose: Process large datasets in chunks
+- Use cases: Handle datasets with 100+ items
+- Best practices: Set appropriate batch size (10-50 items), add delays
+
+### Enrichment Sources
+
+**Clearbit** (n8n-nodes-base.clearbit):
+- Purpose: Company and person enrichment
+- Use cases: Enrich email addresses with company data, get person details
+- Best practices: Handle rate limits, cache results
+
+**Hunter** (n8n-nodes-base.hunter):
+- Purpose: Email finder and verification
+- Use cases: Find email addresses, verify email validity
+- Best practices: Respect API quotas, handle verification failures
+
+**Brandfetch** (n8n-nodes-base.Brandfetch):
+- Purpose: Company branding data
+- Use cases: Get company logos, colors, brand assets
+- Best practices: Cache brand data, handle missing brands
+
+**OpenAI** (@n8n/n8n-nodes-langchain.openAi):
+- Purpose: AI-powered data enrichment
+- Use cases: Extract insights, classify data, generate descriptions
+- Best practices: Minimize token usage, batch similar requests
+
+**Google Sheets** (n8n-nodes-base.googleSheets):
+- Purpose: Lookup table enrichment
+- Use cases: Reference data enrichment, mapping tables
+- Best practices: Use efficient lookup methods, cache sheet data
+
+### Utility Nodes
+
+**Code** (n8n-nodes-base.code):
+- Purpose: Custom enrichment logic
+- Use cases: Complex transformations, custom algorithms
+- Best practices: Keep code modular, handle errors gracefully
+
+**Wait** (n8n-nodes-base.wait):
+- Purpose: Rate limiting delays
+- Use cases: Add delays between API calls, implement backoff
+- Best practices: Use appropriate delay values (1-2 seconds)
+
+**DateTime** (n8n-nodes-base.dateTime):
+- Purpose: Timestamp handling
+- Use cases: Add enrichment timestamps, calculate ages
+- Best practices: Use consistent timezone handling
+
+**Redis** (n8n-nodes-base.redis):
+- Purpose: Caching layer
+- Use cases: Cache enrichment results, track processed items
+- Best practices: Set appropriate TTL, handle cache misses
+
+**Error Trigger** (n8n-nodes-base.errorTrigger):
+- Purpose: Error handling workflow
+- Use cases: Global error handling, logging failures
+- Best practices: Implement retry logic, alert on critical failures
+
+**Switch** (n8n-nodes-base.switch):
+- Purpose: Route based on enrichment results
+- Use cases: Fallback enrichment sources, quality-based routing
+- Best practices: Always define a default case
+
+**Stop and Error** (n8n-nodes-base.stopAndError):
+- Purpose: Halt workflow on critical failures
+- Use cases: Stop processing on invalid data, critical API failures
+- Best practices: Use for unrecoverable errors only
+
+## Common Pitfalls to Avoid
+
+### Performance Issues
+
+**Problem**: Enriching all fields for every record
+- **Solution**: Only enrich fields that are actually needed
+- Profile your workflow to identify bottlenecks
+- Use conditional enrichment based on data needs
+
+**Problem**: Sequential processing of independent enrichments
+- **Solution**: Use parallel branches for non-dependent enrichments
+- Split workflow into parallel paths
+- Merge results after parallel processing
+
+**Problem**: No batching for large datasets
+- **Solution**: Always use SplitInBatches for >100 items
+- Set appropriate batch sizes (10-50 items)
+- Add delays between batches
+
+### Data Quality Problems
+
+**Problem**: Overwriting original data with enrichments
+- **Solution**: Preserve original data and add enriched fields separately
+- Use Set node to add new fields without removing original ones
+- Document which fields are enriched
+
+**Problem**: Not handling null or missing enrichment results
+- **Solution**: Implement fallback values and error flags
+- Use IF nodes to check for empty results
+- Add default values for missing enrichments
+
+**Problem**: Mixing data types in enriched fields
+- **Solution**: Enforce consistent data types through validation
+- Convert types explicitly in Set or Code nodes
+- Document expected data types
+
+### Resource Management
+
+**Problem**: No rate limiting on external APIs
+- **Solution**: Implement delays and respect API quotas
+- Use Wait node between API calls
+- Monitor API usage and adjust delays
+
+**Problem**: Infinite retry loops on failures
+- **Solution**: Set maximum retry attempts (typically 3)
+- Use exponential backoff for retries
+- Log failed attempts for manual review
+
+**Problem**: No caching of expensive enrichments
+- **Solution**: Cache results with appropriate expiration times
+- Use Redis or a database for caching
+- Set TTL based on data freshness requirements
+
+### Workflow Design Flaws
+
+**Problem**: Single point of failure for entire enrichment
+- **Solution**: Use error boundaries and continue-on-failure options
+- Enable "Continue on Fail" for non-critical enrichments
+- Implement an Error Trigger workflow
+
+**Problem**: Hard-coded API keys in workflows
+- **Solution**: Use credentials and environment variables
+- Store sensitive data in n8n's credentials system
+- Never commit credentials in workflow JSON
+
+**Problem**: No monitoring or logging of enrichment quality
+- **Solution**: Add metrics collection and alerting
+- Log enrichment success/failure rates
+- Track enrichment coverage and quality
+
+### Common Error Scenarios
+
+**API Rate Limits**:
+- Implement exponential backoff (see the sketch after this list)
+- Add Wait nodes with increasing delays
+- Use SplitInBatches to control request rate
+
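A sketch of the backoff pattern above, written for an n8n Code node. `this.helpers.httpRequest` is n8n's built-in HTTP helper; the URL and retry limits are placeholders:

```js
const MAX_RETRIES = 3;
const BASE_DELAY_MS = 1000;

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Arrow function so `this` (the Code node context) is captured lexically.
const fetchWithBackoff = async (url) => {
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    try {
      return await this.helpers.httpRequest({ url, method: 'GET' });
    } catch (error) {
      if (attempt === MAX_RETRIES) throw error; // give up after the last retry
      await sleep(BASE_DELAY_MS * 2 ** attempt); // 1s, 2s, 4s, ...
    }
  }
};

const data = await fetchWithBackoff('https://api.example.com/enrich'); // placeholder URL
return [{ json: { data } }];
```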
+**Invalid API Responses**:
+- Validate response structure before processing
+- Use IF nodes to check response format
+- Log unexpected responses for debugging
+
+**Timeout Issues**:
+- Set reasonable timeout values (10-30s)
+- Use shorter timeouts for non-critical enrichments
+- Implement retry logic for timeouts
+
+**Data Mismatches**:
+- Use fuzzy matching for lookups
+- Normalize data before matching
+- Handle missing keys gracefully
+
+**Duplicate Enrichments**:
+- Implement deduplication logic
+- Check cache before enriching
+- Use unique identifiers for tracking
+`;
+    getDocumentation() {
+        return this.documentation;
+    }
+}
+exports.EnrichmentBestPractices = EnrichmentBestPractices;
+//# sourceMappingURL=enrichment.js.map

package/dist/tools/best-practices/enrichment.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"enrichment.js","sourceRoot":"","sources":["../../../src/tools/best-practices/enrichment.ts"],"names":[],"mappings":";;;AACA,2DAA2D;AAE3D,MAAa,uBAAuB;IAC1B,SAAS,GAAG,kCAAiB,CAAC,UAAU,CAAC;IACzC,OAAO,GAAG,OAAO,CAAC;IAEV,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAiQjC,CAAC;IAED,gBAAgB;QACf,OAAO,IAAI,CAAC,aAAa,CAAC;IAC3B,CAAC;CACD;AA1QD,0DA0QC"}

package/dist/tools/best-practices/human-in-the-loop.d.ts
@@ -0,0 +1,7 @@
+import type { BestPracticesDocument } from '../../types/best-practices';
+export declare class HumanInTheLoopBestPractices implements BestPracticesDocument {
+    readonly technique: "human_in_the_loop";
+    readonly version = "1.0.0";
+    private readonly documentation;
+    getDocumentation(): string;
+}