omgkit 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/plugin/skills/databases/mongodb/SKILL.md +60 -776
- package/plugin/skills/databases/prisma/SKILL.md +53 -744
- package/plugin/skills/databases/redis/SKILL.md +53 -860
- package/plugin/skills/devops/aws/SKILL.md +68 -672
- package/plugin/skills/devops/github-actions/SKILL.md +54 -657
- package/plugin/skills/devops/kubernetes/SKILL.md +67 -602
- package/plugin/skills/devops/performance-profiling/SKILL.md +59 -863
- package/plugin/skills/frameworks/django/SKILL.md +87 -853
- package/plugin/skills/frameworks/express/SKILL.md +95 -1301
- package/plugin/skills/frameworks/fastapi/SKILL.md +90 -1198
- package/plugin/skills/frameworks/laravel/SKILL.md +87 -1187
- package/plugin/skills/frameworks/nestjs/SKILL.md +106 -973
- package/plugin/skills/frameworks/react/SKILL.md +94 -962
- package/plugin/skills/frameworks/vue/SKILL.md +95 -1242
- package/plugin/skills/frontend/accessibility/SKILL.md +91 -1056
- package/plugin/skills/frontend/frontend-design/SKILL.md +69 -1262
- package/plugin/skills/frontend/responsive/SKILL.md +76 -799
- package/plugin/skills/frontend/shadcn-ui/SKILL.md +73 -921
- package/plugin/skills/frontend/tailwindcss/SKILL.md +60 -788
- package/plugin/skills/frontend/threejs/SKILL.md +72 -1266
- package/plugin/skills/languages/javascript/SKILL.md +106 -849
- package/plugin/skills/methodology/brainstorming/SKILL.md +70 -576
- package/plugin/skills/methodology/defense-in-depth/SKILL.md +79 -831
- package/plugin/skills/methodology/dispatching-parallel-agents/SKILL.md +81 -654
- package/plugin/skills/methodology/executing-plans/SKILL.md +86 -529
- package/plugin/skills/methodology/finishing-development-branch/SKILL.md +95 -586
- package/plugin/skills/methodology/problem-solving/SKILL.md +67 -681
- package/plugin/skills/methodology/receiving-code-review/SKILL.md +70 -533
- package/plugin/skills/methodology/requesting-code-review/SKILL.md +70 -610
- package/plugin/skills/methodology/root-cause-tracing/SKILL.md +70 -646
- package/plugin/skills/methodology/sequential-thinking/SKILL.md +70 -478
- package/plugin/skills/methodology/systematic-debugging/SKILL.md +66 -559
- package/plugin/skills/methodology/test-driven-development/SKILL.md +91 -752
- package/plugin/skills/methodology/testing-anti-patterns/SKILL.md +78 -687
- package/plugin/skills/methodology/token-optimization/SKILL.md +72 -602
- package/plugin/skills/methodology/verification-before-completion/SKILL.md +108 -529
- package/plugin/skills/methodology/writing-plans/SKILL.md +79 -566
- package/plugin/skills/omega/omega-architecture/SKILL.md +91 -752
- package/plugin/skills/omega/omega-coding/SKILL.md +161 -552
- package/plugin/skills/omega/omega-sprint/SKILL.md +132 -777
- package/plugin/skills/omega/omega-testing/SKILL.md +157 -845
- package/plugin/skills/omega/omega-thinking/SKILL.md +165 -606
- package/plugin/skills/security/better-auth/SKILL.md +46 -1034
- package/plugin/skills/security/oauth/SKILL.md +80 -934
- package/plugin/skills/security/owasp/SKILL.md +78 -862
- package/plugin/skills/testing/playwright/SKILL.md +77 -700
- package/plugin/skills/testing/pytest/SKILL.md +73 -811
- package/plugin/skills/testing/vitest/SKILL.md +60 -920
- package/plugin/skills/tools/document-processing/SKILL.md +111 -838
- package/plugin/skills/tools/image-processing/SKILL.md +126 -659
- package/plugin/skills/tools/mcp-development/SKILL.md +85 -758
- package/plugin/skills/tools/media-processing/SKILL.md +118 -735
- package/plugin/stdrules/SKILL_STANDARDS.md +490 -0
- package/plugin/skills/SKILL_STANDARDS.md +0 -743
@@ -1,6 +1,6 @@
 ---
-name:
-description:
+name: Processing Documents
+description: Processes PDF, DOCX, XLSX, and PPTX files with extraction, generation, and batch operations. Use when building document pipelines, extracting content from office files, or generating reports.
 category: tools
 triggers:
   - document processing
@@ -12,905 +12,178 @@ triggers:
   - office documents
 ---

-#
+# Processing Documents

-
-
-## Purpose
-
-Handle document processing tasks that enterprise applications commonly require:
-
-- Extract text and structured data from PDFs
-- Parse and generate Word documents
-- Manipulate Excel spreadsheets programmatically
-- Create PowerPoint presentations from data
-- Process documents in batch with progress tracking
-
-## Features
-
-### 1. PDF Processing
+## Quick Start

 ```typescript
-
-import
-
-interface PDFExtractionResult {
-  text: string;
-  pages: PageContent[];
-  metadata: PDFMetadata;
-  tables: ExtractedTable[];
-  images: ExtractedImage[];
-}
+import { PDFDocument } from 'pdf-lib';
+import ExcelJS from 'exceljs';
+import { Document, Packer, Paragraph, TextRun } from 'docx';

-//
+// Extract text from PDF
 async function extractPDFText(buffer: Buffer): Promise<string> {
-  const pdfExtract = new PDFExtract();
-  const data = await pdfExtract.extractBuffer(buffer);
-
-  return data.pages
-    .map(page => page.content
-      .map(item => item.str)
-      .join(' ')
-    )
-    .join('\n\n');
-}
-
-// Structured extraction with tables
-async function extractStructuredPDF(buffer: Buffer): Promise<PDFExtractionResult> {
   const pdfDoc = await PDFDocument.load(buffer);
-  const pages
-
-  for (let i = 0; i < pdfDoc.getPageCount(); i++) {
-    const page = pdfDoc.getPage(i);
-    pages.push({
-      pageNumber: i + 1,
-      width: page.getWidth(),
-      height: page.getHeight(),
-      content: await extractPageContent(page),
-      tables: await detectTables(page),
-    });
-  }
-
-  return {
-    text: pages.map(p => p.content).join('\n\n'),
-    pages,
-    metadata: await extractMetadata(pdfDoc),
-    tables: pages.flatMap(p => p.tables),
-    images: await extractImages(pdfDoc),
-  };
+  const pages = pdfDoc.getPages();
+  return pages.map(page => page.getTextContent()).join('\n\n');
 }

-//
-async function
-  const
-
-
-
-
-
-  switch (element.type) {
-    case 'text':
-      const text = substituteVariables(element.content, data);
-      page.drawText(text, {
-        x: element.x,
-        y: height - element.y,
-        size: element.fontSize || 12,
-        font: await pdfDoc.embedFont(element.font || StandardFonts.Helvetica),
-      });
-      break;
-    case 'image':
-      const imageBytes = await fetch(data[element.dataKey]).then(r => r.arrayBuffer());
-      const image = await pdfDoc.embedPng(imageBytes);
-      page.drawImage(image, {
-        x: element.x,
-        y: height - element.y - element.height,
-        width: element.width,
-        height: element.height,
-      });
-      break;
-    case 'table':
-      await drawTable(page, element, data[element.dataKey]);
-      break;
-  }
-}
-
-  return Buffer.from(await pdfDoc.save());
-}
-```
-
-### 2. Word Document Processing (DOCX)
-
-```typescript
-import { Document, Paragraph, TextRun, Table, TableRow, TableCell, Packer } from 'docx';
-
-// Parse DOCX to structured format
-interface DOCXContent {
-  paragraphs: ParsedParagraph[];
-  tables: ParsedTable[];
-  images: ParsedImage[];
-  styles: DocumentStyles;
-  metadata: DocumentMetadata;
-}
-
-async function parseDOCX(buffer: Buffer): Promise<DOCXContent> {
-  const zip = new JSZip();
-  const doc = await zip.loadAsync(buffer);
-
-  // Parse document.xml
-  const documentXml = await doc.file('word/document.xml')?.async('string');
-  const parser = new XMLParser();
-  const parsed = parser.parse(documentXml);
-
-  // Extract content preserving structure
-  return {
-    paragraphs: extractParagraphs(parsed),
-    tables: extractTables(parsed),
-    images: await extractImages(doc),
-    styles: await parseStyles(doc),
-    metadata: await parseMetadata(doc),
-  };
+// Read Excel spreadsheet
+async function readExcel(buffer: Buffer) {
+  const workbook = new ExcelJS.Workbook();
+  await workbook.xlsx.load(buffer);
+  return workbook.worksheets.map(sheet => ({
+    name: sheet.name,
+    rows: sheet.getSheetValues(),
+  }));
 }

-// Generate
-async function generateDOCX(
+// Generate Word document
+async function generateDOCX(title: string, content: string[]): Promise<Buffer> {
   const doc = new Document({
     sections: [{
-      properties: {
-        page: {
-          margin: { top: 720, right: 720, bottom: 720, left: 720 },
-        },
-      },
       children: [
-
-        new Paragraph({
-          children: [
-            new TextRun({
-              text: config.title,
-              bold: true,
-              size: 48,
-            }),
-          ],
-          heading: HeadingLevel.HEADING_1,
-          spacing: { after: 200 },
-        }),
-
-        // Content paragraphs
-        ...config.content.map(section => new Paragraph({
-          children: [
-            new TextRun({
-              text: section.text,
-              size: 24,
-            }),
-          ],
-          spacing: { after: 120 },
-        })),
-
-        // Table if data provided
-        ...(config.tableData ? [createTable(config.tableData)] : []),
+        new Paragraph({ children: [new TextRun({ text: title, bold: true, size: 48 })] }),
+        ...content.map(text => new Paragraph({ children: [new TextRun(text)] })),
       ],
     }],
   });
-
   return await Packer.toBuffer(doc);
 }
-
-// Create formatted table
-function createTable(data: TableData): Table {
-  return new Table({
-    rows: [
-      // Header row
-      new TableRow({
-        children: data.headers.map(header =>
-          new TableCell({
-            children: [new Paragraph({
-              children: [new TextRun({ text: header, bold: true })],
-            })],
-            shading: { fill: 'f0f0f0' },
-          })
-        ),
-        tableHeader: true,
-      }),
-      // Data rows
-      ...data.rows.map(row =>
-        new TableRow({
-          children: row.map(cell =>
-            new TableCell({
-              children: [new Paragraph({ children: [new TextRun(cell)] })],
-            })
-          ),
-        })
-      ),
-    ],
-    width: { size: 100, type: WidthType.PERCENTAGE },
-  });
-}
 ```

-
-
-```typescript
-import ExcelJS from 'exceljs';
-
-interface SpreadsheetData {
-  sheets: SheetData[];
-  metadata: WorkbookMetadata;
-}
-
-interface SheetData {
-  name: string;
-  headers: string[];
-  rows: Record<string, any>[];
-  formulas: FormulaCell[];
-  charts: ChartDefinition[];
-}
-
-// Read Excel with full fidelity
-async function readExcel(buffer: Buffer): Promise<SpreadsheetData> {
-  const workbook = new ExcelJS.Workbook();
-  await workbook.xlsx.load(buffer);
-
-  const sheets: SheetData[] = [];
-
-  workbook.eachSheet((worksheet, sheetId) => {
-    const headers: string[] = [];
-    const rows: Record<string, any>[] = [];
-    const formulas: FormulaCell[] = [];
-
-    // Get headers from first row
-    worksheet.getRow(1).eachCell((cell, colNumber) => {
-      headers[colNumber - 1] = cell.value?.toString() || `Column${colNumber}`;
-    });
-
-    // Get data rows
-    worksheet.eachRow((row, rowNumber) => {
-      if (rowNumber === 1) return; // Skip header
-
-      const rowData: Record<string, any> = {};
-      row.eachCell((cell, colNumber) => {
-        const header = headers[colNumber - 1];
-
-        // Preserve formulas
-        if (cell.formula) {
-          formulas.push({
-            row: rowNumber,
-            col: colNumber,
-            formula: cell.formula,
-            result: cell.value,
-          });
-        }
-
-        rowData[header] = cell.value;
-      });
-
-      rows.push(rowData);
-    });
-
-    sheets.push({
-      name: worksheet.name,
-      headers,
-      rows,
-      formulas,
-      charts: extractCharts(worksheet),
-    });
-  });
-
-  return {
-    sheets,
-    metadata: {
-      creator: workbook.creator,
-      created: workbook.created,
-      modified: workbook.modified,
-    },
-  };
-}
-
-// Generate Excel with formatting
-async function generateExcel(config: ExcelConfig): Promise<Buffer> {
-  const workbook = new ExcelJS.Workbook();
-  workbook.creator = config.author || 'Document Processor';
-  workbook.created = new Date();
-
-  for (const sheetConfig of config.sheets) {
-    const worksheet = workbook.addWorksheet(sheetConfig.name);
-
-    // Add headers with styling
-    worksheet.addRow(sheetConfig.headers);
-    worksheet.getRow(1).font = { bold: true };
-    worksheet.getRow(1).fill = {
-      type: 'pattern',
-      pattern: 'solid',
-      fgColor: { argb: 'FFE0E0E0' },
-    };
-
-    // Add data rows
-    for (const row of sheetConfig.data) {
-      worksheet.addRow(sheetConfig.headers.map(h => row[h]));
-    }
-
-    // Auto-fit columns
-    worksheet.columns.forEach(column => {
-      let maxLength = 0;
-      column.eachCell({ includeEmpty: true }, cell => {
-        const cellLength = cell.value?.toString().length || 10;
-        maxLength = Math.max(maxLength, cellLength);
-      });
-      column.width = Math.min(maxLength + 2, 50);
-    });
-
-    // Add formulas if specified
-    if (sheetConfig.formulas) {
-      for (const formula of sheetConfig.formulas) {
-        worksheet.getCell(formula.cell).value = { formula: formula.formula };
-      }
-    }
-
-    // Add conditional formatting
-    if (sheetConfig.conditionalFormatting) {
-      worksheet.addConditionalFormatting({
-        ref: sheetConfig.conditionalFormatting.range,
-        rules: sheetConfig.conditionalFormatting.rules,
-      });
-    }
-  }
-
-  return Buffer.from(await workbook.xlsx.writeBuffer());
-}
-
-// Data transformation utilities
-function transformExcelData(data: SheetData, transform: DataTransform): SheetData {
-  let rows = [...data.rows];
-
-  // Filter rows
-  if (transform.filter) {
-    rows = rows.filter(row => transform.filter!(row));
-  }
-
-  // Map columns
-  if (transform.columnMap) {
-    rows = rows.map(row => {
-      const newRow: Record<string, any> = {};
-      for (const [oldKey, newKey] of Object.entries(transform.columnMap!)) {
-        newRow[newKey] = row[oldKey];
-      }
-      return newRow;
-    });
-  }
+## Features

-
-
-
-
+| Feature | Description | Guide |
+|---------|-------------|-------|
+| PDF Extraction | Extract text, tables, images, and metadata from PDFs | Use pdf-lib or pdf-parse for text extraction |
+| PDF Generation | Create PDFs from templates with data binding | Use pdf-lib with text, images, and table elements |
+| DOCX Parsing | Parse Word documents preserving structure | Use mammoth or docx library for parsing |
+| DOCX Generation | Generate Word documents with formatting | Use docx package with paragraphs and tables |
+| Excel Reading | Read spreadsheets with formulas and formatting | Use exceljs to iterate sheets and cells |
+| Excel Generation | Create spreadsheets with charts and styling | Use exceljs with conditional formatting |
+| PPTX Generation | Create presentations with slides and charts | Use pptxgenjs for slide creation |
+| Batch Processing | Process multiple documents with concurrency | Use p-queue for controlled parallel processing |
+| Template Engine | Generate documents from templates with placeholders | Use docxtemplater for DOCX templates |
+| Streaming | Handle large files without memory exhaustion | Process files in chunks with streams |

-
-    ...data,
-    headers: transform.columnMap
-      ? Object.values(transform.columnMap)
-      : data.headers,
-    rows,
-  };
-}
-```
+## Common Patterns

-###
+### Batch Document Processing

 ```typescript
-import
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-}
-
-// Generate PowerPoint presentation
-async function generatePPTX(config: PresentationConfig): Promise<Buffer> {
-  const pptx = new PptxGenJS();
-
-  // Set metadata
-  pptx.title = config.title;
-  pptx.author = config.author || 'Document Processor';
-
-  // Apply theme
-  if (config.theme) {
-    pptx.defineLayout({
-      name: 'CUSTOM',
-      width: config.theme.width || 10,
-      height: config.theme.height || 7.5,
+import PQueue from 'p-queue';
+
+async function processBatch(files: string[], transform: (buffer: Buffer) => Promise<Buffer>) {
+  const queue = new PQueue({ concurrency: 4 });
+  const results: { file: string; success: boolean; error?: string }[] = [];
+
+  for (const file of files) {
+    queue.add(async () => {
+      try {
+        const buffer = await fs.readFile(file);
+        const output = await transform(buffer);
+        await fs.writeFile(file.replace(/\.\w+$/, '_processed.pdf'), output);
+        results.push({ file, success: true });
+      } catch (error) {
+        results.push({ file, success: false, error: error.message });
+      }
     });
   }

-
-
-    const slide = pptx.addSlide();
-
-    switch (slideConfig.layout) {
-      case 'title':
-        slide.addText(slideConfig.title || '', {
-          x: 0.5, y: 2.5, w: 9, h: 1,
-          fontSize: 44, bold: true, align: 'center',
-        });
-        if (slideConfig.subtitle) {
-          slide.addText(slideConfig.subtitle, {
-            x: 0.5, y: 3.5, w: 9, h: 0.5,
-            fontSize: 24, color: '666666', align: 'center',
-          });
-        }
-        break;
-
-      case 'content':
-        slide.addText(slideConfig.title || '', {
-          x: 0.5, y: 0.3, w: 9, h: 0.8,
-          fontSize: 32, bold: true,
-        });
-        let yPos = 1.2;
-        for (const content of slideConfig.content || []) {
-          yPos = addSlideContent(slide, content, yPos);
-        }
-        break;
-
-      case 'twoColumn':
-        slide.addText(slideConfig.title || '', {
-          x: 0.5, y: 0.3, w: 9, h: 0.8,
-          fontSize: 32, bold: true,
-        });
-        // Left column
-        addSlideContent(slide, slideConfig.content![0], 1.2, 0.5, 4.3);
-        // Right column
-        addSlideContent(slide, slideConfig.content![1], 1.2, 5.2, 4.3);
-        break;
-    }
-
-    // Add speaker notes
-    if (slideConfig.notes) {
-      slide.addNotes(slideConfig.notes);
-    }
-  }
-
-  return Buffer.from(await pptx.write({ outputType: 'arraybuffer' }));
-}
-
-// Add various content types to slide
-function addSlideContent(
-  slide: PptxGenJS.Slide,
-  content: SlideContent,
-  y: number,
-  x: number = 0.5,
-  w: number = 9
-): number {
-  switch (content.type) {
-    case 'text':
-      slide.addText(content.value, {
-        x, y, w, h: 0.5,
-        fontSize: content.fontSize || 18,
-        bullet: content.bullet,
-      });
-      return y + 0.6;
-
-    case 'bullets':
-      slide.addText(
-        content.items.map(item => ({ text: item, options: { bullet: true } })),
-        { x, y, w, fontSize: 18 }
-      );
-      return y + content.items.length * 0.4 + 0.2;
-
-    case 'image':
-      slide.addImage({
-        path: content.path,
-        x, y, w: content.width || 4, h: content.height || 3,
-      });
-      return y + (content.height || 3) + 0.2;
-
-    case 'chart':
-      slide.addChart(content.chartType, content.data, {
-        x, y, w, h: content.height || 4,
-      });
-      return y + (content.height || 4) + 0.2;
-
-    case 'table':
-      slide.addTable(content.data, {
-        x, y, w,
-        border: { pt: 1, color: 'CFCFCF' },
-        fontFace: 'Arial',
-        fontSize: 14,
-      });
-      return y + content.data.length * 0.4 + 0.2;
-
-    default:
-      return y;
-  }
+  await queue.onIdle();
+  return results;
 }
 ```

-###
+### Excel Report Generation

 ```typescript
-
-
-
-  concurrency: number;
-  transform: DocumentTransform;
-  onProgress?: (progress: BatchProgress) => void;
-  onError?: (error: BatchError) => void;
-}
-
-interface BatchProgress {
-  total: number;
-  processed: number;
-  succeeded: number;
-  failed: number;
-  currentFile: string;
-}
-
-// Batch document processing with streaming
-async function processBatch(config: BatchConfig): Promise<BatchResult> {
-  const files = await glob(`${config.inputDir}/**/*.{pdf,docx,xlsx,pptx}`);
-  const results: ProcessingResult[] = [];
-
-  const progress: BatchProgress = {
-    total: files.length,
-    processed: 0,
-    succeeded: 0,
-    failed: 0,
-    currentFile: '',
-  };
-
-  // Process with concurrency limit
-  const queue = new PQueue({ concurrency: config.concurrency });
-
-  const tasks = files.map(file => queue.add(async () => {
-    progress.currentFile = file;
-    config.onProgress?.(progress);
-
-    try {
-      const buffer = await fs.readFile(file);
-      const ext = path.extname(file).toLowerCase();
-
-      // Process based on file type
-      let result: Buffer;
-      switch (ext) {
-        case '.pdf':
-          result = await transformPDF(buffer, config.transform);
-          break;
-        case '.docx':
-          result = await transformDOCX(buffer, config.transform);
-          break;
-        case '.xlsx':
-          result = await transformExcel(buffer, config.transform);
-          break;
-        case '.pptx':
-          result = await transformPPTX(buffer, config.transform);
-          break;
-        default:
-          throw new Error(`Unsupported file type: ${ext}`);
-      }
-
-      // Write output
-      const outputPath = path.join(
-        config.outputDir,
-        path.relative(config.inputDir, file)
-      );
-      await fs.mkdir(path.dirname(outputPath), { recursive: true });
-      await fs.writeFile(outputPath, result);
-
-      progress.succeeded++;
-      results.push({ file, success: true });
-    } catch (error) {
-      progress.failed++;
-      results.push({ file, success: false, error: error.message });
-      config.onError?.({ file, error });
-    } finally {
-      progress.processed++;
-      config.onProgress?.(progress);
-    }
-  }));
+async function generateReport(data: Record<string, any>[]): Promise<Buffer> {
+  const workbook = new ExcelJS.Workbook();
+  const sheet = workbook.addWorksheet('Report');

-
+  // Add headers with styling
+  const headers = Object.keys(data[0] || {});
+  sheet.addRow(headers);
+  sheet.getRow(1).font = { bold: true };
+  sheet.getRow(1).fill = { type: 'pattern', pattern: 'solid', fgColor: { argb: 'FFE0E0E0' } };

-
-
-    succeeded: progress.succeeded,
-    failed: progress.failed,
-    results,
-  };
-}
+  // Add data rows
+  data.forEach(row => sheet.addRow(headers.map(h => row[h])));

-//
-
-  inputStream: ReadStream,
-  transform: ChunkTransform
-): AsyncGenerator<Buffer> {
-  const chunks: Buffer[] = [];
-  let processedSize = 0;
-
-  for await (const chunk of inputStream) {
-    chunks.push(chunk);
-    processedSize += chunk.length;
-
-    // Process in chunks for memory efficiency
-    if (processedSize >= CHUNK_SIZE) {
-      const combined = Buffer.concat(chunks);
-      yield await transform(combined);
-      chunks.length = 0;
-      processedSize = 0;
-    }
-  }
+  // Auto-fit columns
+  sheet.columns.forEach(col => { col.width = 15; });

-
-  if (chunks.length > 0) {
-    yield await transform(Buffer.concat(chunks));
-  }
+  return Buffer.from(await workbook.xlsx.writeBuffer());
 }
 ```

-###
+### Invoice Generation from Template

 ```typescript
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  template: DocumentTemplate,
-  data: Record<string, any>
-): Promise<Buffer> {
-  const templateBuffer = await fs.readFile(template.templatePath);
-
-  switch (template.type) {
-    case 'docx':
-      return generateDOCXFromTemplate(templateBuffer, template.placeholders, data);
-    case 'xlsx':
-      return generateExcelFromTemplate(templateBuffer, template.placeholders, data);
-    case 'pptx':
-      return generatePPTXFromTemplate(templateBuffer, template.placeholders, data);
-    case 'pdf':
-      return generatePDFFromTemplate(templateBuffer, template.placeholders, data);
-  }
-}
-
-// DOCX template processing
-async function generateDOCXFromTemplate(
-  templateBuffer: Buffer,
-  placeholders: PlaceholderConfig[],
-  data: Record<string, any>
-): Promise<Buffer> {
-  const doc = new Docxtemplater(new PizZip(templateBuffer), {
-    paragraphLoop: true,
-    linebreaks: true,
-  });
-
-  // Build data object with formatting
-  const templateData: Record<string, any> = {};
-
-  for (const placeholder of placeholders) {
-    const value = data[placeholder.key];
-
-    switch (placeholder.type) {
-      case 'text':
-        templateData[placeholder.key] = formatText(value, placeholder.format);
-        break;
-      case 'table':
-        templateData[placeholder.key] = formatTableData(value);
-        break;
-      case 'image':
-        templateData[placeholder.key] = await loadImage(value);
-        break;
-      case 'list':
-        templateData[placeholder.key] = value.map((item: any) => ({ item }));
-        break;
-    }
-  }
-
-  doc.render(templateData);
-
-  return doc.getZip().generate({
-    type: 'nodebuffer',
-    compression: 'DEFLATE',
+import Docxtemplater from 'docxtemplater';
+import PizZip from 'pizzip';
+
+async function generateInvoice(templatePath: string, invoiceData: InvoiceData): Promise<Buffer> {
+  const templateBuffer = await fs.readFile(templatePath);
+  const zip = new PizZip(templateBuffer);
+  const doc = new Docxtemplater(zip, { paragraphLoop: true, linebreaks: true });
+
+  doc.render({
+    invoiceNumber: invoiceData.number,
+    date: invoiceData.date,
+    customer: invoiceData.customer,
+    items: invoiceData.items,
+    total: invoiceData.total,
   });
-}
-```
-
-## Use Cases

-
-
-```typescript
-// Generate invoices from order data
-async function generateInvoice(order: Order): Promise<Buffer> {
-  const template: DOCXConfig = {
-    title: `Invoice #${order.invoiceNumber}`,
-    content: [
-      { text: `Date: ${formatDate(order.date)}` },
-      { text: `Customer: ${order.customer.name}` },
-      { text: `Address: ${order.customer.address}` },
-    ],
-    tableData: {
-      headers: ['Item', 'Quantity', 'Price', 'Total'],
-      rows: order.items.map(item => [
-        item.name,
-        item.quantity.toString(),
-        formatCurrency(item.price),
-        formatCurrency(item.quantity * item.price),
-      ]),
-    },
-  };
-
-  // Add totals
-  template.content.push(
-    { text: '' },
-    { text: `Subtotal: ${formatCurrency(order.subtotal)}` },
-    { text: `Tax: ${formatCurrency(order.tax)}` },
-    { text: `Total: ${formatCurrency(order.total)}`, bold: true },
-  );
-
-  return generateDOCX(template);
+  return doc.getZip().generate({ type: 'nodebuffer', compression: 'DEFLATE' });
 }
 ```

-###
+### PDF Table Extraction

 ```typescript
-
-
-
-    author: 'Analytics System',
-    sheets: [
-      {
-        name: 'Summary',
-        headers: ['Metric', 'Value', 'Change'],
-        data: dashboard.kpis.map(kpi => ({
-          Metric: kpi.name,
-          Value: kpi.value,
-          Change: `${kpi.change > 0 ? '+' : ''}${kpi.change}%`,
-        })),
-        conditionalFormatting: {
-          range: 'C2:C100',
-          rules: [
-            { type: 'cellIs', operator: 'greaterThan', formulae: [0], style: { fill: { argb: 'FF00FF00' } } },
-            { type: 'cellIs', operator: 'lessThan', formulae: [0], style: { fill: { argb: 'FFFF0000' } } },
-          ],
-        },
-      },
-      {
-        name: 'Detailed Data',
-        headers: Object.keys(dashboard.detailedData[0] || {}),
-        data: dashboard.detailedData,
-      },
-    ],
-  });
-}
-```
+async function extractTables(pdfBuffer: Buffer): Promise<ExtractedTable[]> {
+  const pdfDoc = await PDFDocument.load(pdfBuffer);
+  const tables: ExtractedTable[] = [];

-
+  for (let i = 0; i < pdfDoc.getPageCount(); i++) {
+    const page = pdfDoc.getPage(i);
+    const content = await extractPageContent(page);
+    const detectedTables = detectTableStructures(content);
+    tables.push(...detectedTables.map(t => ({ ...t, pageNumber: i + 1 })));
+  }

-
-// Extract and analyze contract data
-async function analyzeContract(pdfBuffer: Buffer): Promise<ContractAnalysis> {
-  const extracted = await extractStructuredPDF(pdfBuffer);
-
-  return {
-    parties: extractParties(extracted.text),
-    dates: extractDates(extracted.text),
-    amounts: extractMonetaryAmounts(extracted.text),
-    clauses: categorizeClausses(extracted.text),
-    signatures: detectSignatures(extracted.images),
-    tables: extracted.tables.map(analyzeTable),
-  };
+  return tables;
 }
 ```

 ## Best Practices

-
-
-
-
-
-
-
-
-
-
-
-
-- Don't assume file extensions match content
-- Don't ignore password-protected documents
-- Don't strip metadata without user consent
-- Don't process untrusted files without sandboxing
-- Don't skip error handling for corrupt files
-
-### Error Handling
-
-```typescript
-class DocumentProcessingError extends Error {
-  constructor(
-    message: string,
-    public readonly code: ErrorCode,
-    public readonly file?: string,
-    public readonly cause?: Error
-  ) {
-    super(message);
-    this.name = 'DocumentProcessingError';
-  }
-}
-
-enum ErrorCode {
-  INVALID_FORMAT = 'INVALID_FORMAT',
-  CORRUPT_FILE = 'CORRUPT_FILE',
-  PASSWORD_PROTECTED = 'PASSWORD_PROTECTED',
-  ENCODING_ERROR = 'ENCODING_ERROR',
-  SIZE_LIMIT_EXCEEDED = 'SIZE_LIMIT_EXCEEDED',
-  UNSUPPORTED_FEATURE = 'UNSUPPORTED_FEATURE',
-}
-
-// Comprehensive error handling
-async function safeProcessDocument(buffer: Buffer, filename: string): Promise<ProcessResult> {
-  try {
-    // Validate file
-    const fileType = await detectFileType(buffer);
-    if (!SUPPORTED_TYPES.includes(fileType)) {
-      throw new DocumentProcessingError(
-        `Unsupported file type: ${fileType}`,
-        ErrorCode.INVALID_FORMAT,
-        filename
-      );
-    }
-
-    // Check size
-    if (buffer.length > MAX_FILE_SIZE) {
-      throw new DocumentProcessingError(
-        `File exceeds maximum size of ${MAX_FILE_SIZE} bytes`,
-        ErrorCode.SIZE_LIMIT_EXCEEDED,
-        filename
-      );
-    }
-
-    // Process
-    return await processDocument(buffer, fileType);
-  } catch (error) {
-    if (error instanceof DocumentProcessingError) {
-      throw error;
-    }
-
-    // Wrap unexpected errors
-    throw new DocumentProcessingError(
-      `Failed to process document: ${error.message}`,
-      ErrorCode.CORRUPT_FILE,
-      filename,
-      error
-    );
-  }
-}
-```
+| Do | Avoid |
+|----|-------|
+| Stream large files (>10MB) to prevent memory issues | Loading entire large files into memory |
+| Validate file types before processing | Assuming file extensions match content |
+| Handle password-protected documents gracefully | Ignoring encrypted document errors |
+| Preserve original formatting when transforming | Stripping formatting without user consent |
+| Cache parsed results for repeated access | Re-parsing the same document multiple times |
+| Use appropriate libraries per format | Building custom parsers for standard formats |
+| Set file size limits for uploads | Processing unbounded file sizes |
+| Sanitize filenames and paths | Using untrusted paths directly |
+| Handle encoding issues (UTF-8, BOM) | Assuming all files use the same encoding |
+| Log processing errors with context | Silently failing on corrupt files |

 ## Related Skills

-- **
+- **media-processing** - Video and audio processing
+- **image-processing** - Image manipulation with Sharp
 - **typescript** - Type-safe document handling
-- **data-processing** - Data transformation utilities
-- **api-architecture** - Document API design patterns

-##
+## References

 - [pdf-lib Documentation](https://pdf-lib.js.org/)
 - [ExcelJS Documentation](https://github.com/exceljs/exceljs)
 - [docx Documentation](https://docx.js.org/)
 - [PptxGenJS Documentation](https://gitbrent.github.io/PptxGenJS/)
-- [Apache POI](https://poi.apache.org/) - Java reference