omgkit 2.0.7 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/plugin/skills/backend/api-architecture/SKILL.md +857 -0
- package/plugin/skills/backend/caching-strategies/SKILL.md +755 -0
- package/plugin/skills/backend/event-driven-architecture/SKILL.md +753 -0
- package/plugin/skills/backend/real-time-systems/SKILL.md +635 -0
- package/plugin/skills/databases/database-optimization/SKILL.md +571 -0
- package/plugin/skills/devops/monorepo-management/SKILL.md +595 -0
- package/plugin/skills/devops/observability/SKILL.md +622 -0
- package/plugin/skills/devops/performance-profiling/SKILL.md +905 -0
- package/plugin/skills/frontend/advanced-ui-design/SKILL.md +426 -0
- package/plugin/skills/integrations/ai-integration/SKILL.md +730 -0
- package/plugin/skills/integrations/payment-integration/SKILL.md +735 -0
- package/plugin/skills/methodology/problem-solving/SKILL.md +355 -0
- package/plugin/skills/methodology/research-validation/SKILL.md +668 -0
- package/plugin/skills/methodology/sequential-thinking/SKILL.md +260 -0
- package/plugin/skills/mobile/mobile-development/SKILL.md +756 -0
- package/plugin/skills/security/security-hardening/SKILL.md +633 -0
- package/plugin/skills/tools/document-processing/SKILL.md +916 -0
- package/plugin/skills/tools/image-processing/SKILL.md +748 -0
- package/plugin/skills/tools/mcp-development/SKILL.md +883 -0
- package/plugin/skills/tools/media-processing/SKILL.md +831 -0
|
@@ -0,0 +1,916 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: document-processing
|
|
3
|
+
description: Enterprise-grade document processing for PDF, DOCX, XLSX, PPTX with streaming, validation, and batch operations
|
|
4
|
+
category: tools
|
|
5
|
+
triggers:
|
|
6
|
+
- document processing
|
|
7
|
+
- pdf extraction
|
|
8
|
+
- docx parsing
|
|
9
|
+
- excel manipulation
|
|
10
|
+
- spreadsheet data
|
|
11
|
+
- powerpoint generation
|
|
12
|
+
- office documents
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
# Document Processing
|
|
16
|
+
|
|
17
|
+
Enterprise-grade **document processing** for PDF, DOCX, XLSX, and PPTX files. This skill enables extraction, manipulation, generation, and batch processing of office documents with streaming support for large files.
|
|
18
|
+
|
|
19
|
+
## Purpose
|
|
20
|
+
|
|
21
|
+
Handle document processing tasks that enterprise applications commonly require:
|
|
22
|
+
|
|
23
|
+
- Extract text and structured data from PDFs
|
|
24
|
+
- Parse and generate Word documents
|
|
25
|
+
- Manipulate Excel spreadsheets programmatically
|
|
26
|
+
- Create PowerPoint presentations from data
|
|
27
|
+
- Process documents in batch with progress tracking
|
|
28
|
+
|
|
29
|
+
## Features
|
|
30
|
+
|
|
31
|
+
### 1. PDF Processing
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
// PDF text extraction with structure preservation
|
|
35
|
+
// pdf-lib creates and modifies PDFs; it does not export `PDFExtract`.
// The `new PDFExtract().extractBuffer(...)` API used below is pdf.js-extract's.
import { PDFDocument, StandardFonts } from 'pdf-lib';
import { PDFExtract } from 'pdf.js-extract';
|
|
36
|
+
|
|
37
|
+
/** Result returned by `extractStructuredPDF` for one PDF document. */
interface PDFExtractionResult {
  /** Content of all pages joined into a single string. */
  text: string;
  /** One entry per page, in document order. */
  pages: PageContent[];
  /** Document-level metadata. */
  metadata: PDFMetadata;
  /** Tables gathered from every page. */
  tables: ExtractedTable[];
  /** Images found in the document. */
  images: ExtractedImage[];
}
|
|
44
|
+
|
|
45
|
+
/**
 * Extracts the plain text of a PDF.
 *
 * Text items of a page are joined with a single space; pages are separated
 * by a blank line.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The text of all pages as one string.
 */
async function extractPDFText(buffer: Buffer): Promise<string> {
  const extractor = new PDFExtract();
  const { pages } = await extractor.extractBuffer(buffer);

  const pageTexts: string[] = [];
  for (const page of pages) {
    const pageText = page.content.map(item => item.str).join(' ');
    pageTexts.push(pageText);
  }

  return pageTexts.join('\n\n');
}
|
|
57
|
+
|
|
58
|
+
/**
 * Extracts page geometry, content, tables, metadata and images from a PDF.
 *
 * Pages are processed one after another, in document order.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The structured extraction result.
 */
async function extractStructuredPDF(buffer: Buffer): Promise<PDFExtractionResult> {
  const pdfDoc = await PDFDocument.load(buffer);
  const pages: PageContent[] = [];

  let index = 0;
  while (index < pdfDoc.getPageCount()) {
    const page = pdfDoc.getPage(index);
    const width = page.getWidth();
    const height = page.getHeight();
    const content = await extractPageContent(page);
    const tables = await detectTables(page);

    // Page numbers are 1-based in the result.
    pages.push({ pageNumber: index + 1, width, height, content, tables });
    index += 1;
  }

  const text = pages.map(p => p.content).join('\n\n');
  const metadata = await extractMetadata(pdfDoc);
  const tables = pages.flatMap(p => p.tables);
  const images = await extractImages(pdfDoc);

  return { text, pages, metadata, tables, images };
}
|
|
82
|
+
|
|
83
|
+
/**
 * Renders a single-page PDF from a template and a data object.
 *
 * Supported element types are `text`, `image` and `table`; elements of any
 * other type are skipped. Template `y` values are subtracted from the page
 * height before drawing.
 *
 * @param template - Elements to draw on the page.
 * @param data - Values substituted into text elements and looked up by
 *   `dataKey` for images and tables.
 * @returns The serialized PDF.
 * @throws Error if an image URL cannot be fetched successfully.
 */
async function generatePDF(template: PDFTemplate, data: Record<string, any>): Promise<Buffer> {
  const pdfDoc = await PDFDocument.create();
  const page = pdfDoc.addPage();
  const { height } = page.getSize();

  // Embed each distinct font once; re-embedding it for every text element
  // repeats the embedding work and adds duplicate font objects to the file.
  type FontSource = Parameters<typeof pdfDoc.embedFont>[0];
  const fontCache = new Map<FontSource, ReturnType<typeof pdfDoc.embedFont>>();
  const getFont = (source: FontSource) => {
    let font = fontCache.get(source);
    if (!font) {
      font = pdfDoc.embedFont(source);
      fontCache.set(source, font);
    }
    return font;
  };

  // Apply template with data substitution
  for (const element of template.elements) {
    switch (element.type) {
      case 'text': {
        const text = substituteVariables(element.content, data);
        page.drawText(text, {
          x: element.x,
          y: height - element.y,
          size: element.fontSize || 12,
          font: await getFont(element.font || StandardFonts.Helvetica),
        });
        break;
      }
      case 'image': {
        const response = await fetch(data[element.dataKey]);
        if (!response.ok) {
          // Without this check an HTTP error page would be handed to embedPng.
          throw new Error(
            `Failed to fetch image for "${element.dataKey}": HTTP ${response.status}`
          );
        }
        const imageBytes = await response.arrayBuffer();
        // NOTE(review): only PNG data is handled here; other formats need a
        // different embed call.
        const image = await pdfDoc.embedPng(imageBytes);
        page.drawImage(image, {
          x: element.x,
          y: height - element.y - element.height,
          width: element.width,
          height: element.height,
        });
        break;
      }
      case 'table':
        await drawTable(page, element, data[element.dataKey]);
        break;
    }
  }

  return Buffer.from(await pdfDoc.save());
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 2. Word Document Processing (DOCX)
|
|
122
|
+
|
|
123
|
+
```typescript
|
|
124
|
+
import {
  Document,
  HeadingLevel,
  Packer,
  Paragraph,
  Table,
  TableCell,
  TableRow,
  TextRun,
  WidthType,
} from 'docx';
// Used by parseDOCX below. NOTE(review): `XMLParser` is assumed to be
// fast-xml-parser's class — confirm against the project's dependencies.
import JSZip from 'jszip';
import { XMLParser } from 'fast-xml-parser';
|
|
125
|
+
|
|
126
|
+
// Parse DOCX to structured format
|
|
127
|
+
/** Structured view of a DOCX file as produced by `parseDOCX`. */
interface DOCXContent {
  /** Paragraphs extracted from the parsed `word/document.xml`. */
  paragraphs: ParsedParagraph[];
  /** Tables extracted from the parsed `word/document.xml`. */
  tables: ParsedTable[];
  /** Images read from the DOCX archive. */
  images: ParsedImage[];
  /** Style information read from the DOCX archive. */
  styles: DocumentStyles;
  /** Metadata read from the DOCX archive. */
  metadata: DocumentMetadata;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Parses a DOCX file into paragraphs, tables, images, styles and metadata.
 *
 * @param buffer - Raw bytes of the DOCX (ZIP) file.
 * @returns The structured document content.
 * @throws Error if the archive has no `word/document.xml` entry.
 */
async function parseDOCX(buffer: Buffer): Promise<DOCXContent> {
  const doc = await new JSZip().loadAsync(buffer);

  // Parse document.xml. A ZIP without this entry is not a Word document;
  // fail with a clear message instead of handing `undefined` to the parser.
  const documentEntry = doc.file('word/document.xml');
  if (!documentEntry) {
    throw new Error('Invalid DOCX: missing word/document.xml');
  }
  const documentXml = await documentEntry.async('string');
  const parsed = new XMLParser().parse(documentXml);

  // Extract content preserving structure
  return {
    paragraphs: extractParagraphs(parsed),
    tables: extractTables(parsed),
    images: await extractImages(doc),
    styles: await parseStyles(doc),
    metadata: await parseMetadata(doc),
  };
}
|
|
153
|
+
|
|
154
|
+
/**
 * Builds a Word document consisting of a bold title, one paragraph per
 * content entry and, when `config.tableData` is set, a trailing table.
 *
 * A content entry may carry `bold: true` (as the invoice example does for
 * its total line); that flag is applied to the entry's text run.
 *
 * @param config - Title, content entries and optional table data.
 * @returns The serialized DOCX file.
 */
async function generateDOCX(config: DOCXConfig): Promise<Buffer> {
  // Header
  const titleParagraph = new Paragraph({
    children: [new TextRun({ text: config.title, bold: true, size: 48 })],
    heading: HeadingLevel.HEADING_1,
    spacing: { after: 200 },
  });

  // Content paragraphs
  const bodyParagraphs = config.content.map(section => new Paragraph({
    children: [
      new TextRun({
        text: section.text,
        size: 24,
        // Previously `bold` on a content entry was silently dropped.
        ...(section.bold ? { bold: true } : {}),
      }),
    ],
    spacing: { after: 120 },
  }));

  const doc = new Document({
    sections: [{
      properties: {
        page: {
          margin: { top: 720, right: 720, bottom: 720, left: 720 },
        },
      },
      children: [
        titleParagraph,
        ...bodyParagraphs,
        // Table if data provided
        ...(config.tableData ? [createTable(config.tableData)] : []),
      ],
    }],
  });

  return Packer.toBuffer(doc);
}
|
|
196
|
+
|
|
197
|
+
/**
 * Builds a full-width table with a shaded, bold header row followed by one
 * row per entry of `data.rows`.
 *
 * @param data - Header labels and row cell texts.
 * @returns The table, ready to be placed in a document section.
 */
function createTable(data: TableData): Table {
  const headerCells = data.headers.map(header => {
    const label = new Paragraph({
      children: [new TextRun({ text: header, bold: true })],
    });
    return new TableCell({
      children: [label],
      shading: { fill: 'f0f0f0' },
    });
  });
  const headerRow = new TableRow({ children: headerCells, tableHeader: true });

  const bodyRows = data.rows.map(row => {
    const cells = row.map(cell => new TableCell({
      children: [new Paragraph({ children: [new TextRun(cell)] })],
    }));
    return new TableRow({ children: cells });
  });

  return new Table({
    rows: [headerRow, ...bodyRows],
    width: { size: 100, type: WidthType.PERCENTAGE },
  });
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### 3. Excel Processing (XLSX)
|
|
230
|
+
|
|
231
|
+
```typescript
|
|
232
|
+
import ExcelJS from 'exceljs';
|
|
233
|
+
|
|
234
|
+
/** Contents of a workbook as returned by `readExcel`. */
interface SpreadsheetData {
  /** One entry per worksheet. */
  sheets: SheetData[];
  /** Workbook-level metadata. */
  metadata: WorkbookMetadata;
}
|
|
238
|
+
|
|
239
|
+
/** Contents of a single worksheet. */
interface SheetData {
  /** Worksheet name. */
  name: string;
  /** Column headers. */
  headers: string[];
  /** Data rows as objects keyed by header. */
  rows: Record<string, any>[];
  /** Formula cells found in the sheet. */
  formulas: FormulaCell[];
  /** Charts associated with the sheet. */
  charts: ChartDefinition[];
}
|
|
246
|
+
|
|
247
|
+
/**
 * Reads every worksheet of an XLSX workbook.
 *
 * Row 1 of each sheet supplies the headers; every later row becomes an
 * object keyed by header. A column whose header cell is empty is named
 * `Column<n>` (1-based column number). Formula cells are additionally
 * recorded in `formulas`.
 *
 * @param buffer - Raw bytes of the XLSX file.
 * @returns Sheets plus workbook metadata (creator, created, modified).
 */
async function readExcel(buffer: Buffer): Promise<SpreadsheetData> {
  const workbook = new ExcelJS.Workbook();
  await workbook.xlsx.load(buffer);

  const sheets: SheetData[] = [];

  workbook.eachSheet(worksheet => {
    const headers: string[] = [];
    const rows: Record<string, any>[] = [];
    const formulas: FormulaCell[] = [];

    // Get headers from first row. `includeEmpty` visits blank header cells
    // too, so the `Column<n>` fallback is reachable and `headers` has no
    // holes. `cell.text` avoids "[object Object]" for rich-text headers.
    worksheet.getRow(1).eachCell({ includeEmpty: true }, (cell, colNumber) => {
      headers[colNumber - 1] = cell.text || `Column${colNumber}`;
    });

    // Get data rows
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // Skip header

      const rowData: Record<string, any> = {};
      row.eachCell((cell, colNumber) => {
        // Data beyond the last header column would otherwise all be stored
        // under the single key "undefined".
        const header = headers[colNumber - 1] ?? `Column${colNumber}`;

        // Preserve formulas
        if (cell.formula) {
          formulas.push({
            row: rowNumber,
            col: colNumber,
            formula: cell.formula,
            // For a formula cell `cell.value` is the whole formula object;
            // the computed value is exposed as `cell.result`.
            result: cell.result,
          });
        }

        rowData[header] = cell.value;
      });

      rows.push(rowData);
    });

    sheets.push({
      name: worksheet.name,
      headers,
      rows,
      formulas,
      charts: extractCharts(worksheet),
    });
  });

  return {
    sheets,
    metadata: {
      creator: workbook.creator,
      created: workbook.created,
      modified: workbook.modified,
    },
  };
}
|
|
306
|
+
|
|
307
|
+
/**
 * Generates an XLSX workbook with one worksheet per `config.sheets` entry.
 *
 * Each sheet gets a bold, grey-filled header row, one data row per record
 * (values picked in header order), auto-sized columns capped at width 50,
 * and optionally formulas and conditional formatting.
 *
 * @param config - Author and sheet definitions.
 * @returns The serialized workbook.
 */
async function generateExcel(config: ExcelConfig): Promise<Buffer> {
  const workbook = new ExcelJS.Workbook();
  workbook.creator = config.author || 'Document Processor';
  workbook.created = new Date();

  for (const sheetConfig of config.sheets) {
    const worksheet = workbook.addWorksheet(sheetConfig.name);

    // Add headers with styling (first row added, hence row 1)
    const headerRow = worksheet.addRow(sheetConfig.headers);
    headerRow.font = { bold: true };
    headerRow.fill = {
      type: 'pattern',
      pattern: 'solid',
      fgColor: { argb: 'FFE0E0E0' },
    };

    // Add data rows
    for (const row of sheetConfig.data) {
      worksheet.addRow(sheetConfig.headers.map(h => row[h]));
    }

    // Auto-fit columns. Both accesses are optional: a sheet without any
    // columns (e.g. empty `headers`) may expose no column list, and column
    // entries are typed as partial so `eachCell` may be absent.
    worksheet.columns?.forEach(column => {
      let maxLength = 0;
      column.eachCell?.({ includeEmpty: true }, cell => {
        // Empty cells are counted as 10 characters wide.
        const cellLength = cell.value?.toString().length || 10;
        maxLength = Math.max(maxLength, cellLength);
      });
      column.width = Math.min(maxLength + 2, 50);
    });

    // Add formulas if specified
    if (sheetConfig.formulas) {
      for (const formula of sheetConfig.formulas) {
        worksheet.getCell(formula.cell).value = { formula: formula.formula };
      }
    }

    // Add conditional formatting
    if (sheetConfig.conditionalFormatting) {
      worksheet.addConditionalFormatting({
        ref: sheetConfig.conditionalFormatting.range,
        rules: sheetConfig.conditionalFormatting.rules,
      });
    }
  }

  return Buffer.from(await workbook.xlsx.writeBuffer());
}
|
|
358
|
+
|
|
359
|
+
/**
 * Applies an optional filter, column mapping and aggregation to a sheet,
 * in that order, and returns a new `SheetData`.
 *
 * When `columnMap` is given, each output row contains only the mapped
 * columns (under their new names) and `headers` becomes the mapped names.
 * The input's `rows` array is not mutated.
 *
 * @param data - Sheet to transform.
 * @param transform - Filter, column map and grouping options.
 * @returns A copy of `data` with transformed `rows` and `headers`.
 */
function transformExcelData(data: SheetData, transform: DataTransform): SheetData {
  let rows = data.rows.slice();

  // Filter rows
  if (transform.filter) {
    rows = rows.filter(row => transform.filter!(row));
  }

  // Map columns
  if (transform.columnMap) {
    const renames = Object.entries(transform.columnMap);
    rows = rows.map(row => {
      const renamed: Record<string, any> = {};
      renames.forEach(([oldKey, newKey]) => {
        renamed[newKey] = row[oldKey];
      });
      return renamed;
    });
  }

  // Aggregate if specified
  if (transform.groupBy) {
    rows = aggregateRows(rows, transform.groupBy, transform.aggregations!);
  }

  const headers = transform.columnMap
    ? Object.values(transform.columnMap)
    : data.headers;

  return { ...data, headers, rows };
}
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
### 4. PowerPoint Processing (PPTX)
|
|
395
|
+
|
|
396
|
+
```typescript
|
|
397
|
+
import PptxGenJS from 'pptxgenjs';
|
|
398
|
+
|
|
399
|
+
/** Input for `generatePPTX`. */
interface PresentationConfig {
  /** Presentation title, stored in the file's metadata. */
  title: string;
  /** Author stored in the metadata; `'Document Processor'` when omitted. */
  author?: string;
  /** Optional theme; its `width`/`height` define a custom slide layout. */
  theme?: ThemeConfig;
  /** Slides in presentation order. */
  slides: SlideConfig[];
}
|
|
405
|
+
|
|
406
|
+
/** Definition of a single slide. */
interface SlideConfig {
  /** Layout that decides how title and content are placed. */
  layout: 'title' | 'content' | 'twoColumn' | 'comparison' | 'blank';
  /** Slide title. */
  title?: string;
  /** Subtitle, rendered by the `title` layout. */
  subtitle?: string;
  /** Content items placed on the slide. */
  content?: SlideContent[];
  /** Speaker notes attached to the slide. */
  notes?: string;
}
|
|
413
|
+
|
|
414
|
+
/**
 * Generates a PowerPoint presentation from a declarative configuration.
 *
 * Layout handling:
 * - `title`: centred title and optional subtitle.
 * - `content`: title followed by the content items stacked vertically.
 * - `twoColumn`: title, then `content[0]` on the left and `content[1]` on
 *   the right; a missing entry leaves its column empty.
 * - `comparison` and `blank`: no shapes are added by this function.
 *
 * @param config - Presentation metadata, optional theme and slides.
 * @returns The serialized PPTX file.
 */
async function generatePPTX(config: PresentationConfig): Promise<Buffer> {
  const pptx = new PptxGenJS();

  // Set metadata
  pptx.title = config.title;
  pptx.author = config.author || 'Document Processor';

  // Apply theme
  if (config.theme) {
    pptx.defineLayout({
      name: 'CUSTOM',
      width: config.theme.width || 10,
      height: config.theme.height || 7.5,
    });
    // defineLayout only registers the layout; without selecting it the
    // presentation keeps the default slide size.
    pptx.layout = 'CUSTOM';
  }

  // Generate slides
  for (const slideConfig of config.slides) {
    const slide = pptx.addSlide();

    switch (slideConfig.layout) {
      case 'title':
        slide.addText(slideConfig.title || '', {
          x: 0.5, y: 2.5, w: 9, h: 1,
          fontSize: 44, bold: true, align: 'center',
        });
        if (slideConfig.subtitle) {
          slide.addText(slideConfig.subtitle, {
            x: 0.5, y: 3.5, w: 9, h: 0.5,
            fontSize: 24, color: '666666', align: 'center',
          });
        }
        break;

      case 'content': {
        slide.addText(slideConfig.title || '', {
          x: 0.5, y: 0.3, w: 9, h: 0.8,
          fontSize: 32, bold: true,
        });
        let yPos = 1.2;
        for (const content of slideConfig.content || []) {
          yPos = addSlideContent(slide, content, yPos);
        }
        break;
      }

      case 'twoColumn': {
        slide.addText(slideConfig.title || '', {
          x: 0.5, y: 0.3, w: 9, h: 0.8,
          fontSize: 32, bold: true,
        });
        // Guard each column: `content` is optional and may hold fewer than
        // two entries, which previously caused a TypeError.
        const [left, right] = slideConfig.content ?? [];
        if (left) {
          addSlideContent(slide, left, 1.2, 0.5, 4.3);
        }
        if (right) {
          addSlideContent(slide, right, 1.2, 5.2, 4.3);
        }
        break;
      }
    }

    // Add speaker notes
    if (slideConfig.notes) {
      slide.addNotes(slideConfig.notes);
    }
  }

  const bytes = (await pptx.write({ outputType: 'arraybuffer' })) as ArrayBuffer;
  return Buffer.from(bytes);
}
|
|
480
|
+
|
|
481
|
+
/**
 * Adds one content item to a slide and reports where the next item should
 * start.
 *
 * @param slide - Slide to draw on.
 * @param content - Item to add; its `type` selects text, bullets, image,
 *   chart or table rendering.
 * @param y - Top position of the item.
 * @param x - Left position of the item (default 0.5).
 * @param w - Width available to the item (default 9).
 * @returns The y position below the added item, or `y` unchanged for an
 *   unrecognised content type.
 */
function addSlideContent(
  slide: PptxGenJS.Slide,
  content: SlideContent,
  y: number,
  x: number = 0.5,
  w: number = 9
): number {
  if (content.type === 'text') {
    slide.addText(content.value, {
      x, y, w, h: 0.5,
      fontSize: content.fontSize || 18,
      bullet: content.bullet,
    });
    return y + 0.6;
  }

  if (content.type === 'bullets') {
    const bulletRuns = content.items.map(item => ({ text: item, options: { bullet: true } }));
    slide.addText(bulletRuns, { x, y, w, fontSize: 18 });
    return y + content.items.length * 0.4 + 0.2;
  }

  if (content.type === 'image') {
    // Images use their own width (default 4) rather than the column width.
    const imageHeight = content.height || 3;
    slide.addImage({
      path: content.path,
      x, y, w: content.width || 4, h: imageHeight,
    });
    return y + imageHeight + 0.2;
  }

  if (content.type === 'chart') {
    const chartHeight = content.height || 4;
    slide.addChart(content.chartType, content.data, {
      x, y, w, h: chartHeight,
    });
    return y + chartHeight + 0.2;
  }

  if (content.type === 'table') {
    slide.addTable(content.data, {
      x, y, w,
      border: { pt: 1, color: 'CFCFCF' },
      fontFace: 'Arial',
      fontSize: 14,
    });
    return y + content.data.length * 0.4 + 0.2;
  }

  return y;
}
|
|
531
|
+
```
|
|
532
|
+
|
|
533
|
+
### 5. Batch Processing Pipeline
|
|
534
|
+
|
|
535
|
+
```typescript
|
|
536
|
+
/** Options for `processBatch`. */
interface BatchConfig {
  /** Directory searched recursively for pdf, docx, xlsx and pptx files. */
  inputDir: string;
  /** Directory that receives the results, mirroring the input's relative paths. */
  outputDir: string;
  /** Maximum number of files processed at the same time. */
  concurrency: number;
  /** Transformation applied to each document. */
  transform: DocumentTransform;
  /** Called before and after each file is processed. */
  onProgress?: (progress: BatchProgress) => void;
  /** Called when processing a file fails. */
  onError?: (error: BatchError) => void;
}
|
|
544
|
+
|
|
545
|
+
/** Progress counters reported while a batch is running. */
interface BatchProgress {
  /** Number of files found for the batch. */
  total: number;
  /** Files finished so far, whether they succeeded or failed. */
  processed: number;
  /** Files processed and written successfully. */
  succeeded: number;
  /** Files whose processing threw an error. */
  failed: number;
  /** Path of the file the report refers to. */
  currentFile: string;
}
|
|
552
|
+
|
|
553
|
+
/**
 * Transforms every pdf/docx/xlsx/pptx file below `config.inputDir` and
 * writes the results to the same relative path below `config.outputDir`.
 *
 * Files are processed with at most `config.concurrency` running at once.
 * A failing file is recorded in the result and reported through `onError`;
 * it does not stop the batch.
 *
 * @param config - Directories, concurrency, transform and callbacks.
 * @returns Totals plus one result entry per file (in completion order).
 */
async function processBatch(config: BatchConfig): Promise<BatchResult> {
  const files = await glob(`${config.inputDir}/**/*.{pdf,docx,xlsx,pptx}`);
  const results: ProcessingResult[] = [];

  const progress: BatchProgress = {
    total: files.length,
    processed: 0,
    succeeded: 0,
    failed: 0,
    currentFile: '',
  };

  // Report a snapshot tied to the task's own file. With several tasks in
  // flight the shared `currentFile` may already name another file, and a
  // listener holding the live object would see it change underneath it.
  const reportProgress = (file: string): void => {
    config.onProgress?.({ ...progress, currentFile: file });
  };

  // Process with concurrency limit
  const queue = new PQueue({ concurrency: config.concurrency });

  const tasks = files.map(file => queue.add(async () => {
    progress.currentFile = file;
    reportProgress(file);

    try {
      const buffer = await fs.readFile(file);
      const ext = path.extname(file).toLowerCase();

      // Process based on file type
      let result: Buffer;
      switch (ext) {
        case '.pdf':
          result = await transformPDF(buffer, config.transform);
          break;
        case '.docx':
          result = await transformDOCX(buffer, config.transform);
          break;
        case '.xlsx':
          result = await transformExcel(buffer, config.transform);
          break;
        case '.pptx':
          result = await transformPPTX(buffer, config.transform);
          break;
        default:
          throw new Error(`Unsupported file type: ${ext}`);
      }

      // Write output
      const outputPath = path.join(
        config.outputDir,
        path.relative(config.inputDir, file)
      );
      await fs.mkdir(path.dirname(outputPath), { recursive: true });
      await fs.writeFile(outputPath, result);

      progress.succeeded++;
      results.push({ file, success: true });
    } catch (error: unknown) {
      progress.failed++;
      // Anything can be thrown; only Error instances carry `.message`.
      const message = error instanceof Error ? error.message : String(error);
      results.push({ file, success: false, error: message });
      config.onError?.({ file, error });
    } finally {
      progress.processed++;
      reportProgress(file);
    }
  }));

  await Promise.all(tasks);

  return {
    total: files.length,
    succeeded: progress.succeeded,
    failed: progress.failed,
    results,
  };
}
|
|
625
|
+
|
|
626
|
+
/**
 * Reads a stream, groups its data into batches of at least `CHUNK_SIZE`
 * bytes and yields the transformed batches one at a time.
 *
 * Data left over when the stream ends is transformed and yielded as a
 * final, smaller batch. An empty stream yields nothing.
 *
 * @param inputStream - Source stream; chunks may be Buffers or strings.
 * @param transform - Async transformation applied to each batch.
 * @returns An async generator of transformed batches.
 */
async function* streamProcess(
  inputStream: ReadStream,
  transform: ChunkTransform
): AsyncGenerator<Buffer> {
  const pending: Buffer[] = [];
  let pendingBytes = 0;

  for await (const chunk of inputStream) {
    // A stream opened with an encoding yields strings. Normalise them so
    // Buffer.concat cannot throw and sizes are counted in bytes.
    const piece: Buffer = typeof chunk === 'string' ? Buffer.from(chunk) : chunk;
    pending.push(piece);
    pendingBytes += piece.length;

    // Process in chunks for memory efficiency
    if (pendingBytes >= CHUNK_SIZE) {
      yield await transform(Buffer.concat(pending, pendingBytes));
      pending.length = 0;
      pendingBytes = 0;
    }
  }

  // Process remaining
  if (pending.length > 0) {
    yield await transform(Buffer.concat(pending, pendingBytes));
  }
}
|
|
652
|
+
```
|
|
653
|
+
|
|
654
|
+
### 6. Template-Based Document Generation
|
|
655
|
+
|
|
656
|
+
```typescript
|
|
657
|
+
/** Describes a template file and the placeholders it contains. */
interface DocumentTemplate {
  /** Output format; selects the generator used by `generateFromTemplate`. */
  type: 'pdf' | 'docx' | 'xlsx' | 'pptx';
  /** Path of the template file to read. */
  templatePath: string;
  /** Placeholders to fill with data. */
  placeholders: PlaceholderConfig[];
}
|
|
662
|
+
|
|
663
|
+
/** A single placeholder inside a template. */
interface PlaceholderConfig {
  /** Key used to look the value up in the data object. */
  key: string;
  /** Kind of value the placeholder expects. */
  type: 'text' | 'image' | 'table' | 'chart' | 'list';
  /** Optional formatting options. */
  format?: FormatOptions;
}
|
|
668
|
+
|
|
669
|
+
/**
 * Generates a document by filling a template file with data.
 *
 * The generator is chosen from `template.type` before the template file is
 * read, so an unsupported type fails fast instead of resolving `undefined`.
 *
 * @param template - Template type, file path and placeholders.
 * @param data - Values for the placeholders, keyed by placeholder key.
 * @returns The generated document.
 * @throws Error if `template.type` is not one of pdf, docx, xlsx or pptx.
 */
async function generateFromTemplate(
  template: DocumentTemplate,
  data: Record<string, any>
): Promise<Buffer> {
  let generate: (
    templateBuffer: Buffer,
    placeholders: PlaceholderConfig[],
    data: Record<string, any>
  ) => Promise<Buffer>;

  switch (template.type) {
    case 'docx':
      generate = generateDOCXFromTemplate;
      break;
    case 'xlsx':
      generate = generateExcelFromTemplate;
      break;
    case 'pptx':
      generate = generatePPTXFromTemplate;
      break;
    case 'pdf':
      generate = generatePDFFromTemplate;
      break;
    default: {
      // Compile-time exhaustiveness check plus a runtime guard for values
      // that bypass the type system (e.g. parsed JSON).
      const unsupported: never = template.type;
      throw new Error(`Unsupported template type: ${String(unsupported)}`);
    }
  }

  const templateBuffer = await fs.readFile(template.templatePath);
  return generate(templateBuffer, template.placeholders, data);
}
|
|
687
|
+
|
|
688
|
+
/**
 * Fills a DOCX template with data and returns the rendered document.
 *
 * Each placeholder's value is looked up in `data` by key and prepared
 * according to the placeholder type (`text`, `table`, `image`, `list`).
 * Placeholders of any other type (e.g. `chart`) are not passed to the
 * template.
 *
 * @param templateBuffer - Raw bytes of the DOCX template.
 * @param placeholders - Placeholders present in the template.
 * @param data - Values keyed by placeholder key.
 * @returns The rendered DOCX file.
 */
async function generateDOCXFromTemplate(
  templateBuffer: Buffer,
  placeholders: PlaceholderConfig[],
  data: Record<string, any>
): Promise<Buffer> {
  const doc = new Docxtemplater(new PizZip(templateBuffer), {
    paragraphLoop: true,
    linebreaks: true,
  });

  // Build data object with formatting
  const templateData: Record<string, any> = {};

  for (const placeholder of placeholders) {
    const value = data[placeholder.key];

    switch (placeholder.type) {
      case 'text':
        templateData[placeholder.key] = formatText(value, placeholder.format);
        break;
      case 'table':
        templateData[placeholder.key] = formatTableData(value);
        break;
      case 'image':
        templateData[placeholder.key] = await loadImage(value);
        break;
      case 'list':
        // A missing or non-array value becomes an empty list instead of
        // throwing on `.map`.
        templateData[placeholder.key] = Array.isArray(value)
          ? value.map((item: any) => ({ item }))
          : [];
        break;
    }
  }

  doc.render(templateData);

  return doc.getZip().generate({
    type: 'nodebuffer',
    compression: 'DEFLATE',
  });
}
|
|
728
|
+
```
|
|
729
|
+
|
|
730
|
+
## Use Cases
|
|
731
|
+
|
|
732
|
+
### 1. Invoice Generation System
|
|
733
|
+
|
|
734
|
+
```typescript
|
|
735
|
+
/**
 * Generates a Word invoice for an order.
 *
 * The document lists date, customer and address, then the totals
 * (subtotal, tax, total), and includes a table with one row per order item.
 *
 * @param order - Order to invoice.
 * @returns The serialized DOCX invoice.
 */
async function generateInvoice(order: Order): Promise<Buffer> {
  const title = `Invoice #${order.invoiceNumber}`;

  const headerLines = [
    { text: `Date: ${formatDate(order.date)}` },
    { text: `Customer: ${order.customer.name}` },
    { text: `Address: ${order.customer.address}` },
  ];

  const itemRows = order.items.map(item => [
    item.name,
    item.quantity.toString(),
    formatCurrency(item.price),
    formatCurrency(item.quantity * item.price),
  ]);

  // Totals follow the header lines, separated by an empty paragraph.
  const totalLines = [
    { text: '' },
    { text: `Subtotal: ${formatCurrency(order.subtotal)}` },
    { text: `Tax: ${formatCurrency(order.tax)}` },
    { text: `Total: ${formatCurrency(order.total)}`, bold: true },
  ];

  const invoice: DOCXConfig = {
    title,
    content: [...headerLines, ...totalLines],
    tableData: {
      headers: ['Item', 'Quantity', 'Price', 'Total'],
      rows: itemRows,
    },
  };

  return generateDOCX(invoice);
}
|
|
765
|
+
```
|
|
766
|
+
|
|
767
|
+
### 2. Report Dashboard Export
|
|
768
|
+
|
|
769
|
+
```typescript
|
|
770
|
+
/**
 * Exports dashboard data to an Excel workbook with a "Summary" sheet (one
 * row per KPI) and a "Detailed Data" sheet.
 *
 * The change column holds numbers, not strings like "+5%": the
 * greater-than / less-than rules compare against 0, and text cells do not
 * compare as numbers, so string values would be highlighted incorrectly.
 * NOTE(review): assumes `kpi.change` is a number of percent points — confirm.
 *
 * @param dashboard - KPIs and detailed records to export.
 * @returns The serialized workbook.
 */
async function exportDashboard(dashboard: DashboardData): Promise<Buffer> {
  // Conditional-format fills must be complete pattern fills.
  const solidFill = (argb: string) => ({
    fill: { type: 'pattern', pattern: 'solid', bgColor: { argb } },
  });

  // Cover exactly the KPI rows below the header (at least row 2).
  const lastKpiRow = Math.max(dashboard.kpis.length + 1, 2);

  return generateExcel({
    author: 'Analytics System',
    sheets: [
      {
        name: 'Summary',
        headers: ['Metric', 'Value', 'Change (%)'],
        data: dashboard.kpis.map(kpi => ({
          Metric: kpi.name,
          Value: kpi.value,
          'Change (%)': kpi.change,
        })),
        conditionalFormatting: {
          range: `C2:C${lastKpiRow}`,
          rules: [
            { type: 'cellIs', operator: 'greaterThan', formulae: [0], style: solidFill('FF00FF00') },
            { type: 'cellIs', operator: 'lessThan', formulae: [0], style: solidFill('FFFF0000') },
          ],
        },
      },
      {
        name: 'Detailed Data',
        headers: Object.keys(dashboard.detailedData[0] || {}),
        data: dashboard.detailedData,
      },
    ],
  });
}
|
|
799
|
+
```
|
|
800
|
+
|
|
801
|
+
### 3. Contract Analysis Pipeline
|
|
802
|
+
|
|
803
|
+
```typescript
|
|
804
|
+
/**
 * Extracts a contract PDF and derives parties, dates, monetary amounts,
 * clauses, signatures and analysed tables from it.
 *
 * @param pdfBuffer - Raw bytes of the contract PDF.
 * @returns The combined analysis.
 */
async function analyzeContract(pdfBuffer: Buffer): Promise<ContractAnalysis> {
  const { text, images, tables } = await extractStructuredPDF(pdfBuffer);

  const parties = extractParties(text);
  const dates = extractDates(text);
  const amounts = extractMonetaryAmounts(text);
  // NOTE(review): `categorizeClausses` looks like a misspelling of
  // "categorizeClauses"; rename it where the helper is defined.
  const clauses = categorizeClausses(text);
  const signatures = detectSignatures(images);
  const analyzedTables = tables.map(analyzeTable);

  return { parties, dates, amounts, clauses, signatures, tables: analyzedTables };
}
|
|
817
|
+
```
|
|
818
|
+
|
|
819
|
+
## Best Practices
|
|
820
|
+
|
|
821
|
+
### Do's
|
|
822
|
+
|
|
823
|
+
- **Stream large files** - Use streaming for files > 10MB to prevent memory issues
|
|
824
|
+
- **Validate inputs** - Check file types and sizes before processing
|
|
825
|
+
- **Handle encoding** - Support UTF-8 and detect encoding issues gracefully
|
|
826
|
+
- **Preserve formatting** - Maintain original formatting when transforming
|
|
827
|
+
- **Cache parsed results** - Cache extracted data for repeated access
|
|
828
|
+
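- **Use appropriate libraries** - pdf-lib for creating and modifying PDFs (pair it with a parser such as pdf.js-extract for text extraction), exceljs for Excel, docx for Word, pptxgenjs for PowerPoint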
|
|
829
|
+
|
|
830
|
+
### Don'ts
|
|
831
|
+
|
|
832
|
+
- Don't load entire large files into memory
|
|
833
|
+
- Don't assume file extensions match content
|
|
834
|
+
- Don't ignore password-protected documents
|
|
835
|
+
- Don't strip metadata without user consent
|
|
836
|
+
- Don't process untrusted files without sandboxing
|
|
837
|
+
- Don't skip error handling for corrupt files
|
|
838
|
+
|
|
839
|
+
### Error Handling
|
|
840
|
+
|
|
841
|
+
```typescript
|
|
842
|
+
/**
 * Error raised for document-processing failures, carrying a machine-readable
 * code and, optionally, the affected file and the underlying error.
 */
class DocumentProcessingError extends Error {
  /**
   * @param message - Human-readable description.
   * @param code - Category of the failure.
   * @param file - Name of the file being processed, if known.
   * @param cause - Underlying error that triggered this one, if any.
   */
  constructor(
    message: string,
    public readonly code: ErrorCode,
    public readonly file?: string,
    public readonly cause?: Error
  ) {
    super(message);
    // When compiled to ES5, subclasses of Error lose their prototype, which
    // breaks `instanceof DocumentProcessingError`; restore it explicitly.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = 'DocumentProcessingError';
  }
}
|
|
853
|
+
|
|
854
|
+
/** Failure categories carried by `DocumentProcessingError`. */
enum ErrorCode {
  /** The file type is not supported. */
  INVALID_FORMAT = 'INVALID_FORMAT',
  /** Processing failed unexpectedly. */
  CORRUPT_FILE = 'CORRUPT_FILE',
  PASSWORD_PROTECTED = 'PASSWORD_PROTECTED',
  ENCODING_ERROR = 'ENCODING_ERROR',
  /** The file is larger than the allowed maximum. */
  SIZE_LIMIT_EXCEEDED = 'SIZE_LIMIT_EXCEEDED',
  UNSUPPORTED_FEATURE = 'UNSUPPORTED_FEATURE',
}
|
|
862
|
+
|
|
863
|
+
/**
 * Validates a document and processes it, reporting every failure as a
 * `DocumentProcessingError`.
 *
 * The file type is detected from the buffer and checked against
 * `SUPPORTED_TYPES`, then the size is checked against `MAX_FILE_SIZE`.
 * Errors that are already `DocumentProcessingError`s are rethrown as is;
 * anything else is wrapped with code `CORRUPT_FILE` and kept as `cause`.
 *
 * @param buffer - Raw bytes of the document.
 * @param filename - Name attached to any error that is raised.
 * @returns The processing result.
 * @throws DocumentProcessingError on unsupported type, oversize input or
 *   any processing failure.
 */
async function safeProcessDocument(buffer: Buffer, filename: string): Promise<ProcessResult> {
  try {
    // Validate file
    const fileType = await detectFileType(buffer);
    if (!SUPPORTED_TYPES.includes(fileType)) {
      throw new DocumentProcessingError(
        `Unsupported file type: ${fileType}`,
        ErrorCode.INVALID_FORMAT,
        filename
      );
    }

    // Check size
    if (buffer.length > MAX_FILE_SIZE) {
      throw new DocumentProcessingError(
        `File exceeds maximum size of ${MAX_FILE_SIZE} bytes`,
        ErrorCode.SIZE_LIMIT_EXCEEDED,
        filename
      );
    }

    // Process
    return await processDocument(buffer, fileType);
  } catch (error: unknown) {
    if (error instanceof DocumentProcessingError) {
      throw error;
    }

    // Wrap unexpected errors. Anything can be thrown, so normalise to an
    // Error first; otherwise the message would read "undefined" and a
    // non-Error value would be stored as `cause`.
    const cause = error instanceof Error ? error : new Error(String(error));
    throw new DocumentProcessingError(
      `Failed to process document: ${cause.message}`,
      ErrorCode.CORRUPT_FILE,
      filename,
      cause
    );
  }
}
|
|
901
|
+
```
|
|
902
|
+
|
|
903
|
+
## Related Skills
|
|
904
|
+
|
|
905
|
+
- **python** - Alternative processing with python-docx, openpyxl, PyPDF2
|
|
906
|
+
- **typescript** - Type-safe document handling
|
|
907
|
+
- **data-processing** - Data transformation utilities
|
|
908
|
+
- **api-architecture** - Document API design patterns
|
|
909
|
+
|
|
910
|
+
## Reference Resources
|
|
911
|
+
|
|
912
|
+
- [pdf-lib Documentation](https://pdf-lib.js.org/)
|
|
913
|
+
- [ExcelJS Documentation](https://github.com/exceljs/exceljs)
|
|
914
|
+
- [docx Documentation](https://docx.js.org/)
|
|
915
|
+
- [PptxGenJS Documentation](https://gitbrent.github.io/PptxGenJS/)
|
|
916
|
+
- [Apache POI](https://poi.apache.org/) - Java reference
|