omgkit 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/package.json +1 -1
  2. package/plugin/skills/databases/mongodb/SKILL.md +81 -28
  3. package/plugin/skills/databases/prisma/SKILL.md +87 -32
  4. package/plugin/skills/databases/redis/SKILL.md +80 -27
  5. package/plugin/skills/devops/aws/SKILL.md +80 -26
  6. package/plugin/skills/devops/github-actions/SKILL.md +84 -32
  7. package/plugin/skills/devops/kubernetes/SKILL.md +94 -32
  8. package/plugin/skills/devops/performance-profiling/SKILL.md +59 -863
  9. package/plugin/skills/frameworks/django/SKILL.md +158 -24
  10. package/plugin/skills/frameworks/express/SKILL.md +153 -33
  11. package/plugin/skills/frameworks/fastapi/SKILL.md +153 -34
  12. package/plugin/skills/frameworks/laravel/SKILL.md +146 -33
  13. package/plugin/skills/frameworks/nestjs/SKILL.md +137 -25
  14. package/plugin/skills/frameworks/rails/SKILL.md +594 -28
  15. package/plugin/skills/frameworks/react/SKILL.md +94 -962
  16. package/plugin/skills/frameworks/spring/SKILL.md +528 -35
  17. package/plugin/skills/frameworks/vue/SKILL.md +147 -25
  18. package/plugin/skills/frontend/accessibility/SKILL.md +145 -36
  19. package/plugin/skills/frontend/frontend-design/SKILL.md +114 -29
  20. package/plugin/skills/frontend/responsive/SKILL.md +131 -28
  21. package/plugin/skills/frontend/shadcn-ui/SKILL.md +133 -43
  22. package/plugin/skills/frontend/tailwindcss/SKILL.md +105 -37
  23. package/plugin/skills/frontend/threejs/SKILL.md +110 -35
  24. package/plugin/skills/languages/javascript/SKILL.md +195 -34
  25. package/plugin/skills/methodology/brainstorming/SKILL.md +98 -30
  26. package/plugin/skills/methodology/defense-in-depth/SKILL.md +83 -37
  27. package/plugin/skills/methodology/dispatching-parallel-agents/SKILL.md +92 -31
  28. package/plugin/skills/methodology/executing-plans/SKILL.md +117 -28
  29. package/plugin/skills/methodology/finishing-development-branch/SKILL.md +111 -32
  30. package/plugin/skills/methodology/problem-solving/SKILL.md +65 -311
  31. package/plugin/skills/methodology/receiving-code-review/SKILL.md +76 -27
  32. package/plugin/skills/methodology/requesting-code-review/SKILL.md +93 -22
  33. package/plugin/skills/methodology/root-cause-tracing/SKILL.md +75 -40
  34. package/plugin/skills/methodology/sequential-thinking/SKILL.md +75 -224
  35. package/plugin/skills/methodology/systematic-debugging/SKILL.md +81 -35
  36. package/plugin/skills/methodology/test-driven-development/SKILL.md +120 -26
  37. package/plugin/skills/methodology/testing-anti-patterns/SKILL.md +88 -35
  38. package/plugin/skills/methodology/token-optimization/SKILL.md +73 -34
  39. package/plugin/skills/methodology/verification-before-completion/SKILL.md +128 -28
  40. package/plugin/skills/methodology/writing-plans/SKILL.md +105 -20
  41. package/plugin/skills/omega/omega-architecture/SKILL.md +178 -40
  42. package/plugin/skills/omega/omega-coding/SKILL.md +247 -41
  43. package/plugin/skills/omega/omega-sprint/SKILL.md +208 -46
  44. package/plugin/skills/omega/omega-testing/SKILL.md +253 -42
  45. package/plugin/skills/omega/omega-thinking/SKILL.md +263 -51
  46. package/plugin/skills/security/better-auth/SKILL.md +83 -34
  47. package/plugin/skills/security/oauth/SKILL.md +118 -35
  48. package/plugin/skills/security/owasp/SKILL.md +112 -35
  49. package/plugin/skills/testing/playwright/SKILL.md +141 -38
  50. package/plugin/skills/testing/pytest/SKILL.md +137 -38
  51. package/plugin/skills/testing/vitest/SKILL.md +124 -39
  52. package/plugin/skills/tools/document-processing/SKILL.md +111 -838
  53. package/plugin/skills/tools/image-processing/SKILL.md +126 -659
  54. package/plugin/skills/tools/mcp-development/SKILL.md +85 -758
  55. package/plugin/skills/tools/media-processing/SKILL.md +118 -735
  56. package/plugin/stdrules/SKILL_STANDARDS.md +490 -0
@@ -1,6 +1,6 @@
1
1
  ---
2
- name: document-processing
3
- description: Enterprise-grade document processing for PDF, DOCX, XLSX, PPTX with streaming, validation, and batch operations
2
+ name: Processing Documents
3
+ description: Processes PDF, DOCX, XLSX, and PPTX files with extraction, generation, and batch operations. Use when building document pipelines, extracting content from office files, or generating reports.
4
4
  category: tools
5
5
  triggers:
6
6
  - document processing
@@ -12,905 +12,178 @@ triggers:
12
12
  - office documents
13
13
  ---
14
14
 
15
- # Document Processing
15
+ # Processing Documents
16
16
 
17
- Enterprise-grade **document processing** for PDF, DOCX, XLSX, and PPTX files. This skill enables extraction, manipulation, generation, and batch processing of office documents with streaming support for large files.
18
-
19
- ## Purpose
20
-
21
- Handle document processing tasks that enterprise applications commonly require:
22
-
23
- - Extract text and structured data from PDFs
24
- - Parse and generate Word documents
25
- - Manipulate Excel spreadsheets programmatically
26
- - Create PowerPoint presentations from data
27
- - Process documents in batch with progress tracking
28
-
29
- ## Features
30
-
31
- ### 1. PDF Processing
17
+ ## Quick Start
32
18
 
33
19
  ```typescript
34
- // PDF text extraction with structure preservation
35
- import { PDFDocument, PDFExtract } from 'pdf-lib';
36
-
37
- interface PDFExtractionResult {
38
- text: string;
39
- pages: PageContent[];
40
- metadata: PDFMetadata;
41
- tables: ExtractedTable[];
42
- images: ExtractedImage[];
43
- }
20
+ import { PDFDocument } from 'pdf-lib';
21
+ import ExcelJS from 'exceljs';
22
+ import { Document, Packer, Paragraph, TextRun } from 'docx';
44
23
 
45
- // Basic text extraction
24
+ // Extract text from PDF
46
25
  async function extractPDFText(buffer: Buffer): Promise<string> {
47
- const pdfExtract = new PDFExtract();
48
- const data = await pdfExtract.extractBuffer(buffer);
49
-
50
- return data.pages
51
- .map(page => page.content
52
- .map(item => item.str)
53
- .join(' ')
54
- )
55
- .join('\n\n');
56
- }
57
-
58
- // Structured extraction with tables
59
- async function extractStructuredPDF(buffer: Buffer): Promise<PDFExtractionResult> {
60
26
  const pdfDoc = await PDFDocument.load(buffer);
61
- const pages: PageContent[] = [];
62
-
63
- for (let i = 0; i < pdfDoc.getPageCount(); i++) {
64
- const page = pdfDoc.getPage(i);
65
- pages.push({
66
- pageNumber: i + 1,
67
- width: page.getWidth(),
68
- height: page.getHeight(),
69
- content: await extractPageContent(page),
70
- tables: await detectTables(page),
71
- });
72
- }
73
-
74
- return {
75
- text: pages.map(p => p.content).join('\n\n'),
76
- pages,
77
- metadata: await extractMetadata(pdfDoc),
78
- tables: pages.flatMap(p => p.tables),
79
- images: await extractImages(pdfDoc),
80
- };
27
+ const pages = pdfDoc.getPages();
28
+ return pages.map(page => page.getTextContent()).join('\n\n');
81
29
  }
82
30
 
83
- // PDF generation from template
84
- async function generatePDF(template: PDFTemplate, data: Record<string, any>): Promise<Buffer> {
85
- const pdfDoc = await PDFDocument.create();
86
- const page = pdfDoc.addPage();
87
- const { width, height } = page.getSize();
88
-
89
- // Apply template with data substitution
90
- for (const element of template.elements) {
91
- switch (element.type) {
92
- case 'text':
93
- const text = substituteVariables(element.content, data);
94
- page.drawText(text, {
95
- x: element.x,
96
- y: height - element.y,
97
- size: element.fontSize || 12,
98
- font: await pdfDoc.embedFont(element.font || StandardFonts.Helvetica),
99
- });
100
- break;
101
- case 'image':
102
- const imageBytes = await fetch(data[element.dataKey]).then(r => r.arrayBuffer());
103
- const image = await pdfDoc.embedPng(imageBytes);
104
- page.drawImage(image, {
105
- x: element.x,
106
- y: height - element.y - element.height,
107
- width: element.width,
108
- height: element.height,
109
- });
110
- break;
111
- case 'table':
112
- await drawTable(page, element, data[element.dataKey]);
113
- break;
114
- }
115
- }
116
-
117
- return Buffer.from(await pdfDoc.save());
118
- }
119
- ```
120
-
121
- ### 2. Word Document Processing (DOCX)
122
-
123
- ```typescript
124
- import { Document, Paragraph, TextRun, Table, TableRow, TableCell, Packer } from 'docx';
125
-
126
- // Parse DOCX to structured format
127
- interface DOCXContent {
128
- paragraphs: ParsedParagraph[];
129
- tables: ParsedTable[];
130
- images: ParsedImage[];
131
- styles: DocumentStyles;
132
- metadata: DocumentMetadata;
133
- }
134
-
135
- async function parseDOCX(buffer: Buffer): Promise<DOCXContent> {
136
- const zip = new JSZip();
137
- const doc = await zip.loadAsync(buffer);
138
-
139
- // Parse document.xml
140
- const documentXml = await doc.file('word/document.xml')?.async('string');
141
- const parser = new XMLParser();
142
- const parsed = parser.parse(documentXml);
143
-
144
- // Extract content preserving structure
145
- return {
146
- paragraphs: extractParagraphs(parsed),
147
- tables: extractTables(parsed),
148
- images: await extractImages(doc),
149
- styles: await parseStyles(doc),
150
- metadata: await parseMetadata(doc),
151
- };
31
+ // Read Excel spreadsheet
32
+ async function readExcel(buffer: Buffer) {
33
+ const workbook = new ExcelJS.Workbook();
34
+ await workbook.xlsx.load(buffer);
35
+ return workbook.worksheets.map(sheet => ({
36
+ name: sheet.name,
37
+ rows: sheet.getSheetValues(),
38
+ }));
152
39
  }
153
40
 
154
- // Generate DOCX from template
155
- async function generateDOCX(config: DOCXConfig): Promise<Buffer> {
41
+ // Generate Word document
42
+ async function generateDOCX(title: string, content: string[]): Promise<Buffer> {
156
43
  const doc = new Document({
157
44
  sections: [{
158
- properties: {
159
- page: {
160
- margin: { top: 720, right: 720, bottom: 720, left: 720 },
161
- },
162
- },
163
45
  children: [
164
- // Header
165
- new Paragraph({
166
- children: [
167
- new TextRun({
168
- text: config.title,
169
- bold: true,
170
- size: 48,
171
- }),
172
- ],
173
- heading: HeadingLevel.HEADING_1,
174
- spacing: { after: 200 },
175
- }),
176
-
177
- // Content paragraphs
178
- ...config.content.map(section => new Paragraph({
179
- children: [
180
- new TextRun({
181
- text: section.text,
182
- size: 24,
183
- }),
184
- ],
185
- spacing: { after: 120 },
186
- })),
187
-
188
- // Table if data provided
189
- ...(config.tableData ? [createTable(config.tableData)] : []),
46
+ new Paragraph({ children: [new TextRun({ text: title, bold: true, size: 48 })] }),
47
+ ...content.map(text => new Paragraph({ children: [new TextRun(text)] })),
190
48
  ],
191
49
  }],
192
50
  });
193
-
194
51
  return await Packer.toBuffer(doc);
195
52
  }
196
-
197
- // Create formatted table
198
- function createTable(data: TableData): Table {
199
- return new Table({
200
- rows: [
201
- // Header row
202
- new TableRow({
203
- children: data.headers.map(header =>
204
- new TableCell({
205
- children: [new Paragraph({
206
- children: [new TextRun({ text: header, bold: true })],
207
- })],
208
- shading: { fill: 'f0f0f0' },
209
- })
210
- ),
211
- tableHeader: true,
212
- }),
213
- // Data rows
214
- ...data.rows.map(row =>
215
- new TableRow({
216
- children: row.map(cell =>
217
- new TableCell({
218
- children: [new Paragraph({ children: [new TextRun(cell)] })],
219
- })
220
- ),
221
- })
222
- ),
223
- ],
224
- width: { size: 100, type: WidthType.PERCENTAGE },
225
- });
226
- }
227
53
  ```
228
54
 
229
- ### 3. Excel Processing (XLSX)
230
-
231
- ```typescript
232
- import ExcelJS from 'exceljs';
233
-
234
- interface SpreadsheetData {
235
- sheets: SheetData[];
236
- metadata: WorkbookMetadata;
237
- }
238
-
239
- interface SheetData {
240
- name: string;
241
- headers: string[];
242
- rows: Record<string, any>[];
243
- formulas: FormulaCell[];
244
- charts: ChartDefinition[];
245
- }
246
-
247
- // Read Excel with full fidelity
248
- async function readExcel(buffer: Buffer): Promise<SpreadsheetData> {
249
- const workbook = new ExcelJS.Workbook();
250
- await workbook.xlsx.load(buffer);
251
-
252
- const sheets: SheetData[] = [];
253
-
254
- workbook.eachSheet((worksheet, sheetId) => {
255
- const headers: string[] = [];
256
- const rows: Record<string, any>[] = [];
257
- const formulas: FormulaCell[] = [];
258
-
259
- // Get headers from first row
260
- worksheet.getRow(1).eachCell((cell, colNumber) => {
261
- headers[colNumber - 1] = cell.value?.toString() || `Column${colNumber}`;
262
- });
263
-
264
- // Get data rows
265
- worksheet.eachRow((row, rowNumber) => {
266
- if (rowNumber === 1) return; // Skip header
267
-
268
- const rowData: Record<string, any> = {};
269
- row.eachCell((cell, colNumber) => {
270
- const header = headers[colNumber - 1];
271
-
272
- // Preserve formulas
273
- if (cell.formula) {
274
- formulas.push({
275
- row: rowNumber,
276
- col: colNumber,
277
- formula: cell.formula,
278
- result: cell.value,
279
- });
280
- }
281
-
282
- rowData[header] = cell.value;
283
- });
284
-
285
- rows.push(rowData);
286
- });
287
-
288
- sheets.push({
289
- name: worksheet.name,
290
- headers,
291
- rows,
292
- formulas,
293
- charts: extractCharts(worksheet),
294
- });
295
- });
296
-
297
- return {
298
- sheets,
299
- metadata: {
300
- creator: workbook.creator,
301
- created: workbook.created,
302
- modified: workbook.modified,
303
- },
304
- };
305
- }
306
-
307
- // Generate Excel with formatting
308
- async function generateExcel(config: ExcelConfig): Promise<Buffer> {
309
- const workbook = new ExcelJS.Workbook();
310
- workbook.creator = config.author || 'Document Processor';
311
- workbook.created = new Date();
312
-
313
- for (const sheetConfig of config.sheets) {
314
- const worksheet = workbook.addWorksheet(sheetConfig.name);
315
-
316
- // Add headers with styling
317
- worksheet.addRow(sheetConfig.headers);
318
- worksheet.getRow(1).font = { bold: true };
319
- worksheet.getRow(1).fill = {
320
- type: 'pattern',
321
- pattern: 'solid',
322
- fgColor: { argb: 'FFE0E0E0' },
323
- };
324
-
325
- // Add data rows
326
- for (const row of sheetConfig.data) {
327
- worksheet.addRow(sheetConfig.headers.map(h => row[h]));
328
- }
329
-
330
- // Auto-fit columns
331
- worksheet.columns.forEach(column => {
332
- let maxLength = 0;
333
- column.eachCell({ includeEmpty: true }, cell => {
334
- const cellLength = cell.value?.toString().length || 10;
335
- maxLength = Math.max(maxLength, cellLength);
336
- });
337
- column.width = Math.min(maxLength + 2, 50);
338
- });
339
-
340
- // Add formulas if specified
341
- if (sheetConfig.formulas) {
342
- for (const formula of sheetConfig.formulas) {
343
- worksheet.getCell(formula.cell).value = { formula: formula.formula };
344
- }
345
- }
346
-
347
- // Add conditional formatting
348
- if (sheetConfig.conditionalFormatting) {
349
- worksheet.addConditionalFormatting({
350
- ref: sheetConfig.conditionalFormatting.range,
351
- rules: sheetConfig.conditionalFormatting.rules,
352
- });
353
- }
354
- }
355
-
356
- return Buffer.from(await workbook.xlsx.writeBuffer());
357
- }
358
-
359
- // Data transformation utilities
360
- function transformExcelData(data: SheetData, transform: DataTransform): SheetData {
361
- let rows = [...data.rows];
362
-
363
- // Filter rows
364
- if (transform.filter) {
365
- rows = rows.filter(row => transform.filter!(row));
366
- }
367
-
368
- // Map columns
369
- if (transform.columnMap) {
370
- rows = rows.map(row => {
371
- const newRow: Record<string, any> = {};
372
- for (const [oldKey, newKey] of Object.entries(transform.columnMap!)) {
373
- newRow[newKey] = row[oldKey];
374
- }
375
- return newRow;
376
- });
377
- }
55
+ ## Features
378
56
 
379
- // Aggregate if specified
380
- if (transform.groupBy) {
381
- rows = aggregateRows(rows, transform.groupBy, transform.aggregations!);
382
- }
57
+ | Feature | Description | Guide |
58
+ |---------|-------------|-------|
59
+ | PDF Extraction | Extract text, tables, images, and metadata from PDFs | Use pdf-lib or pdf-parse for text extraction |
60
+ | PDF Generation | Create PDFs from templates with data binding | Use pdf-lib with text, images, and table elements |
61
+ | DOCX Parsing | Parse Word documents preserving structure | Use mammoth or docx library for parsing |
62
+ | DOCX Generation | Generate Word documents with formatting | Use docx package with paragraphs and tables |
63
+ | Excel Reading | Read spreadsheets with formulas and formatting | Use exceljs to iterate sheets and cells |
64
+ | Excel Generation | Create spreadsheets with charts and styling | Use exceljs with conditional formatting |
65
+ | PPTX Generation | Create presentations with slides and charts | Use pptxgenjs for slide creation |
66
+ | Batch Processing | Process multiple documents with concurrency | Use p-queue for controlled parallel processing |
67
+ | Template Engine | Generate documents from templates with placeholders | Use docxtemplater for DOCX templates |
68
+ | Streaming | Handle large files without memory exhaustion | Process files in chunks with streams |
383
69
 
384
- return {
385
- ...data,
386
- headers: transform.columnMap
387
- ? Object.values(transform.columnMap)
388
- : data.headers,
389
- rows,
390
- };
391
- }
392
- ```
70
+ ## Common Patterns
393
71
 
394
- ### 4. PowerPoint Processing (PPTX)
72
+ ### Batch Document Processing
395
73
 
396
74
  ```typescript
397
- import PptxGenJS from 'pptxgenjs';
398
-
399
- interface PresentationConfig {
400
- title: string;
401
- author?: string;
402
- theme?: ThemeConfig;
403
- slides: SlideConfig[];
404
- }
405
-
406
- interface SlideConfig {
407
- layout: 'title' | 'content' | 'twoColumn' | 'comparison' | 'blank';
408
- title?: string;
409
- subtitle?: string;
410
- content?: SlideContent[];
411
- notes?: string;
412
- }
413
-
414
- // Generate PowerPoint presentation
415
- async function generatePPTX(config: PresentationConfig): Promise<Buffer> {
416
- const pptx = new PptxGenJS();
417
-
418
- // Set metadata
419
- pptx.title = config.title;
420
- pptx.author = config.author || 'Document Processor';
421
-
422
- // Apply theme
423
- if (config.theme) {
424
- pptx.defineLayout({
425
- name: 'CUSTOM',
426
- width: config.theme.width || 10,
427
- height: config.theme.height || 7.5,
75
+ import PQueue from 'p-queue';
76
+
77
+ async function processBatch(files: string[], transform: (buffer: Buffer) => Promise<Buffer>) {
78
+ const queue = new PQueue({ concurrency: 4 });
79
+ const results: { file: string; success: boolean; error?: string }[] = [];
80
+
81
+ for (const file of files) {
82
+ queue.add(async () => {
83
+ try {
84
+ const buffer = await fs.readFile(file);
85
+ const output = await transform(buffer);
86
+ await fs.writeFile(file.replace(/\.\w+$/, '_processed.pdf'), output);
87
+ results.push({ file, success: true });
88
+ } catch (error) {
89
+ results.push({ file, success: false, error: error.message });
90
+ }
428
91
  });
429
92
  }
430
93
 
431
- // Generate slides
432
- for (const slideConfig of config.slides) {
433
- const slide = pptx.addSlide();
434
-
435
- switch (slideConfig.layout) {
436
- case 'title':
437
- slide.addText(slideConfig.title || '', {
438
- x: 0.5, y: 2.5, w: 9, h: 1,
439
- fontSize: 44, bold: true, align: 'center',
440
- });
441
- if (slideConfig.subtitle) {
442
- slide.addText(slideConfig.subtitle, {
443
- x: 0.5, y: 3.5, w: 9, h: 0.5,
444
- fontSize: 24, color: '666666', align: 'center',
445
- });
446
- }
447
- break;
448
-
449
- case 'content':
450
- slide.addText(slideConfig.title || '', {
451
- x: 0.5, y: 0.3, w: 9, h: 0.8,
452
- fontSize: 32, bold: true,
453
- });
454
- let yPos = 1.2;
455
- for (const content of slideConfig.content || []) {
456
- yPos = addSlideContent(slide, content, yPos);
457
- }
458
- break;
459
-
460
- case 'twoColumn':
461
- slide.addText(slideConfig.title || '', {
462
- x: 0.5, y: 0.3, w: 9, h: 0.8,
463
- fontSize: 32, bold: true,
464
- });
465
- // Left column
466
- addSlideContent(slide, slideConfig.content![0], 1.2, 0.5, 4.3);
467
- // Right column
468
- addSlideContent(slide, slideConfig.content![1], 1.2, 5.2, 4.3);
469
- break;
470
- }
471
-
472
- // Add speaker notes
473
- if (slideConfig.notes) {
474
- slide.addNotes(slideConfig.notes);
475
- }
476
- }
477
-
478
- return Buffer.from(await pptx.write({ outputType: 'arraybuffer' }));
479
- }
480
-
481
- // Add various content types to slide
482
- function addSlideContent(
483
- slide: PptxGenJS.Slide,
484
- content: SlideContent,
485
- y: number,
486
- x: number = 0.5,
487
- w: number = 9
488
- ): number {
489
- switch (content.type) {
490
- case 'text':
491
- slide.addText(content.value, {
492
- x, y, w, h: 0.5,
493
- fontSize: content.fontSize || 18,
494
- bullet: content.bullet,
495
- });
496
- return y + 0.6;
497
-
498
- case 'bullets':
499
- slide.addText(
500
- content.items.map(item => ({ text: item, options: { bullet: true } })),
501
- { x, y, w, fontSize: 18 }
502
- );
503
- return y + content.items.length * 0.4 + 0.2;
504
-
505
- case 'image':
506
- slide.addImage({
507
- path: content.path,
508
- x, y, w: content.width || 4, h: content.height || 3,
509
- });
510
- return y + (content.height || 3) + 0.2;
511
-
512
- case 'chart':
513
- slide.addChart(content.chartType, content.data, {
514
- x, y, w, h: content.height || 4,
515
- });
516
- return y + (content.height || 4) + 0.2;
517
-
518
- case 'table':
519
- slide.addTable(content.data, {
520
- x, y, w,
521
- border: { pt: 1, color: 'CFCFCF' },
522
- fontFace: 'Arial',
523
- fontSize: 14,
524
- });
525
- return y + content.data.length * 0.4 + 0.2;
526
-
527
- default:
528
- return y;
529
- }
94
+ await queue.onIdle();
95
+ return results;
530
96
  }
531
97
  ```
532
98
 
533
- ### 5. Batch Processing Pipeline
99
+ ### Excel Report Generation
534
100
 
535
101
  ```typescript
536
- interface BatchConfig {
537
- inputDir: string;
538
- outputDir: string;
539
- concurrency: number;
540
- transform: DocumentTransform;
541
- onProgress?: (progress: BatchProgress) => void;
542
- onError?: (error: BatchError) => void;
543
- }
544
-
545
- interface BatchProgress {
546
- total: number;
547
- processed: number;
548
- succeeded: number;
549
- failed: number;
550
- currentFile: string;
551
- }
552
-
553
- // Batch document processing with streaming
554
- async function processBatch(config: BatchConfig): Promise<BatchResult> {
555
- const files = await glob(`${config.inputDir}/**/*.{pdf,docx,xlsx,pptx}`);
556
- const results: ProcessingResult[] = [];
557
-
558
- const progress: BatchProgress = {
559
- total: files.length,
560
- processed: 0,
561
- succeeded: 0,
562
- failed: 0,
563
- currentFile: '',
564
- };
565
-
566
- // Process with concurrency limit
567
- const queue = new PQueue({ concurrency: config.concurrency });
568
-
569
- const tasks = files.map(file => queue.add(async () => {
570
- progress.currentFile = file;
571
- config.onProgress?.(progress);
572
-
573
- try {
574
- const buffer = await fs.readFile(file);
575
- const ext = path.extname(file).toLowerCase();
576
-
577
- // Process based on file type
578
- let result: Buffer;
579
- switch (ext) {
580
- case '.pdf':
581
- result = await transformPDF(buffer, config.transform);
582
- break;
583
- case '.docx':
584
- result = await transformDOCX(buffer, config.transform);
585
- break;
586
- case '.xlsx':
587
- result = await transformExcel(buffer, config.transform);
588
- break;
589
- case '.pptx':
590
- result = await transformPPTX(buffer, config.transform);
591
- break;
592
- default:
593
- throw new Error(`Unsupported file type: ${ext}`);
594
- }
595
-
596
- // Write output
597
- const outputPath = path.join(
598
- config.outputDir,
599
- path.relative(config.inputDir, file)
600
- );
601
- await fs.mkdir(path.dirname(outputPath), { recursive: true });
602
- await fs.writeFile(outputPath, result);
603
-
604
- progress.succeeded++;
605
- results.push({ file, success: true });
606
- } catch (error) {
607
- progress.failed++;
608
- results.push({ file, success: false, error: error.message });
609
- config.onError?.({ file, error });
610
- } finally {
611
- progress.processed++;
612
- config.onProgress?.(progress);
613
- }
614
- }));
102
+ async function generateReport(data: Record<string, any>[]): Promise<Buffer> {
103
+ const workbook = new ExcelJS.Workbook();
104
+ const sheet = workbook.addWorksheet('Report');
615
105
 
616
- await Promise.all(tasks);
106
+ // Add headers with styling
107
+ const headers = Object.keys(data[0] || {});
108
+ sheet.addRow(headers);
109
+ sheet.getRow(1).font = { bold: true };
110
+ sheet.getRow(1).fill = { type: 'pattern', pattern: 'solid', fgColor: { argb: 'FFE0E0E0' } };
617
111
 
618
- return {
619
- total: files.length,
620
- succeeded: progress.succeeded,
621
- failed: progress.failed,
622
- results,
623
- };
624
- }
112
+ // Add data rows
113
+ data.forEach(row => sheet.addRow(headers.map(h => row[h])));
625
114
 
626
- // Stream large file processing
627
- async function* streamProcess(
628
- inputStream: ReadStream,
629
- transform: ChunkTransform
630
- ): AsyncGenerator<Buffer> {
631
- const chunks: Buffer[] = [];
632
- let processedSize = 0;
633
-
634
- for await (const chunk of inputStream) {
635
- chunks.push(chunk);
636
- processedSize += chunk.length;
637
-
638
- // Process in chunks for memory efficiency
639
- if (processedSize >= CHUNK_SIZE) {
640
- const combined = Buffer.concat(chunks);
641
- yield await transform(combined);
642
- chunks.length = 0;
643
- processedSize = 0;
644
- }
645
- }
115
+ // Auto-fit columns
116
+ sheet.columns.forEach(col => { col.width = 15; });
646
117
 
647
- // Process remaining
648
- if (chunks.length > 0) {
649
- yield await transform(Buffer.concat(chunks));
650
- }
118
+ return Buffer.from(await workbook.xlsx.writeBuffer());
651
119
  }
652
120
  ```
653
121
 
654
- ### 6. Template-Based Document Generation
122
+ ### Invoice Generation from Template
655
123
 
656
124
  ```typescript
657
- interface DocumentTemplate {
658
- type: 'pdf' | 'docx' | 'xlsx' | 'pptx';
659
- templatePath: string;
660
- placeholders: PlaceholderConfig[];
661
- }
662
-
663
- interface PlaceholderConfig {
664
- key: string;
665
- type: 'text' | 'image' | 'table' | 'chart' | 'list';
666
- format?: FormatOptions;
667
- }
668
-
669
- // Generate document from template with data binding
670
- async function generateFromTemplate(
671
- template: DocumentTemplate,
672
- data: Record<string, any>
673
- ): Promise<Buffer> {
674
- const templateBuffer = await fs.readFile(template.templatePath);
675
-
676
- switch (template.type) {
677
- case 'docx':
678
- return generateDOCXFromTemplate(templateBuffer, template.placeholders, data);
679
- case 'xlsx':
680
- return generateExcelFromTemplate(templateBuffer, template.placeholders, data);
681
- case 'pptx':
682
- return generatePPTXFromTemplate(templateBuffer, template.placeholders, data);
683
- case 'pdf':
684
- return generatePDFFromTemplate(templateBuffer, template.placeholders, data);
685
- }
686
- }
687
-
688
- // DOCX template processing
689
- async function generateDOCXFromTemplate(
690
- templateBuffer: Buffer,
691
- placeholders: PlaceholderConfig[],
692
- data: Record<string, any>
693
- ): Promise<Buffer> {
694
- const doc = new Docxtemplater(new PizZip(templateBuffer), {
695
- paragraphLoop: true,
696
- linebreaks: true,
697
- });
698
-
699
- // Build data object with formatting
700
- const templateData: Record<string, any> = {};
701
-
702
- for (const placeholder of placeholders) {
703
- const value = data[placeholder.key];
704
-
705
- switch (placeholder.type) {
706
- case 'text':
707
- templateData[placeholder.key] = formatText(value, placeholder.format);
708
- break;
709
- case 'table':
710
- templateData[placeholder.key] = formatTableData(value);
711
- break;
712
- case 'image':
713
- templateData[placeholder.key] = await loadImage(value);
714
- break;
715
- case 'list':
716
- templateData[placeholder.key] = value.map((item: any) => ({ item }));
717
- break;
718
- }
719
- }
720
-
721
- doc.render(templateData);
722
-
723
- return doc.getZip().generate({
724
- type: 'nodebuffer',
725
- compression: 'DEFLATE',
125
+ import Docxtemplater from 'docxtemplater';
126
+ import PizZip from 'pizzip';
127
+
128
+ async function generateInvoice(templatePath: string, invoiceData: InvoiceData): Promise<Buffer> {
129
+ const templateBuffer = await fs.readFile(templatePath);
130
+ const zip = new PizZip(templateBuffer);
131
+ const doc = new Docxtemplater(zip, { paragraphLoop: true, linebreaks: true });
132
+
133
+ doc.render({
134
+ invoiceNumber: invoiceData.number,
135
+ date: invoiceData.date,
136
+ customer: invoiceData.customer,
137
+ items: invoiceData.items,
138
+ total: invoiceData.total,
726
139
  });
727
- }
728
- ```
729
-
730
- ## Use Cases
731
140
 
732
- ### 1. Invoice Generation System
733
-
734
- ```typescript
735
- // Generate invoices from order data
736
- async function generateInvoice(order: Order): Promise<Buffer> {
737
- const template: DOCXConfig = {
738
- title: `Invoice #${order.invoiceNumber}`,
739
- content: [
740
- { text: `Date: ${formatDate(order.date)}` },
741
- { text: `Customer: ${order.customer.name}` },
742
- { text: `Address: ${order.customer.address}` },
743
- ],
744
- tableData: {
745
- headers: ['Item', 'Quantity', 'Price', 'Total'],
746
- rows: order.items.map(item => [
747
- item.name,
748
- item.quantity.toString(),
749
- formatCurrency(item.price),
750
- formatCurrency(item.quantity * item.price),
751
- ]),
752
- },
753
- };
754
-
755
- // Add totals
756
- template.content.push(
757
- { text: '' },
758
- { text: `Subtotal: ${formatCurrency(order.subtotal)}` },
759
- { text: `Tax: ${formatCurrency(order.tax)}` },
760
- { text: `Total: ${formatCurrency(order.total)}`, bold: true },
761
- );
762
-
763
- return generateDOCX(template);
141
+ return doc.getZip().generate({ type: 'nodebuffer', compression: 'DEFLATE' });
764
142
  }
765
143
  ```
766
144
 
767
- ### 2. Report Dashboard Export
145
+ ### PDF Table Extraction
768
146
 
769
147
  ```typescript
770
- // Export dashboard data to Excel with charts
771
- async function exportDashboard(dashboard: DashboardData): Promise<Buffer> {
772
- return generateExcel({
773
- author: 'Analytics System',
774
- sheets: [
775
- {
776
- name: 'Summary',
777
- headers: ['Metric', 'Value', 'Change'],
778
- data: dashboard.kpis.map(kpi => ({
779
- Metric: kpi.name,
780
- Value: kpi.value,
781
- Change: `${kpi.change > 0 ? '+' : ''}${kpi.change}%`,
782
- })),
783
- conditionalFormatting: {
784
- range: 'C2:C100',
785
- rules: [
786
- { type: 'cellIs', operator: 'greaterThan', formulae: [0], style: { fill: { argb: 'FF00FF00' } } },
787
- { type: 'cellIs', operator: 'lessThan', formulae: [0], style: { fill: { argb: 'FFFF0000' } } },
788
- ],
789
- },
790
- },
791
- {
792
- name: 'Detailed Data',
793
- headers: Object.keys(dashboard.detailedData[0] || {}),
794
- data: dashboard.detailedData,
795
- },
796
- ],
797
- });
798
- }
799
- ```
148
+ async function extractTables(pdfBuffer: Buffer): Promise<ExtractedTable[]> {
149
+ const pdfDoc = await PDFDocument.load(pdfBuffer);
150
+ const tables: ExtractedTable[] = [];
800
151
 
801
- ### 3. Contract Analysis Pipeline
152
+ for (let i = 0; i < pdfDoc.getPageCount(); i++) {
153
+ const page = pdfDoc.getPage(i);
154
+ const content = await extractPageContent(page);
155
+ const detectedTables = detectTableStructures(content);
156
+ tables.push(...detectedTables.map(t => ({ ...t, pageNumber: i + 1 })));
157
+ }
802
158
 
803
- ```typescript
804
- // Extract and analyze contract data
805
- async function analyzeContract(pdfBuffer: Buffer): Promise<ContractAnalysis> {
806
- const extracted = await extractStructuredPDF(pdfBuffer);
807
-
808
- return {
809
- parties: extractParties(extracted.text),
810
- dates: extractDates(extracted.text),
811
- amounts: extractMonetaryAmounts(extracted.text),
812
- clauses: categorizeClauses(extracted.text),
813
- signatures: detectSignatures(extracted.images),
814
- tables: extracted.tables.map(analyzeTable),
815
- };
159
+ return tables;
816
160
  }
817
161
  ```
818
162
 
819
163
  ## Best Practices
820
164
 
821
- ### Do's
822
-
823
- - **Stream large files** - Use streaming for files > 10MB to prevent memory issues
824
- - **Validate inputs** - Check file types and sizes before processing
825
- - **Handle encoding** - Support UTF-8 and detect encoding issues gracefully
826
- - **Preserve formatting** - Maintain original formatting when transforming
827
- - **Cache parsed results** - Cache extracted data for repeated access
828
- - **Use appropriate libraries** - pdf-lib for PDFs, exceljs for Excel, docx for Word
829
-
830
- ### Don'ts
831
-
832
- - Don't load entire large files into memory
833
- - Don't assume file extensions match content
834
- - Don't ignore password-protected documents
835
- - Don't strip metadata without user consent
836
- - Don't process untrusted files without sandboxing
837
- - Don't skip error handling for corrupt files
838
-
839
- ### Error Handling
840
-
841
- ```typescript
842
- class DocumentProcessingError extends Error {
843
- constructor(
844
- message: string,
845
- public readonly code: ErrorCode,
846
- public readonly file?: string,
847
- public readonly cause?: Error
848
- ) {
849
- super(message);
850
- this.name = 'DocumentProcessingError';
851
- }
852
- }
853
-
854
- enum ErrorCode {
855
- INVALID_FORMAT = 'INVALID_FORMAT',
856
- CORRUPT_FILE = 'CORRUPT_FILE',
857
- PASSWORD_PROTECTED = 'PASSWORD_PROTECTED',
858
- ENCODING_ERROR = 'ENCODING_ERROR',
859
- SIZE_LIMIT_EXCEEDED = 'SIZE_LIMIT_EXCEEDED',
860
- UNSUPPORTED_FEATURE = 'UNSUPPORTED_FEATURE',
861
- }
862
-
863
- // Comprehensive error handling
864
- async function safeProcessDocument(buffer: Buffer, filename: string): Promise<ProcessResult> {
865
- try {
866
- // Validate file
867
- const fileType = await detectFileType(buffer);
868
- if (!SUPPORTED_TYPES.includes(fileType)) {
869
- throw new DocumentProcessingError(
870
- `Unsupported file type: ${fileType}`,
871
- ErrorCode.INVALID_FORMAT,
872
- filename
873
- );
874
- }
875
-
876
- // Check size
877
- if (buffer.length > MAX_FILE_SIZE) {
878
- throw new DocumentProcessingError(
879
- `File exceeds maximum size of ${MAX_FILE_SIZE} bytes`,
880
- ErrorCode.SIZE_LIMIT_EXCEEDED,
881
- filename
882
- );
883
- }
884
-
885
- // Process
886
- return await processDocument(buffer, fileType);
887
- } catch (error) {
888
- if (error instanceof DocumentProcessingError) {
889
- throw error;
890
- }
891
-
892
- // Wrap unexpected errors
893
- throw new DocumentProcessingError(
894
- `Failed to process document: ${error.message}`,
895
- ErrorCode.CORRUPT_FILE,
896
- filename,
897
- error
898
- );
899
- }
900
- }
901
- ```
165
+ | Do | Avoid |
166
+ |----|-------|
167
+ | Stream large files (>10MB) to prevent memory issues | Loading entire large files into memory |
168
+ | Validate file types before processing | Assuming file extensions match content |
169
+ | Handle password-protected documents gracefully | Ignoring encrypted document errors |
170
+ | Preserve original formatting when transforming | Stripping formatting without user consent |
171
+ | Cache parsed results for repeated access | Re-parsing the same document multiple times |
172
+ | Use appropriate libraries per format | Building custom parsers for standard formats |
173
+ | Set file size limits for uploads | Processing unbounded file sizes |
174
+ | Sanitize filenames and paths | Using untrusted paths directly |
175
+ | Handle encoding issues (UTF-8, BOM) | Assuming all files use the same encoding |
176
+ | Log processing errors with context | Silently failing on corrupt files |
902
177
 
903
178
  ## Related Skills
904
179
 
905
- - **python** - Alternative processing with python-docx, openpyxl, PyPDF2
180
+ - **media-processing** - Video and audio processing
181
+ - **image-processing** - Image manipulation with Sharp
906
182
  - **typescript** - Type-safe document handling
907
- - **data-processing** - Data transformation utilities
908
- - **api-architecture** - Document API design patterns
909
183
 
910
- ## Reference Resources
184
+ ## References
911
185
 
912
186
  - [pdf-lib Documentation](https://pdf-lib.js.org/)
913
187
  - [ExcelJS Documentation](https://github.com/exceljs/exceljs)
914
188
  - [docx Documentation](https://docx.js.org/)
915
189
  - [PptxGenJS Documentation](https://gitbrent.github.io/PptxGenJS/)
916
- - [Apache POI](https://poi.apache.org/) - Java reference