omgkit 2.0.7 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,916 @@
1
+ ---
2
+ name: document-processing
3
+ description: Enterprise-grade document processing for PDF, DOCX, XLSX, PPTX with streaming, validation, and batch operations
4
+ category: tools
5
+ triggers:
6
+ - document processing
7
+ - pdf extraction
8
+ - docx parsing
9
+ - excel manipulation
10
+ - spreadsheet data
11
+ - powerpoint generation
12
+ - office documents
13
+ ---
14
+
15
+ # Document Processing
16
+
17
+ Enterprise-grade **document processing** for PDF, DOCX, XLSX, and PPTX files. This skill enables extraction, manipulation, generation, and batch processing of office documents with streaming support for large files.
18
+
19
+ ## Purpose
20
+
21
+ Handle document processing tasks that enterprise applications commonly require:
22
+
23
+ - Extract text and structured data from PDFs
24
+ - Parse and generate Word documents
25
+ - Manipulate Excel spreadsheets programmatically
26
+ - Create PowerPoint presentations from data
27
+ - Process documents in batch with progress tracking
28
+
29
+ ## Features
30
+
31
+ ### 1. PDF Processing
32
+
33
+ ```typescript
34
+ // PDF text extraction with structure preservation
35
+ import { PDFDocument, PDFExtract } from 'pdf-lib';
36
+
37
/** Result of a structured PDF extraction, as produced by `extractStructuredPDF`. */
interface PDFExtractionResult {
  /** Content of every page, joined with a blank line between pages. */
  text: string;
  /** Per-page details, in document order. */
  pages: PageContent[];
  /** Document-level metadata. */
  metadata: PDFMetadata;
  /** Tables from all pages, flattened into a single list. */
  tables: ExtractedTable[];
  /** Images extracted from the document. */
  images: ExtractedImage[];
}
44
+
45
+ // Basic text extraction
46
/**
 * Extracts the plain text of a PDF.
 *
 * Text items on a page are joined with single spaces; pages are separated by
 * a blank line.
 *
 * NOTE(review): this file imports `PDFExtract` from 'pdf-lib', but the class
 * looks like it belongs to the `pdf.js-extract` package — confirm the import.
 *
 * @param buffer Raw bytes of the PDF file.
 * @returns The concatenated text of all pages.
 */
async function extractPDFText(buffer: Buffer): Promise<string> {
  const extractor = new PDFExtract();
  const { pages } = await extractor.extractBuffer(buffer);

  const pageTexts: string[] = [];
  for (const page of pages) {
    const fragments = page.content.map(item => item.str);
    pageTexts.push(fragments.join(' '));
  }

  return pageTexts.join('\n\n');
}
57
+
58
+ // Structured extraction with tables
59
/**
 * Loads a PDF and collects, for every page, its dimensions, content and
 * detected tables, plus document-level metadata and images.
 *
 * Pages are processed one at a time, in document order.
 *
 * @param buffer Raw bytes of the PDF file.
 * @returns The combined extraction result.
 */
async function extractStructuredPDF(buffer: Buffer): Promise<PDFExtractionResult> {
  const pdfDoc = await PDFDocument.load(buffer);
  const pages: PageContent[] = [];

  let index = 0;
  while (index < pdfDoc.getPageCount()) {
    const page = pdfDoc.getPage(index);
    const width = page.getWidth();
    const height = page.getHeight();
    const content = await extractPageContent(page);
    const tables = await detectTables(page);

    // Page numbers exposed to callers are 1-based.
    pages.push({ pageNumber: index + 1, width, height, content, tables });
    index += 1;
  }

  const text = pages.map(p => p.content).join('\n\n');
  const metadata = await extractMetadata(pdfDoc);
  const tables = pages.flatMap(p => p.tables);
  const images = await extractImages(pdfDoc);

  return { text, pages, metadata, tables, images };
}
82
+
83
+ // PDF generation from template
84
/**
 * Renders a single-page PDF from a template.
 *
 * Template coordinates are measured from the top-left corner; they are
 * converted to PDF coordinates (origin at the bottom-left) by subtracting
 * from the page height.
 *
 * @param template Template whose `elements` are drawn in order.
 * @param data Values used for variable substitution (text), image URLs
 *   (image) and table rows (table), looked up by each element's `dataKey`.
 * @returns The serialized PDF.
 * @throws Error when an image cannot be downloaded.
 */
async function generatePDF(template: PDFTemplate, data: Record<string, any>): Promise<Buffer> {
  const pdfDoc = await PDFDocument.create();
  const page = pdfDoc.addPage();
  const { height } = page.getSize();

  // Embed each distinct font once; re-embedding per text element bloats the
  // output and repeats work. Promises are cached so lookups stay simple.
  const embeddedFonts = new Map<unknown, ReturnType<typeof pdfDoc.embedFont>>();
  const getFont = (font: Parameters<typeof pdfDoc.embedFont>[0]) => {
    let pending = embeddedFonts.get(font);
    if (pending === undefined) {
      pending = pdfDoc.embedFont(font);
      embeddedFonts.set(font, pending);
    }
    return pending;
  };

  // Apply template with data substitution
  for (const element of template.elements) {
    switch (element.type) {
      case 'text': {
        const text = substituteVariables(element.content, data);
        page.drawText(text, {
          x: element.x,
          y: height - element.y,
          size: element.fontSize || 12,
          font: await getFont(element.font || StandardFonts.Helvetica),
        });
        break;
      }
      case 'image': {
        const response = await fetch(data[element.dataKey]);
        if (!response.ok) {
          throw new Error(
            `Failed to fetch image for "${element.dataKey}": HTTP ${response.status}`
          );
        }
        const imageBytes = await response.arrayBuffer();

        // JPEG files start with FF D8 FF; everything else is treated as PNG,
        // which was the only format handled before.
        const signature = new Uint8Array(imageBytes.slice(0, 3));
        const isJpeg =
          signature[0] === 0xff && signature[1] === 0xd8 && signature[2] === 0xff;
        const image = isJpeg
          ? await pdfDoc.embedJpg(imageBytes)
          : await pdfDoc.embedPng(imageBytes);

        page.drawImage(image, {
          x: element.x,
          // drawImage anchors at the bottom-left corner of the image.
          y: height - element.y - element.height,
          width: element.width,
          height: element.height,
        });
        break;
      }
      case 'table':
        await drawTable(page, element, data[element.dataKey]);
        break;
    }
  }

  return Buffer.from(await pdfDoc.save());
}
119
+ ```
120
+
121
+ ### 2. Word Document Processing (DOCX)
122
+
123
+ ```typescript
124
+ import { Document, Paragraph, TextRun, Table, TableRow, TableCell, Packer } from 'docx';
125
+
126
+ // Parse DOCX to structured format
127
/** Structured view of a DOCX file, as produced by `parseDOCX`. */
interface DOCXContent {
  /** Paragraphs extracted from the parsed `word/document.xml`. */
  paragraphs: ParsedParagraph[];
  /** Tables extracted from the parsed `word/document.xml`. */
  tables: ParsedTable[];
  /** Images extracted from the DOCX archive. */
  images: ParsedImage[];
  /** Style definitions parsed from the DOCX archive. */
  styles: DocumentStyles;
  /** Document metadata parsed from the DOCX archive. */
  metadata: DocumentMetadata;
}
134
+
135
/**
 * Parses a DOCX file into a structured representation.
 *
 * The file is opened as a ZIP archive, `word/document.xml` is parsed for
 * paragraphs and tables, and images, styles and metadata are read from the
 * archive itself.
 *
 * @param buffer Raw bytes of the DOCX file.
 * @returns The extracted content.
 * @throws Error when the archive has no `word/document.xml` entry.
 */
async function parseDOCX(buffer: Buffer): Promise<DOCXContent> {
  const zip = new JSZip();
  const doc = await zip.loadAsync(buffer);

  // Parse document.xml
  const documentXml = await doc.file('word/document.xml')?.async('string');
  if (documentXml === undefined) {
    // Without this guard `undefined` would be handed to the XML parser and
    // fail with an unrelated-looking error.
    throw new Error('Invalid DOCX: missing word/document.xml');
  }
  const parser = new XMLParser();
  const parsed = parser.parse(documentXml);

  // Extract content preserving structure
  return {
    paragraphs: extractParagraphs(parsed),
    tables: extractTables(parsed),
    images: await extractImages(doc),
    styles: await parseStyles(doc),
    metadata: await parseMetadata(doc),
  };
}
153
+
154
+ // Generate DOCX from template
155
/**
 * Builds a single-section Word document: a bold level-1 heading with the
 * title, one paragraph per content entry, and an optional table at the end.
 *
 * @param config Title, content sections and optional table data.
 * @returns The packed DOCX file.
 */
async function generateDOCX(config: DOCXConfig): Promise<Buffer> {
  const titleParagraph = new Paragraph({
    children: [new TextRun({ text: config.title, bold: true, size: 48 })],
    heading: HeadingLevel.HEADING_1,
    spacing: { after: 200 },
  });

  const bodyParagraphs = config.content.map(
    section =>
      new Paragraph({
        children: [new TextRun({ text: section.text, size: 24 })],
        spacing: { after: 120 },
      })
  );

  // The table is appended only when table data was supplied.
  const trailingTables = config.tableData ? [createTable(config.tableData)] : [];

  const doc = new Document({
    sections: [
      {
        properties: {
          page: {
            margin: { top: 720, right: 720, bottom: 720, left: 720 },
          },
        },
        children: [titleParagraph, ...bodyParagraphs, ...trailingTables],
      },
    ],
  });

  const packed = await Packer.toBuffer(doc);
  return packed;
}
196
+
197
+ // Create formatted table
198
/**
 * Creates a full-width table with a shaded, bold header row followed by one
 * row per data entry.
 *
 * @param data Column headers and the cell text for every row.
 * @returns The table, ready to be placed in a document section.
 */
function createTable(data: TableData): Table {
  const headerCells = data.headers.map(
    header =>
      new TableCell({
        children: [
          new Paragraph({
            children: [new TextRun({ text: header, bold: true })],
          }),
        ],
        shading: { fill: 'f0f0f0' },
      })
  );
  const headerRow = new TableRow({ children: headerCells, tableHeader: true });

  const bodyRows = data.rows.map(
    row =>
      new TableRow({
        children: row.map(
          cell =>
            new TableCell({
              children: [new Paragraph({ children: [new TextRun(cell)] })],
            })
        ),
      })
  );

  return new Table({
    rows: [headerRow, ...bodyRows],
    width: { size: 100, type: WidthType.PERCENTAGE },
  });
}
227
+ ```
228
+
229
+ ### 3. Excel Processing (XLSX)
230
+
231
+ ```typescript
232
+ import ExcelJS from 'exceljs';
233
+
234
/** Contents of a workbook, as produced by `readExcel`. */
interface SpreadsheetData {
  /** One entry per worksheet. */
  sheets: SheetData[];
  /** Workbook-level metadata (creator, created, modified). */
  metadata: WorkbookMetadata;
}
238
+
239
/** Contents of a single worksheet. */
interface SheetData {
  /** Worksheet name. */
  name: string;
  /** Column headers, taken from the first row when read by `readExcel`. */
  headers: string[];
  /** Data rows, each keyed by column header. */
  rows: Record<string, any>[];
  /** Formula cells found in the data rows. */
  formulas: FormulaCell[];
  /** Chart definitions belonging to the worksheet. */
  charts: ChartDefinition[];
}
246
+
247
+ // Read Excel with full fidelity
248
/**
 * Reads an XLSX workbook into plain data.
 *
 * The first row of every worksheet is treated as the header row; each
 * following row becomes an object keyed by header. Formula cells are
 * additionally listed in `formulas` with their position, formula text and
 * computed result.
 *
 * @param buffer Raw bytes of the XLSX file.
 * @returns All worksheets plus workbook metadata.
 */
async function readExcel(buffer: Buffer): Promise<SpreadsheetData> {
  const workbook = new ExcelJS.Workbook();
  await workbook.xlsx.load(buffer);

  const sheets: SheetData[] = [];

  workbook.eachSheet(worksheet => {
    const headers: string[] = [];
    const rows: Record<string, any>[] = [];
    const formulas: FormulaCell[] = [];

    // Get headers from first row
    worksheet.getRow(1).eachCell((cell, colNumber) => {
      headers[colNumber - 1] = cell.value?.toString() || `Column${colNumber}`;
    });

    // eachCell skips empty cells, so a gap in the header row would leave a
    // hole in `headers`; fill holes with the same fallback name.
    for (let i = 0; i < headers.length; i++) {
      if (headers[i] === undefined) {
        headers[i] = `Column${i + 1}`;
      }
    }

    // Get data rows
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // Skip header

      const rowData: Record<string, any> = {};
      row.eachCell((cell, colNumber) => {
        // Cells beyond the header row's width still get a usable key instead
        // of being stored under "undefined".
        const header = headers[colNumber - 1] ?? `Column${colNumber}`;

        // Preserve formulas
        if (cell.formula) {
          formulas.push({
            row: rowNumber,
            col: colNumber,
            formula: cell.formula,
            // For formula cells `cell.value` is the formula object itself;
            // the computed value is exposed as `cell.result`.
            result: cell.result,
          });
        }

        rowData[header] = cell.value;
      });

      rows.push(rowData);
    });

    sheets.push({
      name: worksheet.name,
      headers,
      rows,
      formulas,
      charts: extractCharts(worksheet),
    });
  });

  return {
    sheets,
    metadata: {
      creator: workbook.creator,
      created: workbook.created,
      modified: workbook.modified,
    },
  };
}
306
+
307
+ // Generate Excel with formatting
308
/**
 * Builds an XLSX workbook from a configuration object.
 *
 * For every configured sheet this writes a bold, grey-filled header row, one
 * row per data record (values picked by header name), sizes the columns to
 * their content (capped at 50), and then applies optional formulas and
 * conditional formatting.
 *
 * @param config Workbook author and sheet definitions.
 * @returns The serialized workbook.
 */
async function generateExcel(config: ExcelConfig): Promise<Buffer> {
  const workbook = new ExcelJS.Workbook();
  workbook.creator = config.author || 'Document Processor';
  workbook.created = new Date();

  for (const sheetConfig of config.sheets) {
    const worksheet = workbook.addWorksheet(sheetConfig.name);

    // Header row with styling
    worksheet.addRow(sheetConfig.headers);
    const headerRow = worksheet.getRow(1);
    headerRow.font = { bold: true };
    headerRow.fill = {
      type: 'pattern',
      pattern: 'solid',
      fgColor: { argb: 'FFE0E0E0' },
    };

    // Data rows: values are ordered to match the header order
    for (const record of sheetConfig.data) {
      const values = sheetConfig.headers.map(key => record[key]);
      worksheet.addRow(values);
    }

    // Size each column to its longest cell; empty cells count as 10 characters
    for (const column of worksheet.columns) {
      let widest = 0;
      column.eachCell({ includeEmpty: true }, cell => {
        const length = cell.value?.toString().length || 10;
        if (length > widest) {
          widest = length;
        }
      });
      column.width = Math.min(widest + 2, 50);
    }

    // Optional formulas
    if (sheetConfig.formulas) {
      for (const entry of sheetConfig.formulas) {
        worksheet.getCell(entry.cell).value = { formula: entry.formula };
      }
    }

    // Optional conditional formatting
    const formatting = sheetConfig.conditionalFormatting;
    if (formatting) {
      worksheet.addConditionalFormatting({
        ref: formatting.range,
        rules: formatting.rules,
      });
    }
  }

  const bytes = await workbook.xlsx.writeBuffer();
  return Buffer.from(bytes);
}
358
+
359
+ // Data transformation utilities
360
/**
 * Applies an optional filter, column renaming and aggregation to a sheet.
 *
 * Steps run in that order. The input sheet is not modified. When a column
 * map is given, only mapped columns are kept and the returned headers are
 * the map's target names.
 *
 * @param data Sheet to transform.
 * @param transform Filter predicate, column map and grouping options.
 * @returns A new sheet with the transformed rows and headers.
 */
function transformExcelData(data: SheetData, transform: DataTransform): SheetData {
  // Builds a row containing only the mapped columns, under their new names.
  const renameColumns = (row: Record<string, any>): Record<string, any> => {
    const renamed: Record<string, any> = {};
    Object.entries(transform.columnMap!).forEach(([from, to]) => {
      renamed[to] = row[from];
    });
    return renamed;
  };

  const copied = [...data.rows];

  const kept = transform.filter
    ? copied.filter(row => transform.filter!(row))
    : copied;

  const mapped = transform.columnMap
    ? kept.map(row => renameColumns(row))
    : kept;

  const rows = transform.groupBy
    ? aggregateRows(mapped, transform.groupBy, transform.aggregations!)
    : mapped;

  const headers = transform.columnMap
    ? Object.values(transform.columnMap)
    : data.headers;

  return { ...data, headers, rows };
}
392
+ ```
393
+
394
+ ### 4. PowerPoint Processing (PPTX)
395
+
396
+ ```typescript
397
+ import PptxGenJS from 'pptxgenjs';
398
+
399
/** Input for `generatePPTX`. */
interface PresentationConfig {
  /** Presentation title, stored in the file's metadata. */
  title: string;
  /** Author stored in the metadata; `generatePPTX` falls back to 'Document Processor'. */
  author?: string;
  /** Optional theme; its width and height define a custom slide layout. */
  theme?: ThemeConfig;
  /** Slides, generated in the given order. */
  slides: SlideConfig[];
}
405
+
406
/** Definition of a single slide. */
interface SlideConfig {
  /** Layout that decides how title, subtitle and content are placed. */
  layout: 'title' | 'content' | 'twoColumn' | 'comparison' | 'blank';
  /** Slide title. */
  title?: string;
  /** Subtitle; `generatePPTX` only renders it for the 'title' layout. */
  subtitle?: string;
  /** Content items placed on the slide. */
  content?: SlideContent[];
  /** Speaker notes attached to the slide. */
  notes?: string;
}
413
+
414
+ // Generate PowerPoint presentation
415
/**
 * Generates a PowerPoint presentation from a configuration object.
 *
 * Supported layouts:
 * - `title`: centered title with an optional subtitle.
 * - `content`: title followed by the content items stacked vertically.
 * - `twoColumn`: title with the first content item on the left and the
 *   second on the right; a missing item leaves its column empty.
 * - any other layout: no predefined content is added.
 *
 * @param config Title, author, optional theme and slide definitions.
 * @returns The serialized PPTX file.
 */
async function generatePPTX(config: PresentationConfig): Promise<Buffer> {
  const pptx = new PptxGenJS();

  // Set metadata
  pptx.title = config.title;
  pptx.author = config.author || 'Document Processor';

  // Apply theme
  if (config.theme) {
    pptx.defineLayout({
      name: 'CUSTOM',
      width: config.theme.width || 10,
      height: config.theme.height || 7.5,
    });
    // defineLayout only registers the layout; it must also be selected,
    // otherwise the presentation keeps the default slide size.
    pptx.layout = 'CUSTOM';
  }

  // Generate slides
  for (const slideConfig of config.slides) {
    const slide = pptx.addSlide();

    switch (slideConfig.layout) {
      case 'title': {
        slide.addText(slideConfig.title || '', {
          x: 0.5, y: 2.5, w: 9, h: 1,
          fontSize: 44, bold: true, align: 'center',
        });
        if (slideConfig.subtitle) {
          slide.addText(slideConfig.subtitle, {
            x: 0.5, y: 3.5, w: 9, h: 0.5,
            fontSize: 24, color: '666666', align: 'center',
          });
        }
        break;
      }

      case 'content': {
        slide.addText(slideConfig.title || '', {
          x: 0.5, y: 0.3, w: 9, h: 0.8,
          fontSize: 32, bold: true,
        });
        // Each item reports where the next one should start.
        let yPos = 1.2;
        for (const content of slideConfig.content || []) {
          yPos = addSlideContent(slide, content, yPos);
        }
        break;
      }

      case 'twoColumn': {
        slide.addText(slideConfig.title || '', {
          x: 0.5, y: 0.3, w: 9, h: 0.8,
          fontSize: 32, bold: true,
        });
        // Guard against missing or short content instead of crashing inside
        // addSlideContent on an undefined item.
        const [left, right] = slideConfig.content ?? [];
        if (left) {
          addSlideContent(slide, left, 1.2, 0.5, 4.3);
        }
        if (right) {
          addSlideContent(slide, right, 1.2, 5.2, 4.3);
        }
        break;
      }

      default:
        // 'comparison' and 'blank' have no predefined content.
        break;
    }

    // Add speaker notes
    if (slideConfig.notes) {
      slide.addNotes(slideConfig.notes);
    }
  }

  // With outputType 'arraybuffer' the library resolves to an ArrayBuffer.
  const bytes = (await pptx.write({ outputType: 'arraybuffer' })) as ArrayBuffer;
  return Buffer.from(bytes);
}
480
+
481
+ // Add various content types to slide
482
/**
 * Adds one content item to a slide and reports where the next item should
 * be placed.
 *
 * @param slide Slide to draw on.
 * @param content Item to add; its `type` selects text, bullets, image, chart
 *   or table rendering. Unknown types add nothing.
 * @param y Vertical position of the item.
 * @param x Horizontal position of the item.
 * @param w Width available to the item.
 * @returns The vertical position just below the added item, or `y` unchanged
 *   for an unknown content type.
 */
function addSlideContent(
  slide: PptxGenJS.Slide,
  content: SlideContent,
  y: number,
  x: number = 0.5,
  w: number = 9
): number {
  if (content.type === 'text') {
    const textOptions = {
      x, y, w, h: 0.5,
      fontSize: content.fontSize || 18,
      bullet: content.bullet,
    };
    slide.addText(content.value, textOptions);
    return y + 0.6;
  }

  if (content.type === 'bullets') {
    const bulletRuns = content.items.map(item => ({
      text: item,
      options: { bullet: true },
    }));
    slide.addText(bulletRuns, { x, y, w, fontSize: 18 });
    // 0.4 per bullet line plus a small gap below the list.
    return y + content.items.length * 0.4 + 0.2;
  }

  if (content.type === 'image') {
    slide.addImage({
      path: content.path,
      x, y, w: content.width || 4, h: content.height || 3,
    });
    return y + (content.height || 3) + 0.2;
  }

  if (content.type === 'chart') {
    slide.addChart(content.chartType, content.data, {
      x, y, w, h: content.height || 4,
    });
    return y + (content.height || 4) + 0.2;
  }

  if (content.type === 'table') {
    slide.addTable(content.data, {
      x, y, w,
      border: { pt: 1, color: 'CFCFCF' },
      fontFace: 'Arial',
      fontSize: 14,
    });
    // 0.4 per table row plus a small gap below the table.
    return y + content.data.length * 0.4 + 0.2;
  }

  return y;
}
531
+ ```
532
+
533
+ ### 5. Batch Processing Pipeline
534
+
535
+ ```typescript
536
/** Options for `processBatch`. */
interface BatchConfig {
  /** Directory searched recursively for pdf, docx, xlsx and pptx files. */
  inputDir: string;
  /** Directory receiving the results, mirroring each file's path relative to `inputDir`. */
  outputDir: string;
  /** Maximum number of files processed at the same time. */
  concurrency: number;
  /** Transformation handed to the per-format transform function. */
  transform: DocumentTransform;
  /** Called when a file starts and again when it finishes. */
  onProgress?: (progress: BatchProgress) => void;
  /** Called when processing a file fails. */
  onError?: (error: BatchError) => void;
}
544
+
545
/** Progress counters reported while a batch is running. */
interface BatchProgress {
  /** Number of files found for the batch. */
  total: number;
  /** Files finished so far, whether they succeeded or failed. */
  processed: number;
  /** Files processed and written successfully. */
  succeeded: number;
  /** Files whose processing threw an error. */
  failed: number;
  /** Path of the file the report refers to. */
  currentFile: string;
}
552
+
553
+ // Batch document processing with streaming
554
/**
 * Transforms every pdf, docx, xlsx and pptx file found under
 * `config.inputDir` and writes the results to `config.outputDir`, keeping
 * each file's relative path.
 *
 * Files are processed with at most `config.concurrency` running at once. A
 * failing file is recorded and reported through `onError`; it does not stop
 * the batch.
 *
 * @param config Directories, concurrency, transform and optional callbacks.
 * @returns Totals plus a per-file result list (in completion order).
 */
async function processBatch(config: BatchConfig): Promise<BatchResult> {
  const files = await glob(`${config.inputDir}/**/*.{pdf,docx,xlsx,pptx}`);
  const results: ProcessingResult[] = [];

  const progress: BatchProgress = {
    total: files.length,
    processed: 0,
    succeeded: 0,
    failed: 0,
    currentFile: '',
  };

  // Hand callers a snapshot: the counters keep changing while other files are
  // processed, and with concurrency > 1 the shared `currentFile` may already
  // name a different file by the time a task finishes.
  const reportProgress = (currentFile: string): void => {
    config.onProgress?.({ ...progress, currentFile });
  };

  // Process with concurrency limit
  const queue = new PQueue({ concurrency: config.concurrency });

  const tasks = files.map(file => queue.add(async () => {
    progress.currentFile = file;
    reportProgress(file);

    try {
      const buffer = await fs.readFile(file);
      const ext = path.extname(file).toLowerCase();

      // Process based on file type
      let result: Buffer;
      switch (ext) {
        case '.pdf':
          result = await transformPDF(buffer, config.transform);
          break;
        case '.docx':
          result = await transformDOCX(buffer, config.transform);
          break;
        case '.xlsx':
          result = await transformExcel(buffer, config.transform);
          break;
        case '.pptx':
          result = await transformPPTX(buffer, config.transform);
          break;
        default:
          throw new Error(`Unsupported file type: ${ext}`);
      }

      // Write output, mirroring the input directory structure
      const outputPath = path.join(
        config.outputDir,
        path.relative(config.inputDir, file)
      );
      await fs.mkdir(path.dirname(outputPath), { recursive: true });
      await fs.writeFile(outputPath, result);

      progress.succeeded++;
      results.push({ file, success: true });
    } catch (error) {
      // Anything can be thrown, so only read `.message` from real Errors.
      const message = error instanceof Error ? error.message : String(error);
      progress.failed++;
      results.push({ file, success: false, error: message });
      config.onError?.({ file, error });
    } finally {
      progress.processed++;
      reportProgress(file);
    }
  }));

  await Promise.all(tasks);

  return {
    total: files.length,
    succeeded: progress.succeeded,
    failed: progress.failed,
    results,
  };
}
625
+
626
+ // Stream large file processing
627
/**
 * Reads a stream and yields transformed output in batches.
 *
 * Incoming chunks are buffered until at least `CHUNK_SIZE` bytes are
 * collected; the buffered bytes are then concatenated, passed to
 * `transform`, and the result is yielded. Whatever remains when the stream
 * ends is transformed and yielded as a final batch.
 *
 * @param inputStream Stream to read from.
 * @param transform Function applied to each concatenated batch.
 * @returns An async generator of transformed buffers.
 */
async function* streamProcess(
  inputStream: ReadStream,
  transform: ChunkTransform
): AsyncGenerator<Buffer> {
  let pending: Buffer[] = [];
  let pendingBytes = 0;

  for await (const piece of inputStream) {
    pending.push(piece);
    pendingBytes += piece.length;

    // Keep buffering until a full batch is available.
    if (pendingBytes < CHUNK_SIZE) {
      continue;
    }

    const batch = Buffer.concat(pending);
    yield await transform(batch);
    pending = [];
    pendingBytes = 0;
  }

  // Flush whatever is left after the stream ends.
  if (pending.length > 0) {
    yield await transform(Buffer.concat(pending));
  }
}
652
+ ```
653
+
654
+ ### 6. Template-Based Document Generation
655
+
656
+ ```typescript
657
/** Template description consumed by `generateFromTemplate`. */
interface DocumentTemplate {
  /** Document format; selects the format-specific generator. */
  type: 'pdf' | 'docx' | 'xlsx' | 'pptx';
  /** Path of the template file to read. */
  templatePath: string;
  /** Placeholders to fill in the template. */
  placeholders: PlaceholderConfig[];
}
662
+
663
/** Describes one placeholder in a template. */
interface PlaceholderConfig {
  /** Key used to look up the placeholder's value in the data object. */
  key: string;
  /** Kind of content the placeholder holds. */
  type: 'text' | 'image' | 'table' | 'chart' | 'list';
  /** Optional formatting options. */
  format?: FormatOptions;
}
668
+
669
+ // Generate document from template with data binding
670
/**
 * Generates a document by filling a template file with data.
 *
 * The template file is read from `template.templatePath` and handed to the
 * generator matching `template.type`.
 *
 * @param template Template type, file path and placeholder definitions.
 * @param data Values for the placeholders, keyed by placeholder key.
 * @returns The generated document.
 * @throws Error when `template.type` is not one of the supported formats.
 */
async function generateFromTemplate(
  template: DocumentTemplate,
  data: Record<string, any>
): Promise<Buffer> {
  const templateBuffer = await fs.readFile(template.templatePath);

  switch (template.type) {
    case 'docx':
      return generateDOCXFromTemplate(templateBuffer, template.placeholders, data);
    case 'xlsx':
      return generateExcelFromTemplate(templateBuffer, template.placeholders, data);
    case 'pptx':
      return generatePPTXFromTemplate(templateBuffer, template.placeholders, data);
    case 'pdf':
      return generatePDFFromTemplate(templateBuffer, template.placeholders, data);
    default: {
      // Compile-time exhaustiveness check; at runtime this catches values
      // that bypassed the type system (e.g. parsed JSON) instead of silently
      // resolving to undefined.
      const unsupported: never = template.type;
      throw new Error(`Unsupported template type: ${String(unsupported)}`);
    }
  }
}
687
+
688
+ // DOCX template processing
689
/**
 * Fills a DOCX template with data and returns the rendered document.
 *
 * Each placeholder's value is looked up in `data` by key and prepared
 * according to the placeholder type (text, table, image or list) before the
 * template is rendered. Placeholder types without a case below ('chart') are
 * not added to the template data.
 *
 * @param templateBuffer Raw bytes of the DOCX template.
 * @param placeholders Placeholders to fill.
 * @param data Values keyed by placeholder key.
 * @returns The rendered DOCX file.
 */
async function generateDOCXFromTemplate(
  templateBuffer: Buffer,
  placeholders: PlaceholderConfig[],
  data: Record<string, any>
): Promise<Buffer> {
  const doc = new Docxtemplater(new PizZip(templateBuffer), {
    paragraphLoop: true,
    linebreaks: true,
  });

  // Build data object with formatting
  const templateData: Record<string, any> = {};

  for (const placeholder of placeholders) {
    const value = data[placeholder.key];

    switch (placeholder.type) {
      case 'text':
        templateData[placeholder.key] = formatText(value, placeholder.format);
        break;
      case 'table':
        templateData[placeholder.key] = formatTableData(value);
        break;
      case 'image':
        templateData[placeholder.key] = await loadImage(value);
        break;
      case 'list':
        // A missing or non-array value becomes an empty list rather than
        // crashing on `.map`.
        templateData[placeholder.key] = Array.isArray(value)
          ? value.map((item: any) => ({ item }))
          : [];
        break;
      default:
        // No data is prepared for other placeholder types.
        break;
    }
  }

  doc.render(templateData);

  return doc.getZip().generate({
    type: 'nodebuffer',
    compression: 'DEFLATE',
  });
}
728
+ ```
729
+
730
+ ## Use Cases
731
+
732
+ ### 1. Invoice Generation System
733
+
734
+ ```typescript
735
+ // Generate invoices from order data
736
/**
 * Creates an invoice document for an order.
 *
 * The document contains the invoice number as title, date and customer
 * lines, a table of line items, and subtotal, tax and total lines.
 *
 * @param order Order to invoice.
 * @returns The generated DOCX file.
 */
async function generateInvoice(order: Order): Promise<Buffer> {
  const title = `Invoice #${order.invoiceNumber}`;

  const headerLines = [
    { text: `Date: ${formatDate(order.date)}` },
    { text: `Customer: ${order.customer.name}` },
    { text: `Address: ${order.customer.address}` },
  ];

  const tableData = {
    headers: ['Item', 'Quantity', 'Price', 'Total'],
    rows: order.items.map(item => {
      const lineTotal = item.quantity * item.price;
      return [
        item.name,
        item.quantity.toString(),
        formatCurrency(item.price),
        formatCurrency(lineTotal),
      ];
    }),
  };

  // Totals follow the header lines, separated by an empty paragraph.
  const totalLines = [
    { text: '' },
    { text: `Subtotal: ${formatCurrency(order.subtotal)}` },
    { text: `Tax: ${formatCurrency(order.tax)}` },
    { text: `Total: ${formatCurrency(order.total)}`, bold: true },
  ];

  const template: DOCXConfig = {
    title,
    content: [...headerLines, ...totalLines],
    tableData,
  };

  return generateDOCX(template);
}
765
+ ```
766
+
767
+ ### 2. Report Dashboard Export
768
+
769
+ ```typescript
770
+ // Export dashboard data to Excel with charts
771
/**
 * Exports dashboard data as a two-sheet Excel workbook.
 *
 * The 'Summary' sheet lists every KPI with its value and signed percentage
 * change, with conditional formatting on the Change column. The
 * 'Detailed Data' sheet holds the detailed records, using the keys of the
 * first record as headers.
 *
 * @param dashboard KPIs and detailed records to export.
 * @returns The generated XLSX file.
 */
async function exportDashboard(dashboard: DashboardData): Promise<Buffer> {
  const summaryRows = dashboard.kpis.map(kpi => {
    // Positive changes get an explicit plus sign.
    const sign = kpi.change > 0 ? '+' : '';
    return {
      Metric: kpi.name,
      Value: kpi.value,
      Change: `${sign}${kpi.change}%`,
    };
  });

  // With no detailed records there are no headers.
  const detailHeaders = Object.keys(dashboard.detailedData[0] || {});

  return generateExcel({
    author: 'Analytics System',
    sheets: [
      {
        name: 'Summary',
        headers: ['Metric', 'Value', 'Change'],
        data: summaryRows,
        conditionalFormatting: {
          range: 'C2:C100',
          rules: [
            { type: 'cellIs', operator: 'greaterThan', formulae: [0], style: { fill: { argb: 'FF00FF00' } } },
            { type: 'cellIs', operator: 'lessThan', formulae: [0], style: { fill: { argb: 'FFFF0000' } } },
          ],
        },
      },
      {
        name: 'Detailed Data',
        headers: detailHeaders,
        data: dashboard.detailedData,
      },
    ],
  });
}
799
+ ```
800
+
801
+ ### 3. Contract Analysis Pipeline
802
+
803
+ ```typescript
804
+ // Extract and analyze contract data
805
/**
 * Extracts a contract PDF and derives parties, dates, monetary amounts,
 * clauses, signatures and analyzed tables from it.
 *
 * NOTE(review): the helper name `categorizeClausses` looks like a misspelling
 * of "categorizeClauses"; it is kept as-is here — confirm the helper's
 * actual name.
 *
 * @param pdfBuffer Raw bytes of the contract PDF.
 * @returns The analysis result.
 */
async function analyzeContract(pdfBuffer: Buffer): Promise<ContractAnalysis> {
  const { text, images, tables: rawTables } = await extractStructuredPDF(pdfBuffer);

  const parties = extractParties(text);
  const dates = extractDates(text);
  const amounts = extractMonetaryAmounts(text);
  const clauses = categorizeClausses(text);
  const signatures = detectSignatures(images);
  const tables = rawTables.map(analyzeTable);

  return { parties, dates, amounts, clauses, signatures, tables };
}
817
+ ```
818
+
819
+ ## Best Practices
820
+
821
+ ### Do's
822
+
823
+ - **Stream large files** - Use streaming for files > 10MB to prevent memory issues
824
+ - **Validate inputs** - Check file types and sizes before processing
825
+ - **Handle encoding** - Support UTF-8 and detect encoding issues gracefully
826
+ - **Preserve formatting** - Maintain original formatting when transforming
827
+ - **Cache parsed results** - Cache extracted data for repeated access
828
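+ - **Use appropriate libraries** - pdf-lib for creating and modifying PDFs (it does not extract text, so pair it with a text extractor such as pdf.js-extract), exceljs for Excel, docx for Word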
829
+
830
+ ### Don'ts
831
+
832
+ - Don't load entire large files into memory
833
+ - Don't assume file extensions match content
834
+ - Don't silently skip password-protected documents - detect them and report a clear error
835
+ - Don't strip metadata without user consent
836
+ - Don't process untrusted files without sandboxing
837
+ - Don't skip error handling for corrupt files
838
+
839
+ ### Error Handling
840
+
841
+ ```typescript
842
/**
 * Error raised when a document cannot be processed.
 *
 * Carries a machine-readable code and, when known, the file concerned and
 * the underlying error.
 */
class DocumentProcessingError extends Error {
  /**
   * @param message Human-readable description of the failure.
   * @param code Category of the failure.
   * @param file Name of the file being processed, if known.
   * @param cause Underlying error, if any.
   */
  constructor(
    message: string,
    public readonly code: ErrorCode,
    public readonly file?: string,
    public readonly cause?: Error
  ) {
    super(message);

    // Report the specific class name instead of the generic "Error".
    this.name = 'DocumentProcessingError';
  }
}
853
+
854
/** Machine-readable failure categories carried by `DocumentProcessingError`. */
enum ErrorCode {
  /** The file type is not one of the supported formats. */
  INVALID_FORMAT = 'INVALID_FORMAT',
  /** Processing failed unexpectedly; the file is presumed corrupt. */
  CORRUPT_FILE = 'CORRUPT_FILE',
  /** The document is password protected. */
  PASSWORD_PROTECTED = 'PASSWORD_PROTECTED',
  /** The document's text encoding could not be handled. */
  ENCODING_ERROR = 'ENCODING_ERROR',
  /** The file is larger than the allowed maximum. */
  SIZE_LIMIT_EXCEEDED = 'SIZE_LIMIT_EXCEEDED',
  /** The document uses a feature that is not supported. */
  UNSUPPORTED_FEATURE = 'UNSUPPORTED_FEATURE',
}
862
+
863
+ // Comprehensive error handling
864
/**
 * Validates and processes a document, reporting every failure as a
 * `DocumentProcessingError`.
 *
 * The file type is detected from the content and checked against the
 * supported types, then the size is checked against the maximum, and only
 * then is the document processed.
 *
 * @param buffer Raw bytes of the document.
 * @param filename Name of the file, attached to any error raised.
 * @returns The processing result.
 * @throws DocumentProcessingError with code INVALID_FORMAT for unsupported
 *   types, SIZE_LIMIT_EXCEEDED for oversized files, and CORRUPT_FILE for any
 *   other failure (the original error is attached as `cause`).
 */
async function safeProcessDocument(buffer: Buffer, filename: string): Promise<ProcessResult> {
  try {
    // Validate file
    const fileType = await detectFileType(buffer);
    if (!SUPPORTED_TYPES.includes(fileType)) {
      throw new DocumentProcessingError(
        `Unsupported file type: ${fileType}`,
        ErrorCode.INVALID_FORMAT,
        filename
      );
    }

    // Check size
    if (buffer.length > MAX_FILE_SIZE) {
      throw new DocumentProcessingError(
        `File exceeds maximum size of ${MAX_FILE_SIZE} bytes`,
        ErrorCode.SIZE_LIMIT_EXCEEDED,
        filename
      );
    }

    // Process
    return await processDocument(buffer, fileType);
  } catch (error) {
    // Errors raised by the validation above already carry the right code.
    if (error instanceof DocumentProcessingError) {
      throw error;
    }

    // Anything can be thrown; normalise non-Error values so the message is
    // meaningful and `cause` is always a real Error.
    const cause = error instanceof Error ? error : new Error(String(error));

    // Wrap unexpected errors
    throw new DocumentProcessingError(
      `Failed to process document: ${cause.message}`,
      ErrorCode.CORRUPT_FILE,
      filename,
      cause
    );
  }
}
901
+ ```
902
+
903
+ ## Related Skills
904
+
905
+ - **python** - Alternative processing with python-docx, openpyxl, PyPDF2
906
+ - **typescript** - Type-safe document handling
907
+ - **data-processing** - Data transformation utilities
908
+ - **api-architecture** - Document API design patterns
909
+
910
+ ## Reference Resources
911
+
912
+ - [pdf-lib Documentation](https://pdf-lib.js.org/)
913
+ - [ExcelJS Documentation](https://github.com/exceljs/exceljs)
914
+ - [docx Documentation](https://docx.js.org/)
915
+ - [PptxGenJS Documentation](https://gitbrent.github.io/PptxGenJS/)
916
+ - [Apache POI](https://poi.apache.org/) - Java reference