@cdklabs/cdk-appmod-catalog-blueprints 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/.jsii +8644 -0
  2. package/LICENSE +202 -0
  3. package/README.md +212 -0
  4. package/lib/document-processing/agentic-document-processing.d.ts +16 -0
  5. package/lib/document-processing/agentic-document-processing.js +90 -0
  6. package/lib/document-processing/base-document-processing.d.ts +189 -0
  7. package/lib/document-processing/base-document-processing.js +509 -0
  8. package/lib/document-processing/bedrock-document-processing.d.ts +167 -0
  9. package/lib/document-processing/bedrock-document-processing.js +297 -0
  10. package/lib/document-processing/index.d.ts +3 -0
  11. package/lib/document-processing/index.js +20 -0
  12. package/lib/document-processing/resources/default-bedrock-invoke/index.py +63 -0
  13. package/lib/document-processing/resources/default-bedrock-invoke/requirements.txt +4 -0
  14. package/lib/document-processing/resources/default-doc-retrieval-lambda/index.mjs +92 -0
  15. package/lib/document-processing/resources/default-doc-retrieval-lambda/package.json +10 -0
  16. package/lib/document-processing/resources/default-error-handler/index.js +46 -0
  17. package/lib/document-processing/resources/default-error-handler/package.json +4 -0
  18. package/lib/document-processing/resources/default-image-processor/classifier.mjs +665 -0
  19. package/lib/document-processing/resources/default-image-processor/extractors.mjs +465 -0
  20. package/lib/document-processing/resources/default-image-processor/index.mjs +143 -0
  21. package/lib/document-processing/resources/default-image-processor/package-lock.json +12 -0
  22. package/lib/document-processing/resources/default-image-processor/package.json +4 -0
  23. package/lib/document-processing/resources/default-image-validator/index.mjs +76 -0
  24. package/lib/document-processing/resources/default-image-validator/package-lock.json +154 -0
  25. package/lib/document-processing/resources/default-image-validator/package.json +7 -0
  26. package/lib/document-processing/resources/default-pdf-processor/index.js +46 -0
  27. package/lib/document-processing/resources/default-pdf-validator/index.js +36 -0
  28. package/lib/document-processing/resources/default-sqs-consumer/index.py +111 -0
  29. package/lib/document-processing/resources/default-sqs-consumer/requirements.txt +4 -0
  30. package/lib/document-processing/resources/default-sqs-consumer/sample_payload.json +20 -0
  31. package/lib/document-processing/resources/default-sqs-consumer/sample_payload_multi.json +24 -0
  32. package/lib/document-processing/resources/default-strands-agent/index.py +111 -0
  33. package/lib/document-processing/resources/default-strands-agent/requirements.txt +6 -0
  34. package/lib/document-processing/tests/agentic-document-processing-nag.test.d.ts +1 -0
  35. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +107 -0
  36. package/lib/document-processing/tests/agentic-document-processing.test.d.ts +1 -0
  37. package/lib/document-processing/tests/agentic-document-processing.test.js +125 -0
  38. package/lib/document-processing/tests/bedrock-document-processing-nag.test.d.ts +1 -0
  39. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +101 -0
  40. package/lib/document-processing/tests/bedrock-document-processing.test.d.ts +1 -0
  41. package/lib/document-processing/tests/bedrock-document-processing.test.js +79 -0
  42. package/lib/framework/custom-resource/default-runtimes.d.ts +21 -0
  43. package/lib/framework/custom-resource/default-runtimes.js +34 -0
  44. package/lib/framework/custom-resource/index.d.ts +1 -0
  45. package/lib/framework/custom-resource/index.js +18 -0
  46. package/lib/framework/foundation/access-log.d.ts +69 -0
  47. package/lib/framework/foundation/access-log.js +121 -0
  48. package/lib/framework/foundation/eventbridge-broker.d.ts +18 -0
  49. package/lib/framework/foundation/eventbridge-broker.js +42 -0
  50. package/lib/framework/foundation/index.d.ts +3 -0
  51. package/lib/framework/foundation/index.js +20 -0
  52. package/lib/framework/foundation/network.d.ts +19 -0
  53. package/lib/framework/foundation/network.js +83 -0
  54. package/lib/framework/index.d.ts +2 -0
  55. package/lib/framework/index.js +19 -0
  56. package/lib/framework/quickstart/base-quickstart.d.ts +30 -0
  57. package/lib/framework/quickstart/base-quickstart.js +30 -0
  58. package/lib/index.d.ts +4 -0
  59. package/lib/index.js +21 -0
  60. package/lib/tsconfig.tsbuildinfo +1 -0
  61. package/lib/utilities/cdk-nag-config.d.ts +42 -0
  62. package/lib/utilities/cdk-nag-config.js +194 -0
  63. package/lib/utilities/data-loader-lambda/index.py +282 -0
  64. package/lib/utilities/data-loader-lambda/requirements.txt +3 -0
  65. package/lib/utilities/data-loader.d.ts +173 -0
  66. package/lib/utilities/data-loader.js +447 -0
  67. package/lib/utilities/index.d.ts +3 -0
  68. package/lib/utilities/index.js +20 -0
  69. package/lib/utilities/lambda-iam-utils.d.ts +145 -0
  70. package/lib/utilities/lambda-iam-utils.js +235 -0
  71. package/lib/utilities/lambda_layers/data-masking/layer-construct.d.ts +42 -0
  72. package/lib/utilities/lambda_layers/data-masking/layer-construct.js +53 -0
  73. package/lib/utilities/lambda_layers/data-masking/layer-construct.ts +88 -0
  74. package/lib/utilities/observability/bedrock-observability.d.ts +18 -0
  75. package/lib/utilities/observability/bedrock-observability.js +131 -0
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.d.ts +6 -0
  77. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +22 -0
  78. package/lib/utilities/observability/index.d.ts +6 -0
  79. package/lib/utilities/observability/index.js +25 -0
  80. package/lib/utilities/observability/lambda-observability-property-injector.d.ts +8 -0
  81. package/lib/utilities/observability/lambda-observability-property-injector.js +43 -0
  82. package/lib/utilities/observability/log-group-data-protection-props.d.ts +19 -0
  83. package/lib/utilities/observability/log-group-data-protection-props.js +5 -0
  84. package/lib/utilities/observability/observability.d.ts +83 -0
  85. package/lib/utilities/observability/observability.js +278 -0
  86. package/lib/utilities/observability/observable.d.ts +32 -0
  87. package/lib/utilities/observability/observable.js +3 -0
  88. package/lib/utilities/observability/powertools-config.d.ts +3 -0
  89. package/lib/utilities/observability/powertools-config.js +25 -0
  90. package/lib/utilities/observability/resources/bedrock-manage-logging-configuration/index.py +27 -0
  91. package/lib/utilities/observability/state-machine-observability-property-injector.d.ts +8 -0
  92. package/lib/utilities/observability/state-machine-observability-property-injector.js +49 -0
  93. package/lib/utilities/tests/data-loader-nag.test.d.ts +1 -0
  94. package/lib/utilities/tests/data-loader-nag.test.js +432 -0
  95. package/lib/utilities/tests/data-loader.test.d.ts +1 -0
  96. package/lib/utilities/tests/data-loader.test.js +284 -0
  97. package/lib/webapp/frontend-construct.d.ts +136 -0
  98. package/lib/webapp/frontend-construct.js +253 -0
  99. package/lib/webapp/index.d.ts +1 -0
  100. package/lib/webapp/index.js +18 -0
  101. package/lib/webapp/tests/frontend-construct-nag.test.d.ts +1 -0
  102. package/lib/webapp/tests/frontend-construct-nag.test.js +266 -0
  103. package/lib/webapp/tests/frontend-construct.test.d.ts +1 -0
  104. package/lib/webapp/tests/frontend-construct.test.js +385 -0
  105. package/package.json +183 -0
@@ -0,0 +1,665 @@
1
+ import { TextractClient, DetectDocumentTextCommand } from '@aws-sdk/client-textract';
2
+ import { BedrockRuntimeClient, InvokeModelCommand } from '@aws-sdk/client-bedrock-runtime';
3
+ import { readFile } from 'fs/promises';
4
+
5
/**
 * Rule-based document classifier for Amazon Textract output.
 *
 * Scores the text of a DetectDocumentText response against keyword lists,
 * identifier regular expressions and simple layout heuristics, then maps the
 * best-scoring document type to the Textract API that should process it.
 */
class TextractDocumentClassifier {
  constructor() {
    // Canonical document type identifiers; also used as keys of the score map.
    this.documentTypes = {
      IDENTITY_DOCUMENT: 'IDENTITY_DOCUMENT',
      INVOICE: 'INVOICE',
      RECEIPT: 'RECEIPT',
      TAX_FORM: 'TAX_FORM',
      BANK_STATEMENT: 'BANK_STATEMENT',
      PAYSLIP: 'PAYSLIP',
      UTILITY_BILL: 'UTILITY_BILL',
      MORTGAGE_DOCUMENT: 'MORTGAGE_DOCUMENT',
      GENERIC_DOCUMENT: 'GENERIC_DOCUMENT',
    };

    // Keywords that indicate document types. Matching is a case-insensitive
    // substring search, so short keywords also match inside longer words.
    this.documentKeywords = {
      [this.documentTypes.IDENTITY_DOCUMENT]: [
        'identity card', 'id card', 'passport', 'driver license', 'driver\'s license',
        'identification', 'national id', 'identity number', 'id no', 'nric'
      ],
      [this.documentTypes.INVOICE]: [
        'invoice', 'bill to', 'invoice number', 'invoice no', 'invoice date',
        'payment terms', 'due date', 'total due', 'subtotal', 'tax amount'
      ],
      [this.documentTypes.RECEIPT]: [
        'receipt', 'total', 'cash', 'change', 'payment', 'item', 'qty', 'quantity',
        'thank you for your purchase', 'merchant', 'transaction'
      ],
      [this.documentTypes.TAX_FORM]: [
        'tax', 'income tax', 'tax return', 'tax year', 'irs', 'w-2', 'w-4',
        '1040', '1099', 'tax form'
      ],
      [this.documentTypes.BANK_STATEMENT]: [
        'bank statement', 'account number', 'account summary', 'balance',
        'withdrawal', 'deposit', 'transaction history', 'beginning balance',
        'ending balance', 'available balance'
      ],
      [this.documentTypes.PAYSLIP]: [
        'pay slip', 'payslip', 'salary', 'wage', 'earnings', 'deductions',
        'net pay', 'gross pay', 'pay period', 'employee id', 'year to date'
      ],
      [this.documentTypes.UTILITY_BILL]: [
        'utility', 'electricity', 'water', 'gas', 'internet', 'phone bill',
        'service address', 'meter reading', 'usage', 'kilowatt', 'therms'
      ],
      [this.documentTypes.MORTGAGE_DOCUMENT]: [
        'mortgage', 'loan', 'property', 'interest rate', 'principal', 'amortization',
        'escrow', 'lender', 'borrower', 'closing date', 'loan number'
      ]
    };

    // Regular expressions for identifiers from various countries.
    // NOTE(review): none of these are anchored with word boundaries, so several
    // (e.g. germanID, euPassport) match inside arbitrary alphanumeric runs.
    this.idPatterns = {
      // Singapore
      singaporeNRIC: /[STFG]\d{7}[A-Z]/i,

      // United States
      usSSN: /\d{3}-\d{2}-\d{4}/,
      usDL: /([A-Z]{1,2}[-\s]?)?\d{2,3}[-\s]?\d{2,3}[-\s]?\d{3,4}/i,

      // United Kingdom
      ukNI: /[A-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-Z]/i, // National Insurance number
      ukDL: /[A-Z9]{5}\d{6}[A-Z9]{2}\d[A-Z]{2}/i, // Driver's License

      // European Union
      euPassport: /[A-Z]{1,2}\d{6,9}/i, // Generic EU passport format
      euID: /ID\s*\d{6,10}/i, // Generic EU ID card format

      // Germany
      germanID: /[0-9A-Z]{9}/i, // Personalausweis

      // France
      frenchID: /\d{12}/i, // National ID number

      // Spain
      spanishDNI: /\d{8}[A-Z]/i, // DNI (Documento Nacional de Identidad)
      spanishNIE: /[XYZ]\d{7}[A-Z]/i, // NIE (Número de Identidad de Extranjero)

      // Italy
      italianFC: /[A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z]/i, // Fiscal Code

      // China
      chineseID: /\d{17}[\dX]/i, // Resident Identity Card

      // India
      indianAadhaar: /\d{4}\s?\d{4}\s?\d{4}/i, // Aadhaar number
      indianPAN: /[A-Z]{5}\d{4}[A-Z]/i, // PAN (Permanent Account Number)

      // Japan
      japaneseMyNumber: /\d{4}\s?\d{4}\s?\d{4}/i, // My Number

      // South Korea
      koreanRRN: /\d{6}[-\s]?\d{7}/i, // Resident Registration Number

      // Australia
      australianTFN: /\d{3}\s?\d{3}\s?\d{3}/i, // Tax File Number

      // Canada
      canadianSIN: /\d{3}[-\s]?\d{3}[-\s]?\d{3}/i, // Social Insurance Number

      // Brazil
      brazilianCPF: /\d{3}\.?\d{3}\.?\d{3}-?\d{2}/i, // CPF (Cadastro de Pessoas Físicas)

      // Generic labelled patterns
      passport: /passport\s+no[.:]\s*[A-Z0-9]+/i,
      driverLicense: /driver'?s?\s+licen[sc]e\s+no[.:]\s*[A-Z0-9]+/i,
      nationalID: /national\s+id\s+no[.:]\s*[A-Z0-9]+/i,
      identityCard: /identity\s+card\s+no[.:]\s*[A-Z0-9]+/i
    };

    // Layout heuristics evaluated against the document statistics.
    this.layoutPatterns = {
      // Landscape card shape: width/height ratio between 1.5 and 2.1.
      idCardLayout: (stats) => {
        if (stats.width && stats.height) {
          const ratio = stats.width / stats.height;
          return (ratio >= 1.5 && ratio <= 2.1);
        }
        return false;
      },
      // Invoices, receipts and statements often contain tables.
      tableDocument: (stats) => {
        return stats.table_count > 0;
      }
    };
  }

  /**
   * Classify a document based on Textract DetectDocumentText results.
   * A missing (null/undefined) result is treated as an empty document.
   * @param {Object} detectTextResult - Result from Textract DetectDocumentText API
   * @returns {Object} Classification result with documentType, confidence, scores and recommendedApi
   */
  classifyFromDetectText(detectTextResult) {
    const allText = this._extractTextFromDetectTextResult(detectTextResult);
    const dimensions = this._extractDocumentDimensions(detectTextResult);
    const tableCount = this._countTablesFromLines(detectTextResult);

    return this._classifyText(allText, {
      width: dimensions.width,
      height: dimensions.height,
      table_count: tableCount
    });
  }

  /**
   * Classify document based on extracted text and statistics.
   * @param {string} text - All text extracted from the document; null/undefined is treated as ''
   * @param {Object} stats - Document statistics (width, height, table_count); null is treated as {}
   * @returns {Object} Classification result: { documentType, confidence (0-100), scores, recommendedApi }
   */
  _classifyText(text, stats = {}) {
    // Tolerate missing input instead of throwing on `.toLowerCase()` / `stats.width`.
    const sourceText = typeof text === 'string' ? text : String(text ?? '');
    const safeStats = stats ?? {};
    const normalizedText = sourceText.toLowerCase();
    const scores = {};

    // Every document type starts at zero so the score map is always complete.
    for (const type of Object.values(this.documentTypes)) {
      scores[type] = 0;
    }

    // Keyword matches: +2 per keyword present.
    for (const [docType, keywords] of Object.entries(this.documentKeywords)) {
      for (const keyword of keywords) {
        if (normalizedText.includes(keyword.toLowerCase())) {
          scores[docType] += 2;
        }
      }
    }

    // Identifier patterns: +10 for the pattern names selected below, +8 otherwise.
    // NOTE(review): the name check is case-sensitive, so `euPassport` (capital P)
    // and `usDL`/`ukDL` receive the weaker weight - confirm that is intended.
    for (const [patternName, pattern] of Object.entries(this.idPatterns)) {
      if (!pattern.test(sourceText)) {
        continue;
      }
      const isStrongPattern = patternName.includes('passport') || patternName.includes('ID') ||
        patternName.includes('License') || patternName.includes('NRIC');
      scores[this.documentTypes.IDENTITY_DOCUMENT] += isStrongPattern ? 10 : 8;
    }

    // Layout heuristics.
    if (this.layoutPatterns.idCardLayout(safeStats)) {
      scores[this.documentTypes.IDENTITY_DOCUMENT] += 5;
    }
    if (this.layoutPatterns.tableDocument(safeStats)) {
      scores[this.documentTypes.INVOICE] += 3;
      scores[this.documentTypes.RECEIPT] += 3;
      scores[this.documentTypes.BANK_STATEMENT] += 3;
    }

    // Specific phrases that strongly indicate a document type.
    if (/republic of singapore identity card/i.test(sourceText)) {
      scores[this.documentTypes.IDENTITY_DOCUMENT] += 15;
    }
    if (/invoice\s+#|invoice\s+number|invoice\s+no/i.test(sourceText)) {
      scores[this.documentTypes.INVOICE] += 10;
    }
    if (/receipt\s+#|receipt\s+number|receipt\s+no/i.test(sourceText)) {
      scores[this.documentTypes.RECEIPT] += 10;
    }

    // Highest score wins; ties keep the earlier type, and an all-zero score
    // map falls back to GENERIC_DOCUMENT.
    let maxScore = 0;
    let classifiedType = this.documentTypes.GENERIC_DOCUMENT;
    for (const [type, score] of Object.entries(scores)) {
      if (score > maxScore) {
        maxScore = score;
        classifiedType = type;
      }
    }

    // Confidence is the winning score scaled by 5 and clamped to 0-100.
    const confidence = Math.min(100, Math.max(0, maxScore * 5));

    return {
      documentType: classifiedType,
      confidence: confidence,
      scores: scores,
      recommendedApi: this._getRecommendedApi(classifiedType)
    };
  }

  /**
   * Get the recommended Textract API based on document type.
   * @param {string} documentType - Classified document type
   * @returns {string} 'AnalyzeID', 'AnalyzeExpense', 'AnalyzeLending' or 'AnalyzeDocument'
   */
  _getRecommendedApi(documentType) {
    switch (documentType) {
      case this.documentTypes.IDENTITY_DOCUMENT:
        return 'AnalyzeID';
      case this.documentTypes.INVOICE:
      case this.documentTypes.RECEIPT:
        return 'AnalyzeExpense';
      case this.documentTypes.MORTGAGE_DOCUMENT:
      case this.documentTypes.BANK_STATEMENT:
      case this.documentTypes.TAX_FORM:
        return 'AnalyzeLending';
      default:
        return 'AnalyzeDocument';
    }
  }

  /**
   * Extract text from a Textract DetectDocumentText result.
   * Concatenates the Text of every LINE block, each preceded by a single space.
   * @param {Object} result - Textract DetectDocumentText result
   * @returns {string} Extracted text ('' when there are no LINE blocks)
   */
  _extractTextFromDetectTextResult(result) {
    const blocks = Array.isArray(result?.Blocks) ? result.Blocks : [];
    let text = '';
    for (const block of blocks) {
      if (block?.BlockType === 'LINE' && block.Text) {
        text += ' ' + block.Text;
      }
    }
    return text;
  }

  /**
   * Extract document dimensions from a Textract result.
   * Reads Width/Height from the first entry of DocumentMetadata.Pages.
   * NOTE(review): Textract documents DocumentMetadata.Pages as an integer page
   * count; if that is what arrives here, this always yields 0x0 and the
   * ID-card layout heuristic never fires - confirm against a real response.
   * @param {Object} result - Textract result
   * @returns {Object} Document dimensions { width, height }, 0 when unavailable
   */
  _extractDocumentDimensions(result) {
    const firstPage = result?.DocumentMetadata?.Pages?.[0];
    return {
      width: firstPage?.Width || 0,
      height: firstPage?.Height || 0
    };
  }

  /**
   * Estimate whether the document contains a table from LINE block geometry.
   * A LINE block counts as "horizontal" when its bounding box is more than five
   * times wider than tall, and "vertical" when more than five times taller than
   * wide. A table is reported when there are more than three of each.
   * @param {Object} result - Textract DetectDocumentText result
   * @returns {number} Estimated table count (0 or 1)
   */
  _countTablesFromLines(result) {
    const blocks = Array.isArray(result?.Blocks) ? result.Blocks : [];
    let horizontalLines = 0;
    let verticalLines = 0;

    for (const block of blocks) {
      const box = block?.BlockType === 'LINE' ? block.Geometry?.BoundingBox : undefined;
      if (!box) {
        continue;
      }
      if (box.Width > box.Height * 5) {
        horizontalLines++;
      } else if (box.Height > box.Width * 5) {
        verticalLines++;
      }
    }

    return Math.min(horizontalLines, verticalLines) > 3 ? 1 : 0;
  }
}
319
+
320
/**
 * Classify document using Amazon Bedrock GenAI.
 *
 * Sends the (truncated) document text to an Anthropic model through
 * InvokeModel and parses the JSON classification out of the reply.
 *
 * @param {string} text - Extracted text from document
 * @param {Object} genaiConfig - GenAI configuration (all fields optional)
 * @param {string} [genaiConfig.region] - Bedrock region; defaults to AWS_REGION, then 'us-east-1'
 * @param {number} [genaiConfig.maxTokens] - Maximum number of CHARACTERS of `text`
 *   placed in the prompt (default 50000); despite the name it is not a token count
 * @param {string} [genaiConfig.model] - Bedrock model id
 * @returns {Promise<Object>} { documentType, confidence (0-100), reasoning, recommendedApi, method: 'genai', modelUsed }
 * @throws {Error} 'GenAI classification failed: ...' with the underlying error as `cause`
 */
async function classifyWithGenAI(text, genaiConfig) {
  const config = genaiConfig ?? {};
  const bedrock = new BedrockRuntimeClient({
    region: config.region || process.env.AWS_REGION || 'us-east-1'
  });

  // Truncate by character count to keep the prompt bounded.
  const maxTokens = config.maxTokens || 50000;
  const truncatedText = String(text ?? '').substring(0, maxTokens);

  const prompt = `You are a document classification expert. Analyze the following document text and classify it.

IMPORTANT: Respond with ONLY valid JSON. Do not include any explanatory text before or after the JSON.

Required JSON format:
{
"documentType": "one of: IDENTITY_DOCUMENT, INVOICE, RECEIPT, TAX_FORM, BANK_STATEMENT, PAYSLIP, UTILITY_BILL, MORTGAGE_DOCUMENT, GENERIC_DOCUMENT",
"confidence": number between 0-100,
"reasoning": "brief explanation of classification decision",
"recommendedApi": "one of: AnalyzeID, AnalyzeExpense, AnalyzeLending, AnalyzeDocument"
}

Document text to analyze:
${truncatedText}`;

  const modelId = config.model || 'anthropic.claude-3-5-sonnet-20241022-v2:0';

  try {
    const command = new InvokeModelCommand({
      modelId: modelId,
      body: JSON.stringify({
        anthropic_version: "bedrock-2023-05-31",
        max_tokens: 1000,
        temperature: 0.1,
        messages: [{
          role: "user",
          content: prompt
        }]
      })
    });

    const response = await bedrock.send(command);
    const responseBody = JSON.parse(new TextDecoder().decode(response.body));

    // The model's text reply is the first content item.
    const rawText = responseBody.content[0].text;
    // NOTE(review): the reply may echo document content (possibly PII) into logs.
    console.log('Raw GenAI response:', rawText);

    // The model sometimes wraps the JSON in extra prose: parse the whole reply
    // first, then fall back to the outermost {...} span.
    let result;
    try {
      result = JSON.parse(rawText);
    } catch (error) {
      const jsonMatch = rawText.match(/\{[\s\S]*\}/);
      if (jsonMatch) {
        result = JSON.parse(jsonMatch[0]);
      } else {
        throw new Error(`GenAI response is not valid JSON: ${rawText}`);
      }
    }

    // A confidence of 0 is a legitimate answer, so test for presence and
    // numeric validity rather than truthiness.
    const rawConfidence = result?.confidence;
    const hasConfidence =
      (typeof rawConfidence === 'number' ||
        (typeof rawConfidence === 'string' && rawConfidence.trim() !== '')) &&
      Number.isFinite(Number(rawConfidence));

    if (!result?.documentType || !hasConfidence || !result.recommendedApi) {
      throw new Error(`GenAI returned incomplete classification result: ${JSON.stringify(result)}`);
    }

    const confidence = Math.max(0, Math.min(100, Number(rawConfidence)));

    return {
      documentType: result.documentType,
      confidence: confidence,
      reasoning: result.reasoning || 'GenAI classification',
      recommendedApi: result.recommendedApi,
      method: 'genai',
      modelUsed: modelId
    };

  } catch (error) {
    console.error('GenAI classification error:', error);
    // Keep the original error reachable for callers and log tooling.
    throw new Error(`GenAI classification failed: ${error.message}`, { cause: error });
  }
}
408
+
409
/**
 * Run the keyword/pattern classifier over a Textract result.
 * @param {Object} detectResult - Textract DetectDocumentText result
 * @returns {Object} The classifier's result with `method` set to 'rule-based'
 */
function classifyWithRules(detectResult) {
  const ruleBasedResult = new TextractDocumentClassifier().classifyFromDetectText(detectResult);
  return Object.assign({}, ruleBasedResult, { method: 'rule-based' });
}
422
+
423
/**
 * Helper function to extract text from a DetectDocumentText result.
 * Concatenates the Text of every LINE block, each preceded by a single space
 * (so a non-empty result starts with a space). A missing result, or one
 * without a Blocks array, yields an empty string instead of throwing.
 * @param {Object} result - Textract DetectDocumentText result
 * @returns {string} Extracted text
 */
function extractTextFromDetectText(result) {
  const blocks = Array.isArray(result?.Blocks) ? result.Blocks : [];
  let text = '';
  for (const block of blocks) {
    if (block?.BlockType === 'LINE' && block.Text) {
      text += ' ' + block.Text;
    }
  }
  return text;
}
439
+
440
/**
 * Process a document using Textract and classification.
 *
 * Runs DetectDocumentText on the S3 object, extracts the line text, then
 * classifies it: with GenAI when `options.genaiConfig.enabled` is truthy
 * (falling back to the rule-based classifier if GenAI throws), otherwise with
 * the rule-based classifier directly.
 *
 * @param {string} bucketName - S3 bucket name
 * @param {string} objectKey - S3 object key
 * @param {Object} options - Processing options (null/undefined treated as {})
 * @param {string} [options.region] - Textract region; defaults to AWS_REGION, then 'us-east-1'
 * @param {Object} [options.genaiConfig] - Passed to classifyWithGenAI when `enabled`
 * @returns {Promise<Object>} Processing result; `extractedText` holds the first
 *   1000 characters and `extractedEntities` the full extracted text
 * @throws {Error} 'Document processing failed: ...' with the underlying error as `cause`
 */
async function processDocument(bucketName, objectKey, options = {}) {
  // NOTE(review): callers pass `processWithRecommendedApi`, but nothing in this
  // function reads it - only DetectDocumentText is ever invoked.
  const settings = options ?? {};
  const textract = new TextractClient({
    region: settings.region || process.env.AWS_REGION || 'us-east-1'
  });

  try {
    const detectCommand = new DetectDocumentTextCommand({
      Document: {
        S3Object: {
          Bucket: bucketName,
          Name: objectKey
        }
      }
    });

    const detectResult = await textract.send(detectCommand);
    const extractedText = extractTextFromDetectText(detectResult);

    let classificationResult;
    const genaiConfig = settings.genaiConfig;

    if (genaiConfig?.enabled) {
      try {
        classificationResult = await classifyWithGenAI(extractedText, genaiConfig);
      } catch (genaiError) {
        // Deliberate best-effort: a GenAI failure degrades to rules, not an error.
        console.warn('GenAI classification failed, falling back to rules:', genaiError.message);
        classificationResult = classifyWithRules(detectResult);
      }
    } else {
      classificationResult = classifyWithRules(detectResult);
    }

    return {
      success: true,
      bucketName,
      objectKey,
      extractedText: extractedText.substring(0, 1000),
      extractedEntities: extractedText, // Full extracted text as entities
      classification: classificationResult,
      classificationMethod: classificationResult.method,
      genaiUsed: classificationResult.method === 'genai',
      apiUsed: 'DetectDocumentText', // The Textract API that was used
      textractResult: {
        blockCount: detectResult.Blocks?.length || 0,
        documentMetadata: detectResult.DocumentMetadata
      }
    };

  } catch (error) {
    console.error('Document processing error:', error);
    // Preserve the original error (type, code, stack) for callers.
    throw new Error(`Document processing failed: ${error.message}`, { cause: error });
  }
}
499
+
500
/**
 * Process Bedrock output and extract structured data.
 *
 * Groups `elements` by lower-cased category, averages the element confidences
 * that are present, and infers a document type from which categories occur.
 *
 * @param {Object} bedrockOutput - Output from Bedrock document analysis
 *   ({ metadata?, elements?: Array<{ category, text, confidence?, boundingBox? }> })
 * @returns {Object} On success: { success: true, documentType, extractedData,
 *   metadata, confidence }. `documentType` stays 'UNKNOWN' when `elements` is
 *   not an array. On an unexpected error: { success: false, error, documentType:
 *   'UNKNOWN', extractedData: {}, confidence: 0 }.
 */
function processBedrockOutput(bedrockOutput) {
  // Ordered rules: the first document type with any marker category present wins.
  const documentTypeRules = [
    ['INVOICE', ['invoice_number', 'total_amount']],
    ['RECEIPT', ['receipt_total', 'merchant_name']],
    ['BANK_STATEMENT', ['account_number', 'statement_date']],
    ['TAX_FORM', ['tax_year', 'tax_form']],
    ['PAYSLIP', ['pay_period', 'gross_pay']],
    ['UTILITY_BILL', ['utility_type', 'service_address']],
    ['MORTGAGE_DOCUMENT', ['loan_amount', 'property_address']],
    ['IDENTITY_DOCUMENT', ['document_number', 'issue_date']],
  ];

  try {
    const result = {
      success: true,
      documentType: 'UNKNOWN',
      extractedData: {},
      metadata: bedrockOutput.metadata || {},
      confidence: 0
    };

    if (!Array.isArray(bedrockOutput.elements)) {
      return result;
    }

    // A Map keeps categories such as "constructor" or "__proto__" from
    // colliding with inherited Object.prototype members while grouping.
    const byCategory = new Map();
    let totalConfidence = 0;
    let confidenceCount = 0;

    for (const element of bedrockOutput.elements) {
      // Skip entries that cannot be grouped rather than failing the whole document.
      if (!element || typeof element.category !== 'string' || !element.category || !element.text) {
        continue;
      }

      const category = element.category.toLowerCase();
      if (!byCategory.has(category)) {
        byCategory.set(category, []);
      }

      byCategory.get(category).push({
        text: element.text,
        confidence: element.confidence || 0,
        boundingBox: element.boundingBox || null
      });

      // Only elements that report a confidence take part in the average.
      if (element.confidence) {
        totalConfidence += element.confidence;
        confidenceCount++;
      }
    }

    result.extractedData = Object.fromEntries(byCategory);
    result.confidence = confidenceCount > 0 ? Math.round(totalConfidence / confidenceCount) : 0;

    const matchedRule = documentTypeRules.find(
      ([, markers]) => markers.some((marker) => byCategory.has(marker))
    );
    result.documentType = matchedRule ? matchedRule[0] : 'GENERIC_DOCUMENT';

    return result;

  } catch (error) {
    console.error('Error processing Bedrock output:', error);
    return {
      success: false,
      error: error.message,
      documentType: 'UNKNOWN',
      extractedData: {},
      confidence: 0
    };
  }
}
578
+
579
/**
 * AWS Lambda handler function.
 *
 * Accepts three event shapes:
 *  - `event.body` as a JSON string containing `metadata` and `elements`:
 *    handled by processBedrockOutput;
 *  - a direct invocation with `event.bucket` / `event.key` (used verbatim);
 *  - an S3 notification (`event.Records[0].s3`), whose object key is
 *    URL-decoded before use because S3 delivers it URL-encoded.
 *
 * @param {Object} event - Lambda event
 * @returns {Promise<Object>} Lambda response `{ statusCode, body }` where body
 *   is a JSON string; 400 when bucket or key is missing, 500 on failure
 */
export const handler = async (event) => {
  // S3 event notifications URL-encode object keys ("+" for space, %XX escapes).
  // If a key is not valid percent-encoding, fall back to the raw value.
  const decodeS3EventKey = (rawKey) => {
    if (typeof rawKey !== 'string') {
      return rawKey;
    }
    try {
      return decodeURIComponent(rawKey.replace(/\+/g, ' '));
    } catch (decodeError) {
      return rawKey;
    }
  };

  try {
    if (event.body && typeof event.body === 'string') {
      const body = JSON.parse(event.body);
      if (body.metadata && body.elements) {
        return {
          statusCode: 200,
          body: JSON.stringify(processBedrockOutput(body))
        };
      }
    }

    const s3Record = event.Records?.[0]?.s3;
    const bucket = event.bucket || s3Record?.bucket?.name;
    const key = event.key || decodeS3EventKey(s3Record?.object?.key);
    const region = event.region || process.env.AWS_REGION || 'us-east-1';
    // Defaults to true; only an explicit `false` disables it.
    const processWithRecommendedApi = event.processWithRecommendedApi !== false;

    if (!bucket || !key) {
      return {
        statusCode: 400,
        body: JSON.stringify({
          success: false,
          message: 'Missing bucket or key parameter'
        })
      };
    }

    try {
      const result = await processDocument(bucket, key, {
        region,
        processWithRecommendedApi,
        genaiConfig: event.genaiConfig
      });

      return {
        statusCode: 200,
        body: JSON.stringify({
          success: true,
          ...result
        })
      };
    } catch (processingError) {
      console.error('Document processing error:', processingError);
      return {
        statusCode: 500,
        body: JSON.stringify({
          success: false,
          error: processingError.message,
          message: 'Document processing failed'
        })
      };
    }
  } catch (error) {
    console.error('Lambda execution error:', error);
    return {
      statusCode: 500,
      body: JSON.stringify({
        success: false,
        error: error.message,
        message: 'Lambda execution failed'
      })
    };
  }
};
648
+
649
+ // Export classes and functions for use in other modules
650
+ export { TextractDocumentClassifier, processDocument, processBedrockOutput };
651
+
652
// For local testing when run directly (`node classifier.mjs`).
// `import.meta.url` is a percent-encoded file URL, so the script path is
// converted with pathToFileURL before comparing; plain string concatenation
// breaks for paths containing spaces/special characters and on Windows.
// `node:url` is loaded dynamically so no module-level import is required.
if (
  process.argv[1] &&
  import.meta.url === (await import('node:url')).pathToFileURL(process.argv[1]).href
) {
  try {
    console.log('Testing with Bedrock output...');
    // Expects a sample file named StandardOutputDocument.json next to this module.
    const bedrockOutput = JSON.parse(
      await readFile(new URL('./StandardOutputDocument.json', import.meta.url), 'utf8')
    );

    const result = processBedrockOutput(bedrockOutput);
    console.log('Result:', JSON.stringify(result, null, 2));
  } catch (error) {
    console.error('Error during local testing:', error);
  }
}