@cdklabs/cdk-appmod-catalog-blueprints 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +8644 -0
- package/LICENSE +202 -0
- package/README.md +212 -0
- package/lib/document-processing/agentic-document-processing.d.ts +16 -0
- package/lib/document-processing/agentic-document-processing.js +90 -0
- package/lib/document-processing/base-document-processing.d.ts +189 -0
- package/lib/document-processing/base-document-processing.js +509 -0
- package/lib/document-processing/bedrock-document-processing.d.ts +167 -0
- package/lib/document-processing/bedrock-document-processing.js +297 -0
- package/lib/document-processing/index.d.ts +3 -0
- package/lib/document-processing/index.js +20 -0
- package/lib/document-processing/resources/default-bedrock-invoke/index.py +63 -0
- package/lib/document-processing/resources/default-bedrock-invoke/requirements.txt +4 -0
- package/lib/document-processing/resources/default-doc-retrieval-lambda/index.mjs +92 -0
- package/lib/document-processing/resources/default-doc-retrieval-lambda/package.json +10 -0
- package/lib/document-processing/resources/default-error-handler/index.js +46 -0
- package/lib/document-processing/resources/default-error-handler/package.json +4 -0
- package/lib/document-processing/resources/default-image-processor/classifier.mjs +665 -0
- package/lib/document-processing/resources/default-image-processor/extractors.mjs +465 -0
- package/lib/document-processing/resources/default-image-processor/index.mjs +143 -0
- package/lib/document-processing/resources/default-image-processor/package-lock.json +12 -0
- package/lib/document-processing/resources/default-image-processor/package.json +4 -0
- package/lib/document-processing/resources/default-image-validator/index.mjs +76 -0
- package/lib/document-processing/resources/default-image-validator/package-lock.json +154 -0
- package/lib/document-processing/resources/default-image-validator/package.json +7 -0
- package/lib/document-processing/resources/default-pdf-processor/index.js +46 -0
- package/lib/document-processing/resources/default-pdf-validator/index.js +36 -0
- package/lib/document-processing/resources/default-sqs-consumer/index.py +111 -0
- package/lib/document-processing/resources/default-sqs-consumer/requirements.txt +4 -0
- package/lib/document-processing/resources/default-sqs-consumer/sample_payload.json +20 -0
- package/lib/document-processing/resources/default-sqs-consumer/sample_payload_multi.json +24 -0
- package/lib/document-processing/resources/default-strands-agent/index.py +111 -0
- package/lib/document-processing/resources/default-strands-agent/requirements.txt +6 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.js +107 -0
- package/lib/document-processing/tests/agentic-document-processing.test.d.ts +1 -0
- package/lib/document-processing/tests/agentic-document-processing.test.js +125 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +101 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.js +79 -0
- package/lib/framework/custom-resource/default-runtimes.d.ts +21 -0
- package/lib/framework/custom-resource/default-runtimes.js +34 -0
- package/lib/framework/custom-resource/index.d.ts +1 -0
- package/lib/framework/custom-resource/index.js +18 -0
- package/lib/framework/foundation/access-log.d.ts +69 -0
- package/lib/framework/foundation/access-log.js +121 -0
- package/lib/framework/foundation/eventbridge-broker.d.ts +18 -0
- package/lib/framework/foundation/eventbridge-broker.js +42 -0
- package/lib/framework/foundation/index.d.ts +3 -0
- package/lib/framework/foundation/index.js +20 -0
- package/lib/framework/foundation/network.d.ts +19 -0
- package/lib/framework/foundation/network.js +83 -0
- package/lib/framework/index.d.ts +2 -0
- package/lib/framework/index.js +19 -0
- package/lib/framework/quickstart/base-quickstart.d.ts +30 -0
- package/lib/framework/quickstart/base-quickstart.js +30 -0
- package/lib/index.d.ts +4 -0
- package/lib/index.js +21 -0
- package/lib/tsconfig.tsbuildinfo +1 -0
- package/lib/utilities/cdk-nag-config.d.ts +42 -0
- package/lib/utilities/cdk-nag-config.js +194 -0
- package/lib/utilities/data-loader-lambda/index.py +282 -0
- package/lib/utilities/data-loader-lambda/requirements.txt +3 -0
- package/lib/utilities/data-loader.d.ts +173 -0
- package/lib/utilities/data-loader.js +447 -0
- package/lib/utilities/index.d.ts +3 -0
- package/lib/utilities/index.js +20 -0
- package/lib/utilities/lambda-iam-utils.d.ts +145 -0
- package/lib/utilities/lambda-iam-utils.js +235 -0
- package/lib/utilities/lambda_layers/data-masking/layer-construct.d.ts +42 -0
- package/lib/utilities/lambda_layers/data-masking/layer-construct.js +53 -0
- package/lib/utilities/lambda_layers/data-masking/layer-construct.ts +88 -0
- package/lib/utilities/observability/bedrock-observability.d.ts +18 -0
- package/lib/utilities/observability/bedrock-observability.js +131 -0
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.d.ts +6 -0
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +22 -0
- package/lib/utilities/observability/index.d.ts +6 -0
- package/lib/utilities/observability/index.js +25 -0
- package/lib/utilities/observability/lambda-observability-property-injector.d.ts +8 -0
- package/lib/utilities/observability/lambda-observability-property-injector.js +43 -0
- package/lib/utilities/observability/log-group-data-protection-props.d.ts +19 -0
- package/lib/utilities/observability/log-group-data-protection-props.js +5 -0
- package/lib/utilities/observability/observability.d.ts +83 -0
- package/lib/utilities/observability/observability.js +278 -0
- package/lib/utilities/observability/observable.d.ts +32 -0
- package/lib/utilities/observability/observable.js +3 -0
- package/lib/utilities/observability/powertools-config.d.ts +3 -0
- package/lib/utilities/observability/powertools-config.js +25 -0
- package/lib/utilities/observability/resources/bedrock-manage-logging-configuration/index.py +27 -0
- package/lib/utilities/observability/state-machine-observability-property-injector.d.ts +8 -0
- package/lib/utilities/observability/state-machine-observability-property-injector.js +49 -0
- package/lib/utilities/tests/data-loader-nag.test.d.ts +1 -0
- package/lib/utilities/tests/data-loader-nag.test.js +432 -0
- package/lib/utilities/tests/data-loader.test.d.ts +1 -0
- package/lib/utilities/tests/data-loader.test.js +284 -0
- package/lib/webapp/frontend-construct.d.ts +136 -0
- package/lib/webapp/frontend-construct.js +253 -0
- package/lib/webapp/index.d.ts +1 -0
- package/lib/webapp/index.js +18 -0
- package/lib/webapp/tests/frontend-construct-nag.test.d.ts +1 -0
- package/lib/webapp/tests/frontend-construct-nag.test.js +266 -0
- package/lib/webapp/tests/frontend-construct.test.d.ts +1 -0
- package/lib/webapp/tests/frontend-construct.test.js +385 -0
- package/package.json +183 -0
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extractors for different document types from Textract API results
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Extract key information from identity documents (AnalyzeID results).
 *
 * Only the first entry of `IdentityDocuments` is read. Every field that has
 * both a type and a non-empty detected value is stored under
 * `identityDocumentFields` as `{ value, confidence }`, and well-known fields
 * are additionally copied to camelCase top-level properties
 * (e.g. `FIRST_NAME` -> `firstName`).
 *
 * @param {Object} analyzeIdResult - Result from Textract AnalyzeID API
 * @returns {Object} Extracted identity information. When the input is missing
 *   or contains no documents, the empty default structure is returned.
 */
export function extractIdentityDocumentInfo(analyzeIdResult) {
  const extractedInfo = {
    documentType: 'IDENTITY_DOCUMENT',
    identityDocumentFields: {},
    identityDocumentType: null
  };

  if (!analyzeIdResult || !analyzeIdResult.IdentityDocuments || analyzeIdResult.IdentityDocuments.length === 0) {
    return extractedInfo;
  }

  const idDoc = analyzeIdResult.IdentityDocuments[0];
  const detectedFields = extractedInfo.identityDocumentFields;

  // Process identity document fields (fields without a type or without
  // detected text are skipped).
  if (idDoc.IdentityDocumentFields) {
    idDoc.IdentityDocumentFields.forEach(field => {
      if (field.Type && field.Type.Text && field.ValueDetection && field.ValueDetection.Text) {
        detectedFields[field.Type.Text] = {
          value: field.ValueDetection.Text,
          confidence: field.ValueDetection.Confidence
        };
      }
    });
  }

  // An explicit DocumentType still wins when present; otherwise fall back to
  // the ID_TYPE field, which is where AnalyzeID reports the kind of document.
  // Without the fallback this property stayed null for such responses.
  const idTypeValue = detectedFields.ID_TYPE ? detectedFields.ID_TYPE.value : null;
  extractedInfo.identityDocumentType = idDoc.DocumentType || idTypeValue || null;

  // Extract common fields for easier access.
  // EXPIRATION_DATE is listed before DATE_OF_EXPIRY so that DATE_OF_EXPIRY,
  // when present, keeps precedence exactly as before.
  const commonFields = {
    'FIRST_NAME': 'firstName',
    'LAST_NAME': 'lastName',
    'MIDDLE_NAME': 'middleName',
    'FULL_NAME': 'fullName',
    'DATE_OF_BIRTH': 'dateOfBirth',
    'DATE_OF_ISSUE': 'dateOfIssue',
    'EXPIRATION_DATE': 'dateOfExpiry',
    'DATE_OF_EXPIRY': 'dateOfExpiry',
    'DOCUMENT_NUMBER': 'documentNumber',
    'DOCUMENT_ID': 'documentId',
    'ADDRESS': 'address',
    'ISSUED_BY': 'issuedBy',
    'GENDER': 'gender',
    'NATIONALITY': 'nationality',
    'COUNTRY': 'country',
    'PLACE_OF_BIRTH': 'placeOfBirth'
  };

  Object.entries(commonFields).forEach(([apiField, normalizedField]) => {
    if (detectedFields[apiField]) {
      extractedInfo[normalizedField] = detectedFields[apiField].value;
    }
  });

  return extractedInfo;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Extract key information from expense documents (AnalyzeExpense results).
 *
 * Only the first entry of `ExpenseDocuments` is read. Every typed summary
 * field is recorded under `summaryFields` as `{ value, confidence }`; a
 * handful of well-known types are also promoted to convenience properties
 * (`total`, `subtotal`, `tax`, `vendor`, `dates.*`, `paymentInfo.*`).
 * Line items become plain objects keyed by the lower-cased field type with
 * whitespace runs replaced by `_`.
 *
 * @param {Object} analyzeExpenseResult - Result from Textract AnalyzeExpense API
 * @returns {Object} Extracted expense information. When the input is missing
 *   or contains no documents, the empty default structure is returned.
 */
export function extractExpenseInfo(analyzeExpenseResult) {
  const extractedInfo = {
    documentType: 'EXPENSE_DOCUMENT',
    expenseDocumentType: null, // INVOICE or RECEIPT
    summaryFields: {},
    lineItems: [],
    vendor: null,
    total: null,
    subtotal: null,
    tax: null,
    paymentInfo: {},
    dates: {}
  };

  if (!analyzeExpenseResult || !analyzeExpenseResult.ExpenseDocuments || analyzeExpenseResult.ExpenseDocuments.length === 0) {
    return extractedInfo;
  }

  const expenseDoc = analyzeExpenseResult.ExpenseDocuments[0];

  // Determine if it's an invoice or receipt
  if (expenseDoc.Type) {
    extractedInfo.expenseDocumentType = expenseDoc.Type.Text;
  }

  // Process summary fields
  if (expenseDoc.SummaryFields) {
    expenseDoc.SummaryFields.forEach(field => {
      if (!(field.Type && field.Type.Text && field.ValueDetection)) {
        return;
      }

      const fieldName = field.Type.Text;
      const fieldValue = field.ValueDetection.Text;

      extractedInfo.summaryFields[fieldName] = {
        value: fieldValue,
        confidence: field.ValueDetection.Confidence
      };

      // A detection without text carries no usable value. Promoting it would
      // replace the `null` defaults (or an earlier real value) with
      // `undefined`, so it is recorded in summaryFields only.
      if (fieldValue === undefined || fieldValue === null) {
        return;
      }

      // Extract common fields for easier access
      switch (fieldName) {
        case 'TOTAL':
          extractedInfo.total = fieldValue;
          break;
        case 'SUBTOTAL':
          extractedInfo.subtotal = fieldValue;
          break;
        case 'TAX':
        case 'TOTAL_TAX':
          extractedInfo.tax = fieldValue;
          break;
        case 'VENDOR_NAME':
          extractedInfo.vendor = fieldValue;
          break;
        case 'INVOICE_RECEIPT_DATE':
        case 'RECEIPT_DATE':
        case 'INVOICE_DATE':
          extractedInfo.dates.documentDate = fieldValue;
          break;
        case 'PAYMENT_DATE':
        case 'DUE_DATE':
          extractedInfo.dates.dueDate = fieldValue;
          break;
        case 'PAYMENT_TERMS':
          extractedInfo.paymentInfo.terms = fieldValue;
          break;
        case 'PAYMENT_METHOD':
          extractedInfo.paymentInfo.method = fieldValue;
          break;
      }
    });
  }

  // Process line items
  if (expenseDoc.LineItemGroups) {
    expenseDoc.LineItemGroups.forEach(group => {
      if (!group.LineItems) {
        return;
      }

      group.LineItems.forEach(item => {
        const lineItem = {};

        if (item.LineItemExpenseFields) {
          item.LineItemExpenseFields.forEach(field => {
            if (field.Type && field.Type.Text && field.ValueDetection) {
              const fieldName = field.Type.Text.toLowerCase().replace(/\s+/g, '_');
              lineItem[fieldName] = field.ValueDetection.Text;
            }
          });
        }

        // Items that produced no fields at all are dropped.
        if (Object.keys(lineItem).length > 0) {
          extractedInfo.lineItems.push(lineItem);
        }
      });
    });
  }

  return extractedInfo;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Extract key information from lending documents (AnalyzeLending results).
 *
 * Reads the first entry of `LendingDocuments`. Each typed field is stored
 * under `extractedFields` as `{ value, confidence }` and, depending on the
 * first keyword its name contains (LOAN, PROPERTY, BORROWER, LENDER, PAYMENT,
 * checked in that order), its value is also filed under the matching
 * `*Info` section. Signatures with a value detection are listed with their
 * text, confidence and page.
 *
 * @param {Object} analyzeLendingResult - Result from Textract AnalyzeLending API
 * @returns {Object} Extracted lending information
 */
export function extractLendingInfo(analyzeLendingResult) {
  const summary = {
    documentType: 'LENDING_DOCUMENT',
    lendingType: null,
    extractedFields: {},
    signatures: [],
    paymentInfo: {},
    loanInfo: {},
    propertyInfo: {},
    borrowerInfo: {},
    lenderInfo: {}
  };

  const documents = analyzeLendingResult && analyzeLendingResult.LendingDocuments;
  if (!documents || documents.length === 0) {
    return summary;
  }

  const firstDocument = documents[0];

  // Get lending type
  if (firstDocument.Type) {
    summary.lendingType = firstDocument.Type.Text;
  }

  // Keyword -> section, in the order the keywords are tested.
  const sections = [
    ['LOAN', summary.loanInfo],
    ['PROPERTY', summary.propertyInfo],
    ['BORROWER', summary.borrowerInfo],
    ['LENDER', summary.lenderInfo],
    ['PAYMENT', summary.paymentInfo]
  ];

  // Process extracted fields
  if (firstDocument.ExtractedFields) {
    firstDocument.ExtractedFields.forEach(entry => {
      const isUsable = entry.Type && entry.Type.Text && entry.ValueDetection;
      if (!isUsable) {
        return;
      }

      const name = entry.Type.Text;
      const text = entry.ValueDetection.Text;

      summary.extractedFields[name] = {
        value: text,
        confidence: entry.ValueDetection.Confidence
      };

      // Categorize the field into the first section whose keyword it contains.
      const matched = sections.find(([keyword]) => name.includes(keyword));
      if (matched) {
        matched[1][name] = text;
      }
    });
  }

  // Process signatures
  if (firstDocument.Signatures) {
    firstDocument.Signatures.forEach(entry => {
      if (!entry.ValueDetection) {
        return;
      }
      const detection = entry.ValueDetection;
      summary.signatures.push({
        value: detection.Text,
        confidence: detection.Confidence,
        page: entry.Page
      });
    });
  }

  return summary;
}
|
|
246
|
+
|
|
247
|
+
/**
 * Extract key information from generic documents (AnalyzeDocument results).
 *
 * Produces:
 *  - `pages`: number of PAGE blocks
 *  - `formFields`: key text -> value text for every KEY block whose linked
 *    VALUE block yields non-empty text (a later duplicate key text overwrites
 *    an earlier one)
 *  - `tables`: one entry per TABLE block with its CELL texts arranged as rows
 *  - `text`: all LINE texts joined with newlines, ordered by page and then by
 *    the top edge of the bounding box
 *
 * Blocks are indexed by Id once so that following relationships does not
 * rescan the whole block list; word order follows the order of the ids in
 * each CHILD relationship.
 *
 * @param {Object} analyzeDocumentResult - Result from Textract AnalyzeDocument API
 * @returns {Object} Extracted document information. When the input is missing
 *   or has no `Blocks`, the empty default structure is returned.
 */
export function extractDocumentInfo(analyzeDocumentResult) {
  const extractedInfo = {
    documentType: 'GENERIC_DOCUMENT',
    formFields: {},
    tables: [],
    text: '',
    pages: 0
  };

  if (!analyzeDocumentResult || !analyzeDocumentResult.Blocks) {
    return extractedInfo;
  }

  const blocks = analyzeDocumentResult.Blocks;
  const blockById = new Map(blocks.map(block => [block.Id, block]));

  // Ids referenced by all relationships of the given type on a block.
  const relatedIds = (block, relationshipType) => (block.Relationships || [])
    .filter(rel => rel.Type === relationshipType)
    .flatMap(rel => rel.Ids || []);

  // Child blocks of a given BlockType; unknown ids are ignored.
  const childBlocks = (block, blockType) => relatedIds(block, 'CHILD')
    .map(id => blockById.get(id))
    .filter(child => child && child.BlockType === blockType);

  // Space-joined text of the WORD children of a block.
  const wordText = block => childBlocks(block, 'WORD')
    .map(word => word.Text)
    .join(' ')
    .trim();

  const hasEntityType = (block, entityType) =>
    Boolean(block.EntityTypes) && block.EntityTypes.includes(entityType);

  // Get page count
  extractedInfo.pages = blocks.filter(block => block.BlockType === 'PAGE').length;

  // Extract form fields (key-value pairs)
  blocks.forEach(block => {
    if (block.BlockType !== 'KEY_VALUE_SET' || !hasEntityType(block, 'KEY')) {
      return;
    }

    const valueIds = relatedIds(block, 'VALUE');
    if (valueIds.length === 0) {
      return;
    }

    // When a key references several values, the last one is used.
    const valueBlock = blockById.get(valueIds[valueIds.length - 1]);
    const isValueBlock = valueBlock
      && valueBlock.BlockType === 'KEY_VALUE_SET'
      && hasEntityType(valueBlock, 'VALUE')
      && !hasEntityType(valueBlock, 'KEY');
    if (!isValueBlock) {
      return;
    }

    const keyText = wordText(block);
    const valueText = wordText(valueBlock);
    if (keyText && valueText) {
      extractedInfo.formFields[keyText] = valueText;
    }
  });

  // Extract tables
  blocks
    .filter(block => block.BlockType === 'TABLE')
    .forEach(tableBlock => {
      const table = {
        rows: [],
        rowCount: tableBlock.RowCount || 0,
        columnCount: tableBlock.ColumnCount || 0,
        page: tableBlock.Page || 1
      };

      // Group cells by row
      const cellsByRow = new Map();
      childBlocks(tableBlock, 'CELL').forEach(cell => {
        if (!cellsByRow.has(cell.RowIndex)) {
          cellsByRow.set(cell.RowIndex, []);
        }
        cellsByRow.get(cell.RowIndex).push(cell);
      });

      // Sort rows and cells, then extract cell text
      [...cellsByRow.keys()]
        .sort((rowA, rowB) => rowA - rowB)
        .forEach(rowIndex => {
          const orderedCells = cellsByRow.get(rowIndex)
            .sort((a, b) => a.ColumnIndex - b.ColumnIndex);
          table.rows.push(orderedCells.map(cell => wordText(cell)));
        });

      extractedInfo.tables.push(table);
    });

  // Extract full text. Lines without a page default to page 1 (as tables do)
  // and lines without geometry sort as if at the top of the page, so neither
  // breaks the comparator.
  const pageOf = block => block.Page || 1;
  const topOf = block =>
    (block.Geometry && block.Geometry.BoundingBox && block.Geometry.BoundingBox.Top) || 0;

  const lineBlocks = blocks
    .filter(block => block.BlockType === 'LINE')
    .sort((a, b) => {
      if (pageOf(a) !== pageOf(b)) return pageOf(a) - pageOf(b);
      return topOf(a) - topOf(b);
    });

  extractedInfo.text = lineBlocks.map(block => block.Text).join('\n');

  return extractedInfo;
}
|
|
439
|
+
|
|
440
|
+
/**
 * Route a Textract analysis result to the extractor matching the API that
 * produced it.
 *
 * @param {Object} analysisResult - Result from Textract API
 * @param {string} apiUsed - The Textract API that was used
 * @returns {Object} Extracted information, or `{ error }` when no result was
 *   supplied
 */
export function processAnalysisResult(analysisResult, apiUsed) {
  if (!analysisResult) {
    return { error: 'No analysis result provided' };
  }

  if (apiUsed === 'AnalyzeID') {
    return extractIdentityDocumentInfo(analysisResult);
  }

  if (apiUsed === 'AnalyzeExpense') {
    return extractExpenseInfo(analysisResult);
  }

  if (apiUsed === 'AnalyzeLending') {
    return extractLendingInfo(analysisResult);
  }

  // 'AnalyzeDocument' and any unrecognised API name use the generic extractor.
  return extractDocumentInfo(analysisResult);
}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
import { S3Client, CopyObjectCommand, DeleteObjectCommand } from '@aws-sdk/client-s3';
|
|
4
|
+
import { DynamoDBClient, PutItemCommand } from '@aws-sdk/client-dynamodb';
|
|
5
|
+
import { processDocument } from './classifier.mjs';
|
|
6
|
+
|
|
7
|
+
// Module-level S3 client, constructed once when the module is loaded, using the region from the AWS_REGION environment variable.
const s3Client = new S3Client({region: process.env.AWS_REGION});
|
|
8
|
+
// Module-level DynamoDB client, constructed once when the module is loaded, using the region from the AWS_REGION environment variable.
const ddbClient = new DynamoDBClient({region: process.env.AWS_REGION});
|
|
9
|
+
|
|
10
|
+
/**
 * Collect everything an event-style stream emits into a single Buffer.
 *
 * @param {Object} stream - Object exposing `on(event, listener)` that emits
 *   'data', 'error' and 'end' events
 * @returns {Promise<Buffer>} Resolves on 'end' with all chunks concatenated in
 *   arrival order; rejects with the emitted error on 'error'
 */
async function streamToBuffer(stream) {
  const parts = [];

  return new Promise((resolve, reject) => {
    const collect = (piece) => {
      parts.push(Buffer.from(piece));
    };
    const finish = () => {
      resolve(Buffer.concat(parts));
    };

    stream.on('data', collect);
    stream.on('error', reject);
    stream.on('end', finish);
  });
}
|
|
18
|
+
|
|
19
|
+
/**
 * Move an S3 object within a bucket by copying it under the prefix that
 * corresponds to `status` and then deleting the original.
 *
 * The destination key is the prefix followed by the last path segment of
 * `sourceKey`.
 *
 * @param {string} bucket - Bucket holding both source and destination
 * @param {string} sourceKey - Current (unencoded) key of the object
 * @param {string} status - "PROCESSED" selects OUTPUT_PREFIX; any other value
 *   selects FAILED_PREFIX
 * @returns {Promise<string>} The destination key
 * @throws {Error} When the required prefix environment variable is not set,
 *   or when the copy/delete request fails (the error is logged and rethrown)
 */
async function moveFile(bucket, sourceKey, status) {
  try {
    // Get the appropriate prefix based on status
    const isProcessed = status === "PROCESSED";
    const destinationPrefix = isProcessed
      ? process.env.OUTPUT_PREFIX
      : process.env.FAILED_PREFIX;

    if (!destinationPrefix) {
      throw new Error(`${isProcessed ? 'OUTPUT_PREFIX' : 'FAILED_PREFIX'} environment variable is not set`);
    }

    // Extract filename from the source key
    const keySegments = sourceKey.split('/');
    const fileName = keySegments[keySegments.length - 1];
    const destinationKey = `${destinationPrefix}${fileName}`;

    // CopySource must be URL-encoded; each path segment is encoded on its own
    // so the '/' separators survive. Without this, keys containing spaces,
    // '+' or non-ASCII characters cannot be copied.
    const encodedSourceKey = keySegments.map(encodeURIComponent).join('/');

    // Copy the object to new location
    await s3Client.send(new CopyObjectCommand({
      Bucket: bucket,
      CopySource: `${bucket}/${encodedSourceKey}`,
      Key: destinationKey
    }));

    // Delete the original object (Key is passed unencoded)
    await s3Client.send(new DeleteObjectCommand({
      Bucket: bucket,
      Key: sourceKey
    }));

    return destinationKey;
  } catch (error) {
    console.error('Error moving file:', error);
    throw error;
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * Lambda entry point: classify/extract a document, move it to the processed
 * prefix and record the outcome in DynamoDB.
 *
 * The bucket comes from the INPUT_BUCKET environment variable and the table
 * from METADATA_TABLE; `documentId` and `key` come from the event.
 *
 * On any failure the object is moved to the failed prefix from wherever it
 * currently lives, and a FAILED result is returned instead of throwing. If
 * that move itself fails, the error propagates.
 *
 * @param {Object} event - Invocation payload with `documentId` and `key`
 * @returns {Promise<Object>} `{ status: "PROCESSED", ... }` on success or
 *   `{ status: "FAILED", error, newLocation }` on failure
 */
export const handler = async (event) => {
  console.log('Event:', JSON.stringify(event, null, 2));
  const bucket = process.env.INPUT_BUCKET;

  // Key under which the object currently lives. Updated after a successful
  // move so that a later failure relocates it from the right place rather
  // than from the original key, which no longer exists at that point.
  let currentKey;

  // JSON.stringify(undefined) is undefined, which is not a valid string
  // attribute value; store "null" instead.
  const toJson = (value) => JSON.stringify(value === undefined ? null : value);

  try {
    const documentId = event.documentId;
    const key = event.key;
    currentKey = key;

    // Validate configuration before touching the document, so a missing
    // table name is reported without first moving the file.
    const tableName = process.env.METADATA_TABLE;
    if (!tableName) {
      throw new Error('METADATA_TABLE environment variable is not set');
    }

    // GenAI configuration removed - will be handled by Step Functions integration
    console.log('Using traditional Lambda processing (Textract + rules)');

    const processResults = await processDocument(bucket, key, {
      region: process.env.AWS_REGION,
      processWithRecommendedApi: true,
      genaiConfig: null // No GenAI in Lambda
    });

    console.log('Classification Method:', processResults.classificationMethod);
    console.log('GenAI Used:', processResults.genaiUsed);
    // NOTE(review): this writes the extracted entities to the logs; confirm
    // that is acceptable for documents containing personal data.
    console.log(processResults.extractedEntities);

    const newLocation = await moveFile(bucket, key, "PROCESSED");
    currentKey = newLocation;

    const metadata = {
      s3Location: newLocation,
      timestamp: new Date().toISOString()
    };

    // Store metadata in DynamoDB
    await ddbClient.send(
      new PutItemCommand({
        TableName: tableName,
        Item: {
          documentId: {
            S: documentId
          },
          s3Location: {
            S: metadata.s3Location
          },
          createdAt: {
            S: metadata.timestamp
          },
          classification: {
            S: toJson(processResults.classification)
          },
          classificationMethod: {
            S: processResults.classificationMethod || 'rule-based'
          },
          genaiUsed: {
            BOOL: processResults.genaiUsed || false
          },
          textractApiUsed: {
            S: toJson(processResults.apiUsed)
          },
          entities: {
            S: toJson(processResults.extractedEntities)
          }
        }
      })
    );

    return {
      status: "PROCESSED",
      message: 'Document processed successfully',
      metadata: metadata,
      newLocation,
      classificationMethod: processResults.classificationMethod,
      genaiUsed: processResults.genaiUsed
    };

  } catch (error) {
    console.error('Error processing document:', error);
    const newLocation = await moveFile(bucket, currentKey, "FAILED");
    return {
      status: "FAILED",
      error: error.message || 'Unknown error',
      newLocation
    };
  }
};
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { S3Client, GetObjectCommand } from "@aws-sdk/client-s3";
|
|
2
|
+
import { fileTypeFromBuffer } from 'file-type';
|
|
3
|
+
|
|
4
|
+
// Module-level S3 client, constructed once when the module is loaded, with an empty options object.
const s3Client = new S3Client({});
|
|
5
|
+
|
|
6
|
+
// Alternative spellings of an extension mapped to the spelling used for
// comparison. Content sniffing reports JPEG as "jpg" and TIFF as "tif".
const IMAGE_EXTENSION_ALIASES = new Map([
  ['jpeg', 'jpg'],
  ['tiff', 'tif']
]);

/**
 * Lambda entry point: check that an uploaded object is a supported image and
 * that its content matches the extension in its key.
 *
 * The bucket comes from the INPUT_BUCKET environment variable; `documentId`
 * and `key` come from the event. The object is downloaded and its type is
 * detected from the bytes. Validation problems never throw: they are
 * reported as an `INVALID` result carrying the error message.
 *
 * @param {Object} event - Invocation payload with `documentId` and `key`
 * @returns {Promise<Object>} `{ documentId, bucket, key, status: 'VALID',
 *   metadata, timestamp }` or `{ documentId, bucket, key, error,
 *   status: 'INVALID', timestamp }`
 */
export const handler = async (event) => {
  console.log('Received event:', JSON.stringify(event, null, 2));
  const { documentId, key } = event;
  const bucket = process.env.INPUT_BUCKET;

  const canonicalExtension = (ext) =>
    IMAGE_EXTENSION_ALIASES.has(ext) ? IMAGE_EXTENSION_ALIASES.get(ext) : ext;

  try {
    if (!documentId || !bucket || !key) {
      throw new Error('Missing required fields: documentId, bucket, or key');
    }

    // Get the claimed extension from the filename
    const claimedExt = key.split('.').pop().toLowerCase();
    const validExtensions = ['jpg', 'jpeg', 'png', 'tif', 'tiff'];

    if (!validExtensions.includes(claimedExt)) {
      throw new Error(`Invalid file format. Supported formats: JPG, PNG, TIFF. Received: ${claimedExt}`);
    }

    // Get the file from S3
    const getObjectResponse = await s3Client.send(
      new GetObjectCommand({
        Bucket: bucket,
        Key: key,
      })
    );

    // Read the file content
    const chunks = [];
    for await (const chunk of getObjectResponse.Body) {
      chunks.push(chunk);
    }
    const buffer = Buffer.concat(chunks);

    // Detect actual file type from content
    const fileType = await fileTypeFromBuffer(buffer);

    if (!fileType) {
      throw new Error('Could not determine file type from content');
    }

    // Validate that actual content type matches claimed extension. Both
    // sides are compared in canonical form so that ".jpeg" matches "jpg"
    // and ".tiff" matches "tif"; previously every ".tiff" upload was
    // rejected because the detected extension is "tif".
    const actualExt = fileType.ext.toLowerCase();
    if (canonicalExtension(actualExt) !== canonicalExtension(claimedExt)) {
      throw new Error(`File extension mismatch. Claimed: ${claimedExt}, Actual: ${actualExt}`);
    }

    return {
      documentId,
      bucket,
      key,
      status: 'VALID',
      metadata: {
        validatedAt: new Date().toISOString(),
        format: actualExt,
        mimeType: fileType.mime
      },
      timestamp: new Date().toISOString()
    };

  } catch (error) {
    console.error('Validation error:', error);
    return {
      documentId,
      bucket,
      key,
      error: error.message,
      status: 'INVALID',
      timestamp: new Date().toISOString()
    };
  }
};
|