@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -1,9 +1,11 @@
1
1
  import { Duration } from 'aws-cdk-lib';
2
2
  import { Role } from 'aws-cdk-lib/aws-iam';
3
3
  import { Function } from 'aws-cdk-lib/aws-lambda';
4
- import { StateMachine } from 'aws-cdk-lib/aws-stepfunctions';
4
+ import { IChainable } from 'aws-cdk-lib/aws-stepfunctions';
5
+ import { DynamoAttributeValue } from 'aws-cdk-lib/aws-stepfunctions-tasks';
5
6
  import { Construct } from 'constructs';
6
7
  import { BaseDocumentProcessing, BaseDocumentProcessingProps, DocumentProcessingStepType } from './base-document-processing';
8
+ import { ChunkingConfig } from './chunking-config';
7
9
  import { BedrockModelProps } from '../framework/bedrock';
8
10
  /**
9
11
  * Configuration properties for BedrockDocumentProcessing construct.
@@ -45,6 +47,74 @@ export interface BedrockDocumentProcessingProps extends BaseDocumentProcessingPr
45
47
  * @default Duration.minutes(5)
46
48
  */
47
49
  readonly stepTimeouts?: Duration;
50
+ /**
51
+ * Custom prompt template for aggregating results from multiple chunks.
52
+ * Used when chunking is enabled to merge processing results from all chunks
53
+ * into a single coherent result.
54
+ *
55
+ * The prompt receives the concatenated processing results from all chunks
56
+ * and should instruct the model to synthesize them into a unified output.
57
+ *
58
+ * @default DEFAULT_AGGREGATION_PROMPT
59
+ */
60
+ readonly aggregationPrompt?: string;
61
+ /**
62
+ * Enable PDF chunking for large documents.
63
+ *
64
+ * When enabled, documents exceeding configured thresholds will be automatically
65
+ * split into chunks, processed in parallel or sequentially, and their results aggregated.
66
+ *
67
+ * This feature is useful for:
68
+ * - Processing large PDFs (>100 pages)
69
+ * - Handling documents that exceed Bedrock token limits (~200K tokens)
70
+ * - Improving processing reliability for complex documents
71
+ * - Processing documents with variable content density
72
+ *
73
+ * The chunking workflow:
74
+ * 1. Analyzes PDF to determine page count and estimate token count
75
+ * 2. Decides if chunking is needed based on configured thresholds
76
+ * 3. If chunking is needed, splits PDF into chunks and uploads to S3
77
+ * 4. Processes each chunk through classification and extraction
78
+ * 5. Aggregates results using majority voting for classification
79
+ * 6. Deduplicates entities across chunks
80
+ * 7. Cleans up temporary chunk files from S3
81
+ *
82
+ * @default false
83
+ */
84
+ readonly enableChunking?: boolean;
85
+ /**
86
+ * Configuration for PDF chunking behavior.
87
+ *
88
+ * Only applies when `enableChunking` is true. Allows customization of:
89
+ * - **Chunking strategy**: How documents are split (fixed-pages, token-based, or hybrid)
90
+ * - **Thresholds**: When to trigger chunking based on page count or token count
91
+ * - **Chunk size and overlap**: Control chunk boundaries and context preservation
92
+ * - **Processing mode**: Parallel (faster) or sequential (cost-optimized)
93
+ * - **Aggregation strategy**: How to combine results from multiple chunks
94
+ *
95
+ * ## Default Configuration
96
+ *
97
+ * If not provided, uses sensible defaults optimized for most use cases:
98
+ * - Strategy: `'hybrid'` (recommended - balances token and page limits)
99
+ * - Page threshold: 100 pages
100
+ * - Token threshold: 150,000 tokens
101
+ * - Processing mode: `'parallel'`
102
+ * - Max concurrency: 10
103
+ * - Aggregation strategy: `'majority-vote'`
104
+ *
105
+ * ## Strategy Comparison
106
+ *
107
+ * | Strategy | Best For | Pros | Cons |
108
+ * |----------|----------|------|------|
109
+ * | `hybrid` | Most documents | Balances token/page limits | Slightly more complex |
110
+ * | `token-based` | Variable density | Respects model limits | Slower analysis |
111
+ * | `fixed-pages` | Uniform density | Simple, fast | May exceed token limits |
112
+ *
113
+ * @default undefined (uses default configuration when enableChunking is true)
114
+ *
115
+ * @see {@link ChunkingConfig} for detailed configuration options
116
+ */
117
+ readonly chunkingConfig?: ChunkingConfig;
48
118
  }
49
119
  /**
50
120
  * Document processing workflow powered by Amazon Bedrock foundation models.
@@ -79,10 +149,29 @@ export interface BedrockDocumentProcessingProps extends BaseDocumentProcessingPr
79
149
  export declare class BedrockDocumentProcessing extends BaseDocumentProcessing {
80
150
  protected static readonly DEFAULT_CLASSIFICATION_PROMPT = "\n Analyze the document below, and classify the type of document it is (eg. INVOICE, IDENTITY_DOCUMENT, RECEIPT, etc). The result should be in JSON and should follow the following structure (only respond in JSON with the following structure and do not use markdown to indicate the json, just output plain old json with nothing else):\n\n {\n documentClassification: <CLASSIFICATION>\n }\n\n Attached document is as follows:\n\n ";
81
151
  protected static readonly DEFAULT_PROCESSING_PROMPT = "\n The document below has been classified as [ACTUAL_CLASSIFICATION]. Extract important entities from the document and return the result as JSON following the structure below (only respond in JSON with the following structure and do not use markdown to indicate the json, just output plain old json with nothing else):\n\n {\n documentClassification: <CLASSIFICATION>,\n result: {\n entities: [\n {\n type: <TYPE OF ENTITY>\n value: <VALUE OF ENTITY>\n },\n ...\n ]\n }\n }\n\n Attached document is as follows:\n\n ";
152
+ protected static readonly DEFAULT_AGGREGATION_PROMPT = "\n You are given the processing results from multiple chunks of a large document that was split for processing.\n Your task is to synthesize these chunk results into a single, coherent final result.\n\n Instructions:\n 1. Review all the chunk results provided below\n 2. Merge and deduplicate any overlapping information (chunks may have overlapping pages)\n 3. Synthesize the information into a unified, coherent result\n 4. Maintain the same output format as the individual chunk results\n 5. If the chunks contain summaries, create a comprehensive summary that covers all sections\n 6. If the chunks contain entities, deduplicate and consolidate them\n 7. Preserve important details from each chunk while avoiding redundancy\n\n Return the result as JSON (only respond in JSON without markdown formatting):\n\n {\n \"result\": <SYNTHESIZED_RESULT>\n }\n\n The chunk results to aggregate are as follows:\n\n ";
82
153
  /** Configuration properties specific to Bedrock document processing */
83
154
  protected readonly bedrockDocumentProcessingProps: BedrockDocumentProcessingProps;
84
155
  /** The Step Functions state machine that orchestrates the document processing workflow */
85
- readonly stateMachine: StateMachine;
156
+ readonly stateMachine: import("aws-cdk-lib/aws-stepfunctions").StateMachine;
157
+ /** Cached classification Lambda function to avoid duplicate resource creation */
158
+ private _classificationFunction?;
159
+ /** Cached processing Lambda function to avoid duplicate resource creation */
160
+ private _processingFunction?;
161
+ /** Counter for generating unique classification step IDs */
162
+ private _classificationStepCounter;
163
+ /** Counter for generating unique processing step IDs */
164
+ private _processingStepCounter;
165
+ /** Cached aggregation Lambda function to avoid duplicate resource creation */
166
+ private _aggregationFunction?;
167
+ /** Counter for generating unique aggregation step IDs */
168
+ private _aggregationStepCounter;
169
+ /** Cached cleanup Lambda function to avoid duplicate resource creation */
170
+ private _cleanupFunction?;
171
+ /** Counter for generating unique enrichment step IDs */
172
+ private _enrichmentStepCounter;
173
+ /** Counter for generating unique post-processing step IDs */
174
+ private _postProcessingStepCounter;
86
175
  /**
87
176
  * Creates a new BedrockDocumentProcessing construct.
88
177
  *
@@ -93,8 +182,23 @@ export declare class BedrockDocumentProcessing extends BaseDocumentProcessing {
93
182
  * @param scope - The scope in which to define this construct
94
183
  * @param id - The scoped construct ID. Must be unique within the scope.
95
184
  * @param props - Configuration properties for the Bedrock document processing pipeline
185
+ * @throws Error if chunking configuration is invalid
96
186
  */
97
187
  constructor(scope: Construct, id: string, props: BedrockDocumentProcessingProps);
188
+ /**
189
+ * Validates the chunking configuration parameters.
190
+ *
191
+ * Ensures that:
192
+ * - Chunk size is greater than 0
193
+ * - Overlap is non-negative and less than chunk size
194
+ * - Thresholds are greater than 0
195
+ * - Max concurrency is greater than 0
196
+ * - Min success threshold is between 0 and 1
197
+ *
198
+ * @param config - The chunking configuration to validate
199
+ * @throws Error if any configuration parameter is invalid
200
+ */
201
+ private validateChunkingConfig;
98
202
  /**
99
203
  * Implements the document classification step using Amazon Bedrock.
100
204
  *
@@ -102,6 +206,9 @@ export declare class BedrockDocumentProcessing extends BaseDocumentProcessing {
102
206
  * the document type. The function reads the document from S3 and sends it to
103
207
  * Bedrock with the classification prompt.
104
208
  *
209
+ * This method caches the Lambda function to avoid creating duplicate resources,
210
+ * but creates a new LambdaInvoke task each time to allow proper state chaining.
211
+ *
105
212
  * @returns LambdaInvoke task configured for document classification
106
213
  */
107
214
  protected classificationStep(): DocumentProcessingStepType;
@@ -112,6 +219,9 @@ export declare class BedrockDocumentProcessing extends BaseDocumentProcessing {
112
219
  * structured data from the document. Uses the classification result from the
113
220
  * previous step to provide context for more accurate extraction.
114
221
  *
222
+ * This method caches the Lambda function to avoid creating duplicate resources,
223
+ * but creates a new LambdaInvoke task each time to allow proper state chaining.
224
+ *
115
225
  * @returns LambdaInvoke task configured for document extraction
116
226
  */
117
227
  protected processingStep(): DocumentProcessingStepType;
@@ -136,4 +246,94 @@ export declare class BedrockDocumentProcessing extends BaseDocumentProcessing {
136
246
  * @returns LambdaInvoke task for post-processing, or undefined to skip this step
137
247
  */
138
248
  protected postProcessingStep(): DocumentProcessingStepType | undefined;
249
+ /**
250
+ * Implements the optional preprocessing step for PDF chunking.
251
+ *
252
+ * When chunking is enabled, creates a Lambda function that analyzes PDFs and
253
+ * splits large documents into manageable chunks. The function:
254
+ * 1. Analyzes the PDF to determine page count and token estimates
255
+ * 2. Decides if chunking is needed based on configured thresholds
256
+ * 3. If chunking is needed, splits the PDF and uploads chunks to S3
257
+ *
258
+ * @returns LambdaInvoke task for PDF analysis and chunking, or undefined if chunking is disabled
259
+ */
260
+ protected preprocessingStep(): DocumentProcessingStepType | undefined;
261
+ /**
262
+ * Provides additional metadata fields for chunking to be stored in DynamoDB.
263
+ *
264
+ * When chunking is enabled, adds fields for:
265
+ * - ChunkingEnabled: string representation of boolean flag
266
+ * - ChunkingStrategy: strategy used (fixed-pages, token-based, hybrid)
267
+ * - TokenAnalysis: JSON string with token analysis results
268
+ * - ChunkMetadata: JSON string array with chunk information
269
+ *
270
+ * @returns Record of DynamoDB attribute values for chunking metadata
271
+ */
272
+ protected preprocessingMetadata(): Record<string, DynamoAttributeValue>;
273
+ /**
274
+ * Creates the processing workflow with conditional branching for chunked documents.
275
+ *
276
+ * When chunking is enabled, creates a Choice state that:
277
+ * - Routes to chunked processing flow if document was chunked
278
+ * - Routes to standard processing flow if document was not chunked
279
+ *
280
+ * When chunking is disabled, returns the standard processing workflow.
281
+ *
282
+ * @returns Step Functions chain for processing the document
283
+ */
284
+ protected createProcessingWorkflow(): IChainable;
285
+ /**
286
+ * Creates the chunked processing flow for large documents.
287
+ *
288
+ * This flow:
289
+ * 1. Uses a Map state to process each chunk in parallel (or sequentially)
290
+ * 2. Each chunk goes through classification and processing
291
+ * 3. Results are aggregated using the aggregation Lambda
292
+ * 4. DynamoDB is updated with the aggregated result
293
+ * 5. Temporary chunks are cleaned up from S3
294
+ *
295
+ * @returns Step Functions chain for chunked document processing
296
+ */
297
+ private createChunkedProcessingFlow;
298
+ /**
299
+ * Creates the aggregation step for combining chunk results using Bedrock.
300
+ *
301
+ * Uses the same Bedrock invoke Lambda pattern as the processing step but with
302
+ * a different prompt designed for aggregating multiple chunk results.
303
+ * The chunk processing results are passed as text data to the model.
304
+ *
305
+ * @returns LambdaInvoke task for result aggregation
306
+ */
307
+ private createAggregationStep;
308
+ /**
309
+ * Creates the DynamoDB update step for storing aggregated results.
310
+ *
311
+ * Updates the document record with:
312
+ * - AggregatedResult: JSON string with classification, entities, and summary
313
+ * - WorkflowStatus: 'complete'
314
+ *
315
+ * @returns DynamoUpdateItem task for storing aggregated results
316
+ */
317
+ private createUpdateAggregatedResultStep;
318
+ /**
319
+ * Creates the cleanup Lambda step for removing temporary chunk files.
320
+ *
321
+ * The cleanup Lambda:
322
+ * - Deletes all chunk files from S3 chunks/ prefix
323
+ * - Uses batch delete for efficiency (up to 1000 objects per request)
324
+ * - Logs errors but doesn't fail the workflow
325
+ *
326
+ * @returns LambdaInvoke task for chunk cleanup
327
+ */
328
+ private createCleanupStep;
329
+ /**
330
+ * Creates the error handler for aggregation failures.
331
+ *
332
+ * When aggregation fails:
333
+ * - Updates DynamoDB with 'aggregation-failure' status
334
+ * - Moves document to failed/ prefix
335
+ *
336
+ * @returns Step Functions chain for handling aggregation errors
337
+ */
338
+ private createAggregationErrorHandler;
139
339
  }