@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/.jsii +2537 -204
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.js +1 -1
  66. package/lib/framework/tests/access-log.test.js +5 -2
  67. package/lib/framework/tests/batch-agent.test.js +5 -2
  68. package/lib/framework/tests/bedrock.test.js +5 -2
  69. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  70. package/lib/framework/tests/framework-nag.test.js +16 -8
  71. package/lib/framework/tests/network.test.js +9 -4
  72. package/lib/tsconfig.tsbuildinfo +1 -1
  73. package/lib/utilities/data-loader.js +1 -1
  74. package/lib/utilities/lambda-iam-utils.js +1 -1
  75. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  76. package/lib/utilities/observability/default-observability-config.js +1 -1
  77. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  78. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  79. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  80. package/lib/utilities/observability/powertools-config.js +19 -3
  81. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  82. package/lib/utilities/test-utils.d.ts +43 -0
  83. package/lib/utilities/test-utils.js +56 -0
  84. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  85. package/lib/utilities/tests/data-loader.test.js +3 -2
  86. package/lib/webapp/frontend-construct.js +1 -1
  87. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  88. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  89. package/package.json +6 -5
  90. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  91. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,614 @@
1
+ /**
2
+ * Chunking strategy options for PDF document processing.
3
+ *
4
+ * Choose the strategy based on your document characteristics:
5
+ *
6
+ * - **fixed-pages**: Legacy approach, splits by fixed page count.
7
+ * - Pros: Fast, simple, predictable chunk sizes
8
+ * - Cons: May exceed token limits for dense documents
9
+ * - Best for: Uniform density documents, simple text
10
+ *
11
+ * - **token-based**: Splits based on token count to respect model limits.
12
+ * - Pros: Respects model token limits, handles variable density
13
+ * - Cons: Slower analysis, variable chunk sizes
14
+ * - Best for: Variable density documents, technical content
15
+ *
16
+ * - **hybrid**: RECOMMENDED - Balances token count and page limits.
17
+ * - Pros: Best of both worlds, reliable, flexible
18
+ * - Cons: Slightly more complex configuration
19
+ * - Best for: Most documents, general-purpose processing
20
+ */
21
+ export type ChunkingStrategy = 'fixed-pages' | 'token-based' | 'hybrid';
22
+ /**
23
+ * Processing mode for chunked documents.
24
+ *
25
+ * - **sequential**: Process chunks one at a time (cost-optimized)
26
+ * - **parallel**: Process multiple chunks simultaneously (speed-optimized)
27
+ */
28
+ export type ProcessingMode = 'sequential' | 'parallel';
29
+ /**
30
+ * Aggregation strategy for combining chunk results.
31
+ *
32
+ * - **majority-vote**: Select most frequent classification across chunks
33
+ * - **weighted-vote**: Weight early chunks higher than later chunks
34
+ * - **first-chunk**: Use classification from first chunk only
35
+ */
36
+ export type AggregationStrategy = 'majority-vote' | 'weighted-vote' | 'first-chunk';
37
+ /**
38
+ * Configuration for fixed-pages chunking strategy.
39
+ * Splits documents by fixed page count (legacy approach).
40
+ */
41
+ export interface FixedPagesConfig {
42
+ /**
43
+ * Threshold for triggering chunking based on page count.
44
+ * Documents with pages > threshold will be chunked.
45
+ * @default 100
46
+ */
47
+ readonly pageThreshold?: number;
48
+ /**
49
+ * Number of pages per chunk.
50
+ * @default 50
51
+ */
52
+ readonly chunkSize?: number;
53
+ /**
54
+ * Number of overlapping pages between consecutive chunks.
55
+ * Must be less than chunkSize.
56
+ * @default 5
57
+ */
58
+ readonly overlapPages?: number;
59
+ }
60
+ /**
61
+ * Configuration for token-based chunking strategy.
62
+ * Splits documents based on estimated token count to respect model limits.
63
+ */
64
+ export interface TokenBasedConfig {
65
+ /**
66
+ * Threshold for triggering chunking based on token count.
67
+ * Documents with tokens > threshold will be chunked.
68
+ * @default 150000
69
+ */
70
+ readonly tokenThreshold?: number;
71
+ /**
72
+ * Maximum tokens per chunk.
73
+ * Ensures no chunk exceeds model token limits.
74
+ * @default 100000
75
+ */
76
+ readonly maxTokensPerChunk?: number;
77
+ /**
78
+ * Number of overlapping tokens between consecutive chunks.
79
+ * Provides context continuity across chunks.
80
+ * @default 5000
81
+ */
82
+ readonly overlapTokens?: number;
83
+ }
84
+ /**
85
+ * Configuration for hybrid chunking strategy (RECOMMENDED).
86
+ * Balances token count and page limits for optimal chunking.
87
+ */
88
+ export interface HybridConfig {
89
+ /**
90
+ * Hard limit on pages per chunk.
91
+ * Prevents very large chunks even if token count is low.
92
+ * Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99
93
+ * to provide a safety margin.
94
+ * @default 99
95
+ */
96
+ readonly maxPagesPerChunk?: number;
97
+ /**
98
+ * Soft target for tokens per chunk.
99
+ * Chunks aim for this token count but respect maxPagesPerChunk.
100
+ * @default 80000
101
+ */
102
+ readonly targetTokensPerChunk?: number;
103
+ /**
104
+ * Threshold for triggering chunking based on page count.
105
+ * Documents with pages > threshold will be chunked.
106
+ * @default 100
107
+ */
108
+ readonly pageThreshold?: number;
109
+ /**
110
+ * Threshold for triggering chunking based on token count.
111
+ * Documents with tokens > threshold will be chunked.
112
+ * @default 150000
113
+ */
114
+ readonly tokenThreshold?: number;
115
+ /**
116
+ * Number of overlapping tokens between consecutive chunks.
117
+ * Provides context continuity across chunks.
118
+ * @default 5000
119
+ */
120
+ readonly overlapTokens?: number;
121
+ }
122
+ /**
123
+ * Comprehensive configuration for PDF chunking behavior.
124
+ *
125
+ * This interface provides fine-grained control over how large PDF documents are
126
+ * split into manageable chunks for processing. The chunking system supports three
127
+ * strategies, each optimized for different document types and use cases.
128
+ *
129
+ * ## Chunking Strategies
130
+ *
131
+ * ### 1. Hybrid Strategy (RECOMMENDED)
132
+ * Balances both token count and page limits for optimal chunking. Best for most
133
+ * documents as it respects model token limits while preventing excessively large chunks.
134
+ *
135
+ * ### 2. Token-Based Strategy
136
+ * Splits documents based on estimated token count. Best for documents with variable
137
+ * content density (e.g., mixed text and images, tables, charts).
138
+ *
139
+ * ### 3. Fixed-Pages Strategy (Legacy)
140
+ * Simple page-based splitting. Fast but may exceed token limits for dense documents.
141
+ * Use only for documents with uniform content density.
142
+ *
143
+ * ## Processing Modes
144
+ *
145
+ * - **parallel**: Process multiple chunks simultaneously (faster, higher cost)
146
+ * - **sequential**: Process chunks one at a time (slower, lower cost)
147
+ *
148
+ * ## Aggregation Strategies
149
+ *
150
+ * - **majority-vote**: Most frequent classification wins (recommended)
151
+ * - **weighted-vote**: Early chunks weighted higher
152
+ * - **first-chunk**: Use first chunk's classification only
153
+ *
154
+ * ## Default Values
155
+ *
156
+ * | Parameter | Default | Description |
157
+ * |-----------|---------|-------------|
158
+ * | strategy | 'hybrid' | Chunking strategy |
159
+ * | pageThreshold | 100 | Pages to trigger chunking |
160
+ * | tokenThreshold | 150000 | Tokens to trigger chunking |
161
+ * | chunkSize | 50 | Pages per chunk (fixed-pages) |
162
+ * | overlapPages | 5 | Overlap pages (fixed-pages) |
163
+ * | maxTokensPerChunk | 100000 | Max tokens per chunk (token-based) |
164
+ * | overlapTokens | 5000 | Overlap tokens (token-based, hybrid) |
165
+ * | targetTokensPerChunk | 80000 | Target tokens per chunk (hybrid) |
166
+ * | maxPagesPerChunk | 99 | Max pages per chunk (hybrid) |
167
+ * | processingMode | 'parallel' | Processing mode |
168
+ * | maxConcurrency | 10 | Max parallel chunks |
169
+ * | aggregationStrategy | 'majority-vote' | Result aggregation |
170
+ * | minSuccessThreshold | 0.5 | Min success rate for valid result |
171
+ */
172
+ export interface ChunkingConfig {
173
+ /**
174
+ * Chunking strategy to use.
175
+ *
176
+ * - **hybrid** (RECOMMENDED): Balances token count and page limits
177
+ * - **token-based**: Respects model token limits, good for variable density
178
+ * - **fixed-pages**: Simple page-based splitting (legacy, not recommended)
179
+ *
180
+ * @default 'hybrid'
181
+ */
182
+ readonly strategy?: ChunkingStrategy;
183
+ /**
184
+ * Threshold for triggering chunking based on page count (fixed-pages and hybrid strategies).
185
+ * @default 100
186
+ */
187
+ readonly pageThreshold?: number;
188
+ /**
189
+ * Number of pages per chunk (fixed-pages strategy).
190
+ * @default 50
191
+ */
192
+ readonly chunkSize?: number;
193
+ /**
194
+ * Number of overlapping pages between chunks (fixed-pages strategy). Must be less than chunkSize.
195
+ * @default 5
196
+ */
197
+ readonly overlapPages?: number;
198
+ /**
199
+ * Threshold for triggering chunking based on token count (token-based and hybrid strategies).
200
+ * @default 150000
201
+ */
202
+ readonly tokenThreshold?: number;
203
+ /**
204
+ * Maximum tokens per chunk (token-based strategy).
205
+ * @default 100000
206
+ */
207
+ readonly maxTokensPerChunk?: number;
208
+ /**
209
+ * Number of overlapping tokens between chunks (token-based and hybrid strategies).
210
+ * @default 5000
211
+ */
212
+ readonly overlapTokens?: number;
213
+ /**
214
+ * Hard limit on pages per chunk (hybrid strategy).
215
+ * Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99
216
+ * to provide a safety margin.
217
+ * @default 99
218
+ */
219
+ readonly maxPagesPerChunk?: number;
220
+ /**
221
+ * Soft target for tokens per chunk (hybrid strategy).
222
+ * @default 80000
223
+ */
224
+ readonly targetTokensPerChunk?: number;
225
+ /**
226
+ * Processing mode for chunks.
227
+ *
228
+ * - **parallel**: Process multiple chunks simultaneously (faster, higher cost)
229
+ * - **sequential**: Process chunks one at a time (slower, lower cost)
230
+ *
231
+ * @default 'parallel'
232
+ */
233
+ readonly processingMode?: ProcessingMode;
234
+ /**
235
+ * Maximum number of chunks to process concurrently (parallel mode only).
236
+ * Higher values increase speed but also cost.
237
+ *
238
+ * @default 10
239
+ */
240
+ readonly maxConcurrency?: number;
241
+ /**
242
+ * Strategy for aggregating results from multiple chunks.
243
+ *
244
+ * - **majority-vote**: Most frequent classification wins
245
+ * - **weighted-vote**: Early chunks weighted higher
246
+ * - **first-chunk**: Use first chunk's classification
247
+ *
248
+ * @default 'majority-vote'
249
+ */
250
+ readonly aggregationStrategy?: AggregationStrategy;
251
+ /**
252
+ * Minimum fraction of chunks (e.g. 0.5 = 50%) that must succeed for aggregation.
253
+ * If fewer chunks succeed, the result is marked as partial failure.
254
+ *
255
+ * @default 0.5 (50%)
256
+ */
257
+ readonly minSuccessThreshold?: number;
258
+ }
259
+ /**
260
+ * Metadata about a single chunk of a document.
261
+ * Contains information about the chunk's position, size, and S3 location.
262
+ */
263
+ export interface ChunkMetadata {
264
+ /**
265
+ * Unique identifier for this chunk.
266
+ * Format: {documentId}_chunk_{index}
267
+ */
268
+ readonly chunkId: string;
269
+ /**
270
+ * Zero-based index of this chunk in the document.
271
+ */
272
+ readonly chunkIndex: number;
273
+ /**
274
+ * Total number of chunks in the document.
275
+ */
276
+ readonly totalChunks: number;
277
+ /**
278
+ * Starting page number (zero-based) of this chunk.
279
+ */
280
+ readonly startPage: number;
281
+ /**
282
+ * Ending page number (zero-based, inclusive) of this chunk.
283
+ */
284
+ readonly endPage: number;
285
+ /**
286
+ * Number of pages in this chunk.
287
+ */
288
+ readonly pageCount: number;
289
+ /**
290
+ * Estimated token count for this chunk.
291
+ * Based on word-count heuristic (1.3 tokens per word).
292
+ */
293
+ readonly estimatedTokens: number;
294
+ /**
295
+ * S3 bucket containing the chunk file.
296
+ */
297
+ readonly bucket: string;
298
+ /**
299
+ * S3 key for the chunk file.
300
+ * Typically in chunks/ prefix.
301
+ */
302
+ readonly key: string;
303
+ }
304
+ /**
305
+ * Document content location information.
306
+ */
307
+ export interface DocumentContent {
308
+ /**
309
+ * Storage location type (e.g., 's3').
310
+ */
311
+ readonly location: string;
312
+ /**
313
+ * S3 bucket containing the document.
314
+ */
315
+ readonly bucket: string;
316
+ /**
317
+ * S3 key for the document.
318
+ */
319
+ readonly key: string;
320
+ /**
321
+ * Original filename of the document.
322
+ */
323
+ readonly filename: string;
324
+ }
325
+ /**
326
+ * Request payload for PDF analysis and chunking Lambda.
327
+ * Contains document information and chunking configuration.
328
+ */
329
+ export interface ChunkingRequest {
330
+ /**
331
+ * Unique identifier for the document.
332
+ */
333
+ readonly documentId: string;
334
+ /**
335
+ * Content type of the document.
336
+ * Typically 'file' for S3-based documents.
337
+ */
338
+ readonly contentType: string;
339
+ /**
340
+ * Document content location information.
341
+ */
342
+ readonly content: DocumentContent;
343
+ /**
344
+ * Optional chunking configuration.
345
+ * If not provided, uses default configuration.
346
+ */
347
+ readonly config?: ChunkingConfig;
348
+ }
349
+ /**
350
+ * Token analysis results from PDF analysis.
351
+ * Provides information about document size and token distribution.
352
+ */
353
+ export interface TokenAnalysis {
354
+ /**
355
+ * Total estimated tokens in the document.
356
+ */
357
+ readonly totalTokens: number;
358
+ /**
359
+ * Total number of pages in the document.
360
+ */
361
+ readonly totalPages: number;
362
+ /**
363
+ * Average tokens per page across the document.
364
+ */
365
+ readonly avgTokensPerPage: number;
366
+ /**
367
+ * Optional detailed token count for each page.
368
+ * Used for token-based and hybrid chunking strategies.
369
+ */
370
+ readonly tokensPerPage?: number[];
371
+ }
372
+ /**
373
+ * Response when chunking is NOT required.
374
+ * Document is below thresholds and will be processed without chunking.
375
+ */
376
+ export interface NoChunkingResponse {
377
+ /**
378
+ * Document identifier.
379
+ */
380
+ readonly documentId: string;
381
+ /**
382
+ * Indicates chunking is not required.
383
+ */
384
+ readonly requiresChunking: false;
385
+ /**
386
+ * Token analysis results.
387
+ */
388
+ readonly tokenAnalysis: TokenAnalysis;
389
+ /**
390
+ * Human-readable reason why chunking was not applied.
391
+ * Example: "Document has 50 pages, below threshold of 100"
392
+ */
393
+ readonly reason: string;
394
+ }
395
+ /**
396
+ * Chunking configuration used for processing.
397
+ * Includes both user-provided and default values.
398
+ */
399
+ export interface ChunkingConfigUsed {
400
+ readonly strategy: ChunkingStrategy;
401
+ readonly totalPages: number;
402
+ readonly totalTokens: number;
403
+ readonly chunkSize?: number;
404
+ readonly overlapPages?: number;
405
+ readonly maxTokensPerChunk?: number;
406
+ readonly overlapTokens?: number;
407
+ readonly targetTokensPerChunk?: number;
408
+ readonly maxPagesPerChunk?: number;
409
+ readonly processingMode?: string;
410
+ }
411
+ /**
412
+ * Response when chunking IS required.
413
+ * Document exceeds thresholds and has been split into chunks.
414
+ */
415
+ export interface ChunkingResponse {
416
+ /**
417
+ * Document identifier.
418
+ */
419
+ readonly documentId: string;
420
+ /**
421
+ * Indicates chunking is required.
422
+ */
423
+ readonly requiresChunking: true;
424
+ /**
425
+ * Token analysis results with detailed per-page information.
426
+ */
427
+ readonly tokenAnalysis: TokenAnalysis;
428
+ /**
429
+ * Strategy used for chunking.
430
+ */
431
+ readonly strategy: ChunkingStrategy;
432
+ /**
433
+ * Array of chunk metadata for all created chunks.
434
+ */
435
+ readonly chunks: ChunkMetadata[];
436
+ /**
437
+ * Configuration used for chunking.
438
+ * Includes both user-provided and default values.
439
+ */
440
+ readonly config: ChunkingConfigUsed;
441
+ }
442
+ /**
443
+ * Union type for chunking Lambda response.
444
+ * Either chunking is required or not.
445
+ */
446
+ export type ChunkingLambdaResponse = NoChunkingResponse | ChunkingResponse;
447
+ /**
448
+ * Classification result for a chunk.
449
+ */
450
+ export interface ChunkClassificationResult {
451
+ readonly documentClassification: string;
452
+ readonly confidence?: number;
453
+ }
454
+ /**
455
+ * Processing result for a chunk.
456
+ */
457
+ export interface ChunkProcessingResult {
458
+ readonly entities: Entity[];
459
+ }
460
+ /**
461
+ * Result from processing a single chunk.
462
+ * Contains classification and extraction results, or error information.
463
+ */
464
+ export interface ChunkResult {
465
+ /**
466
+ * Chunk identifier.
467
+ */
468
+ readonly chunkId: string;
469
+ /**
470
+ * Zero-based chunk index.
471
+ */
472
+ readonly chunkIndex: number;
473
+ /**
474
+ * Optional classification result for this chunk.
475
+ */
476
+ readonly classificationResult?: ChunkClassificationResult;
477
+ /**
478
+ * Optional extraction result for this chunk.
479
+ */
480
+ readonly processingResult?: ChunkProcessingResult;
481
+ /**
482
+ * Error message if chunk processing failed.
483
+ */
484
+ readonly error?: string;
485
+ }
486
+ /**
487
+ * Extracted entity from document processing.
488
+ */
489
+ export interface Entity {
490
+ /**
491
+ * Type of entity (e.g., 'NAME', 'DATE', 'AMOUNT', 'ADDRESS').
492
+ */
493
+ readonly type: string;
494
+ /**
495
+ * Value of the entity.
496
+ */
497
+ readonly value: string;
498
+ /**
499
+ * Optional page number where entity was found.
500
+ * Entities with page numbers are preserved even if duplicated.
501
+ */
502
+ readonly page?: number;
503
+ /**
504
+ * Optional chunk index where entity was found.
505
+ */
506
+ readonly chunkIndex?: number;
507
+ }
508
+ /**
509
+ * Request payload for aggregation Lambda.
510
+ * Contains results from all processed chunks.
511
+ */
512
+ export interface AggregationRequest {
513
+ /**
514
+ * Document identifier.
515
+ */
516
+ readonly documentId: string;
517
+ /**
518
+ * Results from all processed chunks.
519
+ */
520
+ readonly chunkResults: ChunkResult[];
521
+ /**
522
+ * Strategy to use for aggregation.
523
+ * @default 'majority-vote'
524
+ */
525
+ readonly aggregationStrategy?: AggregationStrategy;
526
+ }
527
+ /**
528
+ * Summary of chunk processing results.
529
+ */
530
+ export interface ChunksSummary {
531
+ /**
532
+ * Total number of chunks created.
533
+ */
534
+ readonly totalChunks: number;
535
+ /**
536
+ * Number of chunks that processed successfully.
537
+ */
538
+ readonly successfulChunks: number;
539
+ /**
540
+ * Number of chunks that failed processing.
541
+ */
542
+ readonly failedChunks: number;
543
+ /**
544
+ * Optional total tokens processed across all chunks.
545
+ */
546
+ readonly totalTokensProcessed?: number;
547
+ }
548
+ /**
549
+ * Aggregated result from processing all chunks.
550
+ * Combines classification and extraction results into final output.
551
+ */
552
+ export interface AggregatedResult {
553
+ /**
554
+ * Document identifier.
555
+ */
556
+ readonly documentId: string;
557
+ /**
558
+ * Final document classification (from majority vote or other strategy).
559
+ */
560
+ readonly classification: string;
561
+ /**
562
+ * Confidence score for the classification (0-1).
563
+ * For majority vote: (count of majority / total chunks)
564
+ */
565
+ readonly classificationConfidence: number;
566
+ /**
567
+ * Deduplicated entities from all chunks.
568
+ * Entities without page numbers are deduplicated by (type, value).
569
+ * Entities with page numbers are preserved even if duplicated.
570
+ */
571
+ readonly entities: Entity[];
572
+ /**
573
+ * Summary of chunk processing results.
574
+ */
575
+ readonly chunksSummary: ChunksSummary;
576
+ /**
577
+ * Indicates if result is partial due to chunk failures.
578
+ * True if the fraction of chunks that succeeded is below minSuccessThreshold.
579
+ */
580
+ readonly partialResult: boolean;
581
+ }
582
+ /**
583
+ * Request payload for cleanup Lambda.
584
+ * Contains information about chunks to delete.
585
+ */
586
+ export interface CleanupRequest {
587
+ /**
588
+ * Document identifier.
589
+ */
590
+ readonly documentId: string;
591
+ /**
592
+ * Array of chunk metadata for chunks to delete.
593
+ */
594
+ readonly chunks: ChunkMetadata[];
595
+ }
596
+ /**
597
+ * Response from cleanup Lambda.
598
+ * Reports success and any errors encountered.
599
+ */
600
+ export interface CleanupResponse {
601
+ /**
602
+ * Document identifier.
603
+ */
604
+ readonly documentId: string;
605
+ /**
606
+ * Number of chunks successfully deleted.
607
+ */
608
+ readonly deletedChunks: number;
609
+ /**
610
+ * Array of error messages for failed deletions.
611
+ * Empty if all deletions succeeded.
612
+ */
613
+ readonly errors: string[];
614
+ }
@@ -0,0 +1,5 @@
1
+ "use strict";
2
+ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ Object.defineProperty(exports, "__esModule", { value: true });
5
+ //# sourceMappingURL=data:application/json;base64,
@@ -9,6 +9,6 @@ class DefaultDocumentProcessingConfig {
9
9
  }
10
10
  exports.DefaultDocumentProcessingConfig = DefaultDocumentProcessingConfig;
11
11
  _a = JSII_RTTI_SYMBOL_1;
12
- DefaultDocumentProcessingConfig[_a] = { fqn: "@cdklabs/cdk-appmod-catalog-blueprints.DefaultDocumentProcessingConfig", version: "1.5.0" };
12
+ DefaultDocumentProcessingConfig[_a] = { fqn: "@cdklabs/cdk-appmod-catalog-blueprints.DefaultDocumentProcessingConfig", version: "1.6.0" };
13
13
  DefaultDocumentProcessingConfig.DEFAULT_OBSERVABILITY_METRIC_SVC_NAME = 'document-processing';
14
14
  //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZGVmYXVsdC1kb2N1bWVudC1wcm9jZXNzaW5nLWNvbmZpZy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3VzZS1jYXNlcy9kb2N1bWVudC1wcm9jZXNzaW5nL2RlZmF1bHQtZG9jdW1lbnQtcHJvY2Vzc2luZy1jb25maWcudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7QUFBQSxxRUFBcUU7QUFDckUsc0NBQXNDO0FBRXRDLE1BQWEsK0JBQStCOztBQUE1QywwRUFFQzs7O0FBRHdCLHFFQUFxQyxHQUFHLHFCQUFxQixDQUFDIiwic291cmNlc0NvbnRlbnQiOlsiLy8gQ29weXJpZ2h0IEFtYXpvbi5jb20sIEluYy4gb3IgaXRzIGFmZmlsaWF0ZXMuIEFsbCBSaWdodHMgUmVzZXJ2ZWQuXG4vLyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMFxuXG5leHBvcnQgY2xhc3MgRGVmYXVsdERvY3VtZW50UHJvY2Vzc2luZ0NvbmZpZyB7XG4gIHB1YmxpYyBzdGF0aWMgcmVhZG9ubHkgREVGQVVMVF9PQlNFUlZBQklMSVRZX01FVFJJQ19TVkNfTkFNRSA9ICdkb2N1bWVudC1wcm9jZXNzaW5nJztcbn0iXX0=
@@ -3,3 +3,4 @@ export * from './bedrock-document-processing';
3
3
  export * from './agentic-document-processing';
4
4
  export * from './adapter';
5
5
  export * from './default-document-processing-config';
6
+ export * from './chunking-config';