@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +2537 -204
- package/lib/document-processing/adapter/adapter.d.ts +4 -2
- package/lib/document-processing/adapter/adapter.js +1 -1
- package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
- package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
- package/lib/document-processing/agentic-document-processing.d.ts +4 -0
- package/lib/document-processing/agentic-document-processing.js +20 -10
- package/lib/document-processing/base-document-processing.d.ts +54 -2
- package/lib/document-processing/base-document-processing.js +136 -82
- package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
- package/lib/document-processing/bedrock-document-processing.js +717 -77
- package/lib/document-processing/chunking-config.d.ts +614 -0
- package/lib/document-processing/chunking-config.js +5 -0
- package/lib/document-processing/default-document-processing-config.js +1 -1
- package/lib/document-processing/index.d.ts +1 -0
- package/lib/document-processing/index.js +2 -1
- package/lib/document-processing/resources/aggregation/handler.py +567 -0
- package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
- package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
- package/lib/document-processing/resources/cleanup/handler.py +276 -0
- package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
- package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
- package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
- package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
- package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
- package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
- package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
- package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
- package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
- package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
- package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
- package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
- package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
- package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
- package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
- package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
- package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
- package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
- package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
- package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
- package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
- package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
- package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
- package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
- package/lib/document-processing/tests/base-document-processing.test.js +114 -8
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
- package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
- package/lib/document-processing/tests/chunking-config.test.js +238 -0
- package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
- package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
- package/lib/framework/agents/base-agent.js +1 -1
- package/lib/framework/agents/batch-agent.js +1 -1
- package/lib/framework/agents/default-agent-config.js +1 -1
- package/lib/framework/bedrock/bedrock.js +1 -1
- package/lib/framework/custom-resource/default-runtimes.js +1 -1
- package/lib/framework/foundation/access-log.js +1 -1
- package/lib/framework/foundation/eventbridge-broker.js +1 -1
- package/lib/framework/foundation/network.js +1 -1
- package/lib/framework/tests/access-log.test.js +5 -2
- package/lib/framework/tests/batch-agent.test.js +5 -2
- package/lib/framework/tests/bedrock.test.js +5 -2
- package/lib/framework/tests/eventbridge-broker.test.js +5 -2
- package/lib/framework/tests/framework-nag.test.js +16 -8
- package/lib/framework/tests/network.test.js +9 -4
- package/lib/tsconfig.tsbuildinfo +1 -1
- package/lib/utilities/data-loader.js +1 -1
- package/lib/utilities/lambda-iam-utils.js +1 -1
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
- package/lib/utilities/observability/default-observability-config.js +1 -1
- package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
- package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
- package/lib/utilities/observability/powertools-config.d.ts +10 -1
- package/lib/utilities/observability/powertools-config.js +19 -3
- package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
- package/lib/utilities/test-utils.d.ts +43 -0
- package/lib/utilities/test-utils.js +56 -0
- package/lib/utilities/tests/data-loader-nag.test.js +3 -2
- package/lib/utilities/tests/data-loader.test.js +3 -2
- package/lib/webapp/frontend-construct.js +1 -1
- package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
- package/lib/webapp/tests/frontend-construct.test.js +3 -2
- package/package.json +6 -5
- package/lib/document-processing/resources/default-error-handler/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking strategy options for PDF document processing.
|
|
3
|
+
*
|
|
4
|
+
* Choose the strategy based on your document characteristics:
|
|
5
|
+
*
|
|
6
|
+
* - **fixed-pages**: Legacy approach, splits by fixed page count.
|
|
7
|
+
* - Pros: Fast, simple, predictable chunk sizes
|
|
8
|
+
* - Cons: May exceed token limits for dense documents
|
|
9
|
+
* - Best for: Uniform density documents, simple text
|
|
10
|
+
*
|
|
11
|
+
* - **token-based**: Splits based on token count to respect model limits.
|
|
12
|
+
* - Pros: Respects model token limits, handles variable density
|
|
13
|
+
* - Cons: Slower analysis, variable chunk sizes
|
|
14
|
+
* - Best for: Variable density documents, technical content
|
|
15
|
+
*
|
|
16
|
+
* - **hybrid**: RECOMMENDED - Balances token count and page limits.
|
|
17
|
+
* - Pros: Best of both worlds, reliable, flexible
|
|
18
|
+
* - Cons: Slightly more complex configuration
|
|
19
|
+
* - Best for: Most documents, general-purpose processing
|
|
20
|
+
*/
|
|
21
|
+
export type ChunkingStrategy = 'fixed-pages' | 'token-based' | 'hybrid';
|
|
22
|
+
/**
|
|
23
|
+
* Processing mode for chunked documents.
|
|
24
|
+
*
|
|
25
|
+
* - **sequential**: Process chunks one at a time (cost-optimized)
|
|
26
|
+
* - **parallel**: Process multiple chunks simultaneously (speed-optimized)
|
|
27
|
+
*/
|
|
28
|
+
export type ProcessingMode = 'sequential' | 'parallel';
|
|
29
|
+
/**
|
|
30
|
+
* Aggregation strategy for combining chunk results.
|
|
31
|
+
*
|
|
32
|
+
* - **majority-vote**: Select most frequent classification across chunks
|
|
33
|
+
* - **weighted-vote**: Weight early chunks higher than later chunks
|
|
34
|
+
* - **first-chunk**: Use classification from first chunk only
|
|
35
|
+
*/
|
|
36
|
+
export type AggregationStrategy = 'majority-vote' | 'weighted-vote' | 'first-chunk';
|
|
37
|
+
/**
|
|
38
|
+
* Configuration for fixed-pages chunking strategy.
|
|
39
|
+
* Splits documents by fixed page count (legacy approach).
|
|
40
|
+
*/
|
|
41
|
+
export interface FixedPagesConfig {
|
|
42
|
+
/**
|
|
43
|
+
* Threshold for triggering chunking based on page count.
|
|
44
|
+
* Documents with pages > threshold will be chunked.
|
|
45
|
+
* @default 100
|
|
46
|
+
*/
|
|
47
|
+
readonly pageThreshold?: number;
|
|
48
|
+
/**
|
|
49
|
+
* Number of pages per chunk.
|
|
50
|
+
* @default 50
|
|
51
|
+
*/
|
|
52
|
+
readonly chunkSize?: number;
|
|
53
|
+
/**
|
|
54
|
+
* Number of overlapping pages between consecutive chunks.
|
|
55
|
+
* Must be less than chunkSize.
|
|
56
|
+
* @default 5
|
|
57
|
+
*/
|
|
58
|
+
readonly overlapPages?: number;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Configuration for token-based chunking strategy.
|
|
62
|
+
* Splits documents based on estimated token count to respect model limits.
|
|
63
|
+
*/
|
|
64
|
+
export interface TokenBasedConfig {
|
|
65
|
+
/**
|
|
66
|
+
* Threshold for triggering chunking based on token count.
|
|
67
|
+
* Documents with tokens > threshold will be chunked.
|
|
68
|
+
* @default 150000
|
|
69
|
+
*/
|
|
70
|
+
readonly tokenThreshold?: number;
|
|
71
|
+
/**
|
|
72
|
+
* Maximum tokens per chunk.
|
|
73
|
+
* Ensures no chunk exceeds model token limits.
|
|
74
|
+
* @default 100000
|
|
75
|
+
*/
|
|
76
|
+
readonly maxTokensPerChunk?: number;
|
|
77
|
+
/**
|
|
78
|
+
* Number of overlapping tokens between consecutive chunks.
|
|
79
|
+
* Provides context continuity across chunks.
|
|
80
|
+
* @default 5000
|
|
81
|
+
*/
|
|
82
|
+
readonly overlapTokens?: number;
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Configuration for hybrid chunking strategy (RECOMMENDED).
|
|
86
|
+
* Balances token count and page limits for optimal chunking.
|
|
87
|
+
*/
|
|
88
|
+
export interface HybridConfig {
|
|
89
|
+
/**
|
|
90
|
+
* Hard limit on pages per chunk.
|
|
91
|
+
* Prevents very large chunks even if token count is low.
|
|
92
|
+
* Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99
|
|
93
|
+
* to provide a safety margin.
|
|
94
|
+
* @default 99
|
|
95
|
+
*/
|
|
96
|
+
readonly maxPagesPerChunk?: number;
|
|
97
|
+
/**
|
|
98
|
+
* Soft target for tokens per chunk.
|
|
99
|
+
* Chunks aim for this token count but respect maxPagesPerChunk.
|
|
100
|
+
* @default 80000
|
|
101
|
+
*/
|
|
102
|
+
readonly targetTokensPerChunk?: number;
|
|
103
|
+
/**
|
|
104
|
+
* Threshold for triggering chunking based on page count.
|
|
105
|
+
* Documents with pages > threshold will be chunked.
|
|
106
|
+
* @default 100
|
|
107
|
+
*/
|
|
108
|
+
readonly pageThreshold?: number;
|
|
109
|
+
/**
|
|
110
|
+
* Threshold for triggering chunking based on token count.
|
|
111
|
+
* Documents with tokens > threshold will be chunked.
|
|
112
|
+
* @default 150000
|
|
113
|
+
*/
|
|
114
|
+
readonly tokenThreshold?: number;
|
|
115
|
+
/**
|
|
116
|
+
* Number of overlapping tokens between consecutive chunks.
|
|
117
|
+
* Provides context continuity across chunks.
|
|
118
|
+
* @default 5000
|
|
119
|
+
*/
|
|
120
|
+
readonly overlapTokens?: number;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Comprehensive configuration for PDF chunking behavior.
|
|
124
|
+
*
|
|
125
|
+
* This interface provides fine-grained control over how large PDF documents are
|
|
126
|
+
* split into manageable chunks for processing. The chunking system supports three
|
|
127
|
+
* strategies, each optimized for different document types and use cases.
|
|
128
|
+
*
|
|
129
|
+
* ## Chunking Strategies
|
|
130
|
+
*
|
|
131
|
+
* ### 1. Hybrid Strategy (RECOMMENDED)
|
|
132
|
+
* Balances both token count and page limits for optimal chunking. Best for most
|
|
133
|
+
* documents as it respects model token limits while preventing excessively large chunks.
|
|
134
|
+
*
|
|
135
|
+
* ### 2. Token-Based Strategy
|
|
136
|
+
* Splits documents based on estimated token count. Best for documents with variable
|
|
137
|
+
* content density (e.g., mixed text and images, tables, charts).
|
|
138
|
+
*
|
|
139
|
+
* ### 3. Fixed-Pages Strategy (Legacy)
|
|
140
|
+
* Simple page-based splitting. Fast but may exceed token limits for dense documents.
|
|
141
|
+
* Use only for documents with uniform content density.
|
|
142
|
+
*
|
|
143
|
+
* ## Processing Modes
|
|
144
|
+
*
|
|
145
|
+
* - **parallel**: Process multiple chunks simultaneously (faster, higher cost)
|
|
146
|
+
* - **sequential**: Process chunks one at a time (slower, lower cost)
|
|
147
|
+
*
|
|
148
|
+
* ## Aggregation Strategies
|
|
149
|
+
*
|
|
150
|
+
* - **majority-vote**: Most frequent classification wins (recommended)
|
|
151
|
+
* - **weighted-vote**: Early chunks weighted higher
|
|
152
|
+
* - **first-chunk**: Use first chunk's classification only
|
|
153
|
+
*
|
|
154
|
+
* ## Default Values
|
|
155
|
+
*
|
|
156
|
+
* | Parameter | Default | Description |
|
|
157
|
+
* |-----------|---------|-------------|
|
|
158
|
+
* | strategy | 'hybrid' | Chunking strategy |
|
|
159
|
+
* | pageThreshold | 100 | Pages to trigger chunking |
|
|
160
|
+
* | tokenThreshold | 150000 | Tokens to trigger chunking |
|
|
161
|
+
* | chunkSize | 50 | Pages per chunk (fixed-pages) |
|
|
162
|
+
* | overlapPages | 5 | Overlap pages (fixed-pages) |
|
|
163
|
+
* | maxTokensPerChunk | 100000 | Max tokens per chunk (token-based) |
|
|
164
|
+
* | overlapTokens | 5000 | Overlap tokens (token-based, hybrid) |
|
|
165
|
+
* | targetTokensPerChunk | 80000 | Target tokens per chunk (hybrid) |
|
|
166
|
+
* | maxPagesPerChunk | 99 | Max pages per chunk (hybrid) |
|
|
167
|
+
* | processingMode | 'parallel' | Processing mode |
|
|
168
|
+
* | maxConcurrency | 10 | Max parallel chunks |
|
|
169
|
+
* | aggregationStrategy | 'majority-vote' | Result aggregation |
|
|
170
|
+
* | minSuccessThreshold | 0.5 | Min success rate for valid result |
|
|
171
|
+
*/
|
|
172
|
+
export interface ChunkingConfig {
|
|
173
|
+
/**
|
|
174
|
+
* Chunking strategy to use.
|
|
175
|
+
*
|
|
176
|
+
* - **hybrid** (RECOMMENDED): Balances token count and page limits
|
|
177
|
+
* - **token-based**: Respects model token limits, good for variable density
|
|
178
|
+
* - **fixed-pages**: Simple page-based splitting (legacy, not recommended)
|
|
179
|
+
*
|
|
180
|
+
* @default 'hybrid'
|
|
181
|
+
*/
|
|
182
|
+
readonly strategy?: ChunkingStrategy;
|
|
183
|
+
/**
|
|
184
|
+
* Threshold for triggering chunking based on page count (fixed-pages and hybrid strategies).
|
|
185
|
+
* @default 100
|
|
186
|
+
*/
|
|
187
|
+
readonly pageThreshold?: number;
|
|
188
|
+
/**
|
|
189
|
+
* Number of pages per chunk (fixed-pages strategy).
|
|
190
|
+
* @default 50
|
|
191
|
+
*/
|
|
192
|
+
readonly chunkSize?: number;
|
|
193
|
+
/**
|
|
194
|
+
* Number of overlapping pages between chunks (fixed-pages strategy).
|
|
195
|
+
* @default 5
|
|
196
|
+
*/
|
|
197
|
+
readonly overlapPages?: number;
|
|
198
|
+
/**
|
|
199
|
+
* Threshold for triggering chunking based on token count (token-based and hybrid strategies).
|
|
200
|
+
* @default 150000
|
|
201
|
+
*/
|
|
202
|
+
readonly tokenThreshold?: number;
|
|
203
|
+
/**
|
|
204
|
+
* Maximum tokens per chunk (token-based strategy).
|
|
205
|
+
* @default 100000
|
|
206
|
+
*/
|
|
207
|
+
readonly maxTokensPerChunk?: number;
|
|
208
|
+
/**
|
|
209
|
+
* Number of overlapping tokens between chunks (token-based and hybrid strategies).
|
|
210
|
+
* @default 5000
|
|
211
|
+
*/
|
|
212
|
+
readonly overlapTokens?: number;
|
|
213
|
+
/**
|
|
214
|
+
* Hard limit on pages per chunk (hybrid strategy).
|
|
215
|
+
* Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99
|
|
216
|
+
* to provide a safety margin.
|
|
217
|
+
* @default 99
|
|
218
|
+
*/
|
|
219
|
+
readonly maxPagesPerChunk?: number;
|
|
220
|
+
/**
|
|
221
|
+
* Soft target for tokens per chunk (hybrid strategy).
|
|
222
|
+
* @default 80000
|
|
223
|
+
*/
|
|
224
|
+
readonly targetTokensPerChunk?: number;
|
|
225
|
+
/**
|
|
226
|
+
* Processing mode for chunks.
|
|
227
|
+
*
|
|
228
|
+
* - **parallel**: Process multiple chunks simultaneously (faster, higher cost)
|
|
229
|
+
* - **sequential**: Process chunks one at a time (slower, lower cost)
|
|
230
|
+
*
|
|
231
|
+
* @default 'parallel'
|
|
232
|
+
*/
|
|
233
|
+
readonly processingMode?: ProcessingMode;
|
|
234
|
+
/**
|
|
235
|
+
* Maximum number of chunks to process concurrently (parallel mode only).
|
|
236
|
+
* Higher values increase speed but also cost.
|
|
237
|
+
*
|
|
238
|
+
* @default 10
|
|
239
|
+
*/
|
|
240
|
+
readonly maxConcurrency?: number;
|
|
241
|
+
/**
|
|
242
|
+
* Strategy for aggregating results from multiple chunks.
|
|
243
|
+
*
|
|
244
|
+
* - **majority-vote**: Most frequent classification wins
|
|
245
|
+
* - **weighted-vote**: Early chunks weighted higher
|
|
246
|
+
* - **first-chunk**: Use first chunk's classification
|
|
247
|
+
*
|
|
248
|
+
* @default 'majority-vote'
|
|
249
|
+
*/
|
|
250
|
+
readonly aggregationStrategy?: AggregationStrategy;
|
|
251
|
+
/**
|
|
252
|
+
* Minimum fraction of chunks (0-1, e.g. 0.5 = 50%) that must succeed for aggregation.
|
|
253
|
+
* If fewer chunks succeed, the result is marked as partial failure.
|
|
254
|
+
*
|
|
255
|
+
* @default 0.5 (50%)
|
|
256
|
+
*/
|
|
257
|
+
readonly minSuccessThreshold?: number;
|
|
258
|
+
}
|
|
259
|
+
/**
|
|
260
|
+
* Metadata about a single chunk of a document.
|
|
261
|
+
* Contains information about the chunk's position, size, and S3 location.
|
|
262
|
+
*/
|
|
263
|
+
export interface ChunkMetadata {
|
|
264
|
+
/**
|
|
265
|
+
* Unique identifier for this chunk.
|
|
266
|
+
* Format: {documentId}_chunk_{index}
|
|
267
|
+
*/
|
|
268
|
+
readonly chunkId: string;
|
|
269
|
+
/**
|
|
270
|
+
* Zero-based index of this chunk in the document.
|
|
271
|
+
*/
|
|
272
|
+
readonly chunkIndex: number;
|
|
273
|
+
/**
|
|
274
|
+
* Total number of chunks in the document.
|
|
275
|
+
*/
|
|
276
|
+
readonly totalChunks: number;
|
|
277
|
+
/**
|
|
278
|
+
* Starting page number (zero-based) of this chunk.
|
|
279
|
+
*/
|
|
280
|
+
readonly startPage: number;
|
|
281
|
+
/**
|
|
282
|
+
* Ending page number (zero-based, inclusive) of this chunk.
|
|
283
|
+
*/
|
|
284
|
+
readonly endPage: number;
|
|
285
|
+
/**
|
|
286
|
+
* Number of pages in this chunk.
|
|
287
|
+
*/
|
|
288
|
+
readonly pageCount: number;
|
|
289
|
+
/**
|
|
290
|
+
* Estimated token count for this chunk.
|
|
291
|
+
* Based on word-count heuristic (1.3 tokens per word).
|
|
292
|
+
*/
|
|
293
|
+
readonly estimatedTokens: number;
|
|
294
|
+
/**
|
|
295
|
+
* S3 bucket containing the chunk file.
|
|
296
|
+
*/
|
|
297
|
+
readonly bucket: string;
|
|
298
|
+
/**
|
|
299
|
+
* S3 key for the chunk file.
|
|
300
|
+
* Typically in chunks/ prefix.
|
|
301
|
+
*/
|
|
302
|
+
readonly key: string;
|
|
303
|
+
}
|
|
304
|
+
/**
|
|
305
|
+
* Document content location information.
|
|
306
|
+
*/
|
|
307
|
+
export interface DocumentContent {
|
|
308
|
+
/**
|
|
309
|
+
* Storage location type (e.g., 's3').
|
|
310
|
+
*/
|
|
311
|
+
readonly location: string;
|
|
312
|
+
/**
|
|
313
|
+
* S3 bucket containing the document.
|
|
314
|
+
*/
|
|
315
|
+
readonly bucket: string;
|
|
316
|
+
/**
|
|
317
|
+
* S3 key for the document.
|
|
318
|
+
*/
|
|
319
|
+
readonly key: string;
|
|
320
|
+
/**
|
|
321
|
+
* Original filename of the document.
|
|
322
|
+
*/
|
|
323
|
+
readonly filename: string;
|
|
324
|
+
}
|
|
325
|
+
/**
|
|
326
|
+
* Request payload for PDF analysis and chunking Lambda.
|
|
327
|
+
* Contains document information and chunking configuration.
|
|
328
|
+
*/
|
|
329
|
+
export interface ChunkingRequest {
|
|
330
|
+
/**
|
|
331
|
+
* Unique identifier for the document.
|
|
332
|
+
*/
|
|
333
|
+
readonly documentId: string;
|
|
334
|
+
/**
|
|
335
|
+
* Content type of the document.
|
|
336
|
+
* Typically 'file' for S3-based documents.
|
|
337
|
+
*/
|
|
338
|
+
readonly contentType: string;
|
|
339
|
+
/**
|
|
340
|
+
* Document content location information.
|
|
341
|
+
*/
|
|
342
|
+
readonly content: DocumentContent;
|
|
343
|
+
/**
|
|
344
|
+
* Optional chunking configuration.
|
|
345
|
+
* If not provided, uses default configuration.
|
|
346
|
+
*/
|
|
347
|
+
readonly config?: ChunkingConfig;
|
|
348
|
+
}
|
|
349
|
+
/**
|
|
350
|
+
* Token analysis results from PDF analysis.
|
|
351
|
+
* Provides information about document size and token distribution.
|
|
352
|
+
*/
|
|
353
|
+
export interface TokenAnalysis {
|
|
354
|
+
/**
|
|
355
|
+
* Total estimated tokens in the document.
|
|
356
|
+
*/
|
|
357
|
+
readonly totalTokens: number;
|
|
358
|
+
/**
|
|
359
|
+
* Total number of pages in the document.
|
|
360
|
+
*/
|
|
361
|
+
readonly totalPages: number;
|
|
362
|
+
/**
|
|
363
|
+
* Average tokens per page across the document.
|
|
364
|
+
*/
|
|
365
|
+
readonly avgTokensPerPage: number;
|
|
366
|
+
/**
|
|
367
|
+
* Optional detailed token count for each page.
|
|
368
|
+
* Used for token-based and hybrid chunking strategies.
|
|
369
|
+
*/
|
|
370
|
+
readonly tokensPerPage?: number[];
|
|
371
|
+
}
|
|
372
|
+
/**
|
|
373
|
+
* Response when chunking is NOT required.
|
|
374
|
+
* Document is below thresholds and will be processed without chunking.
|
|
375
|
+
*/
|
|
376
|
+
export interface NoChunkingResponse {
|
|
377
|
+
/**
|
|
378
|
+
* Document identifier.
|
|
379
|
+
*/
|
|
380
|
+
readonly documentId: string;
|
|
381
|
+
/**
|
|
382
|
+
* Indicates chunking is not required.
|
|
383
|
+
*/
|
|
384
|
+
readonly requiresChunking: false;
|
|
385
|
+
/**
|
|
386
|
+
* Token analysis results.
|
|
387
|
+
*/
|
|
388
|
+
readonly tokenAnalysis: TokenAnalysis;
|
|
389
|
+
/**
|
|
390
|
+
* Human-readable reason why chunking was not applied.
|
|
391
|
+
* Example: "Document has 50 pages, below threshold of 100"
|
|
392
|
+
*/
|
|
393
|
+
readonly reason: string;
|
|
394
|
+
}
|
|
395
|
+
/**
|
|
396
|
+
* Chunking configuration used for processing.
|
|
397
|
+
* Includes both user-provided and default values.
|
|
398
|
+
*/
|
|
399
|
+
export interface ChunkingConfigUsed {
|
|
400
|
+
readonly strategy: ChunkingStrategy;
|
|
401
|
+
readonly totalPages: number;
|
|
402
|
+
readonly totalTokens: number;
|
|
403
|
+
readonly chunkSize?: number;
|
|
404
|
+
readonly overlapPages?: number;
|
|
405
|
+
readonly maxTokensPerChunk?: number;
|
|
406
|
+
readonly overlapTokens?: number;
|
|
407
|
+
readonly targetTokensPerChunk?: number;
|
|
408
|
+
readonly maxPagesPerChunk?: number;
|
|
409
|
+
readonly processingMode?: string;
|
|
410
|
+
}
|
|
411
|
+
/**
|
|
412
|
+
* Response when chunking IS required.
|
|
413
|
+
* Document exceeds thresholds and has been split into chunks.
|
|
414
|
+
*/
|
|
415
|
+
export interface ChunkingResponse {
|
|
416
|
+
/**
|
|
417
|
+
* Document identifier.
|
|
418
|
+
*/
|
|
419
|
+
readonly documentId: string;
|
|
420
|
+
/**
|
|
421
|
+
* Indicates chunking is required.
|
|
422
|
+
*/
|
|
423
|
+
readonly requiresChunking: true;
|
|
424
|
+
/**
|
|
425
|
+
* Token analysis results with detailed per-page information.
|
|
426
|
+
*/
|
|
427
|
+
readonly tokenAnalysis: TokenAnalysis;
|
|
428
|
+
/**
|
|
429
|
+
* Strategy used for chunking.
|
|
430
|
+
*/
|
|
431
|
+
readonly strategy: ChunkingStrategy;
|
|
432
|
+
/**
|
|
433
|
+
* Array of chunk metadata for all created chunks.
|
|
434
|
+
*/
|
|
435
|
+
readonly chunks: ChunkMetadata[];
|
|
436
|
+
/**
|
|
437
|
+
* Configuration used for chunking.
|
|
438
|
+
* Includes both user-provided and default values.
|
|
439
|
+
*/
|
|
440
|
+
readonly config: ChunkingConfigUsed;
|
|
441
|
+
}
|
|
442
|
+
/**
|
|
443
|
+
* Union type for chunking Lambda response.
|
|
444
|
+
* Either chunking is required or not.
|
|
445
|
+
*/
|
|
446
|
+
export type ChunkingLambdaResponse = NoChunkingResponse | ChunkingResponse;
|
|
447
|
+
/**
|
|
448
|
+
* Classification result for a chunk.
|
|
449
|
+
*/
|
|
450
|
+
export interface ChunkClassificationResult {
|
|
451
|
+
readonly documentClassification: string;
|
|
452
|
+
readonly confidence?: number;
|
|
453
|
+
}
|
|
454
|
+
/**
|
|
455
|
+
* Processing result for a chunk.
|
|
456
|
+
*/
|
|
457
|
+
export interface ChunkProcessingResult {
|
|
458
|
+
readonly entities: Entity[];
|
|
459
|
+
}
|
|
460
|
+
/**
|
|
461
|
+
* Result from processing a single chunk.
|
|
462
|
+
* Contains classification and extraction results, or error information.
|
|
463
|
+
*/
|
|
464
|
+
export interface ChunkResult {
|
|
465
|
+
/**
|
|
466
|
+
* Chunk identifier.
|
|
467
|
+
*/
|
|
468
|
+
readonly chunkId: string;
|
|
469
|
+
/**
|
|
470
|
+
* Zero-based chunk index.
|
|
471
|
+
*/
|
|
472
|
+
readonly chunkIndex: number;
|
|
473
|
+
/**
|
|
474
|
+
* Optional classification result for this chunk.
|
|
475
|
+
*/
|
|
476
|
+
readonly classificationResult?: ChunkClassificationResult;
|
|
477
|
+
/**
|
|
478
|
+
* Optional extraction result for this chunk.
|
|
479
|
+
*/
|
|
480
|
+
readonly processingResult?: ChunkProcessingResult;
|
|
481
|
+
/**
|
|
482
|
+
* Error message if chunk processing failed.
|
|
483
|
+
*/
|
|
484
|
+
readonly error?: string;
|
|
485
|
+
}
|
|
486
|
+
/**
|
|
487
|
+
* Extracted entity from document processing.
|
|
488
|
+
*/
|
|
489
|
+
export interface Entity {
|
|
490
|
+
/**
|
|
491
|
+
* Type of entity (e.g., 'NAME', 'DATE', 'AMOUNT', 'ADDRESS').
|
|
492
|
+
*/
|
|
493
|
+
readonly type: string;
|
|
494
|
+
/**
|
|
495
|
+
* Value of the entity.
|
|
496
|
+
*/
|
|
497
|
+
readonly value: string;
|
|
498
|
+
/**
|
|
499
|
+
* Optional page number where entity was found.
|
|
500
|
+
* Entities with page numbers are preserved even if duplicated.
|
|
501
|
+
*/
|
|
502
|
+
readonly page?: number;
|
|
503
|
+
/**
|
|
504
|
+
* Optional chunk index where entity was found.
|
|
505
|
+
*/
|
|
506
|
+
readonly chunkIndex?: number;
|
|
507
|
+
}
|
|
508
|
+
/**
|
|
509
|
+
* Request payload for aggregation Lambda.
|
|
510
|
+
* Contains results from all processed chunks.
|
|
511
|
+
*/
|
|
512
|
+
export interface AggregationRequest {
|
|
513
|
+
/**
|
|
514
|
+
* Document identifier.
|
|
515
|
+
*/
|
|
516
|
+
readonly documentId: string;
|
|
517
|
+
/**
|
|
518
|
+
* Results from all processed chunks.
|
|
519
|
+
*/
|
|
520
|
+
readonly chunkResults: ChunkResult[];
|
|
521
|
+
/**
|
|
522
|
+
* Strategy to use for aggregation.
|
|
523
|
+
* @default 'majority-vote'
|
|
524
|
+
*/
|
|
525
|
+
readonly aggregationStrategy?: AggregationStrategy;
|
|
526
|
+
}
|
|
527
|
+
/**
|
|
528
|
+
* Summary of chunk processing results.
|
|
529
|
+
*/
|
|
530
|
+
export interface ChunksSummary {
|
|
531
|
+
/**
|
|
532
|
+
* Total number of chunks created.
|
|
533
|
+
*/
|
|
534
|
+
readonly totalChunks: number;
|
|
535
|
+
/**
|
|
536
|
+
* Number of chunks that processed successfully.
|
|
537
|
+
*/
|
|
538
|
+
readonly successfulChunks: number;
|
|
539
|
+
/**
|
|
540
|
+
* Number of chunks that failed processing.
|
|
541
|
+
*/
|
|
542
|
+
readonly failedChunks: number;
|
|
543
|
+
/**
|
|
544
|
+
* Optional total tokens processed across all chunks.
|
|
545
|
+
*/
|
|
546
|
+
readonly totalTokensProcessed?: number;
|
|
547
|
+
}
|
|
548
|
+
/**
|
|
549
|
+
* Aggregated result from processing all chunks.
|
|
550
|
+
* Combines classification and extraction results into final output.
|
|
551
|
+
*/
|
|
552
|
+
export interface AggregatedResult {
|
|
553
|
+
/**
|
|
554
|
+
* Document identifier.
|
|
555
|
+
*/
|
|
556
|
+
readonly documentId: string;
|
|
557
|
+
/**
|
|
558
|
+
* Final document classification (from majority vote or other strategy).
|
|
559
|
+
*/
|
|
560
|
+
readonly classification: string;
|
|
561
|
+
/**
|
|
562
|
+
* Confidence score for the classification (0-1).
|
|
563
|
+
* For majority vote: (count of majority / total chunks)
|
|
564
|
+
*/
|
|
565
|
+
readonly classificationConfidence: number;
|
|
566
|
+
/**
|
|
567
|
+
* Deduplicated entities from all chunks.
|
|
568
|
+
* Entities without page numbers are deduplicated by (type, value).
|
|
569
|
+
* Entities with page numbers are preserved even if duplicated.
|
|
570
|
+
*/
|
|
571
|
+
readonly entities: Entity[];
|
|
572
|
+
/**
|
|
573
|
+
* Summary of chunk processing results.
|
|
574
|
+
*/
|
|
575
|
+
readonly chunksSummary: ChunksSummary;
|
|
576
|
+
/**
|
|
577
|
+
* Indicates if result is partial due to chunk failures.
|
|
578
|
+
* True if the fraction of chunks that succeeded is below minSuccessThreshold.
|
|
579
|
+
*/
|
|
580
|
+
readonly partialResult: boolean;
|
|
581
|
+
}
|
|
582
|
+
/**
|
|
583
|
+
* Request payload for cleanup Lambda.
|
|
584
|
+
* Contains information about chunks to delete.
|
|
585
|
+
*/
|
|
586
|
+
export interface CleanupRequest {
|
|
587
|
+
/**
|
|
588
|
+
* Document identifier.
|
|
589
|
+
*/
|
|
590
|
+
readonly documentId: string;
|
|
591
|
+
/**
|
|
592
|
+
* Array of chunk metadata for chunks to delete.
|
|
593
|
+
*/
|
|
594
|
+
readonly chunks: ChunkMetadata[];
|
|
595
|
+
}
|
|
596
|
+
/**
|
|
597
|
+
* Response from cleanup Lambda.
|
|
598
|
+
* Reports success and any errors encountered.
|
|
599
|
+
*/
|
|
600
|
+
export interface CleanupResponse {
|
|
601
|
+
/**
|
|
602
|
+
* Document identifier.
|
|
603
|
+
*/
|
|
604
|
+
readonly documentId: string;
|
|
605
|
+
/**
|
|
606
|
+
* Number of chunks successfully deleted.
|
|
607
|
+
*/
|
|
608
|
+
readonly deletedChunks: number;
|
|
609
|
+
/**
|
|
610
|
+
* Array of error messages for failed deletions.
|
|
611
|
+
* Empty if all deletions succeeded.
|
|
612
|
+
*/
|
|
613
|
+
readonly errors: string[];
|
|
614
|
+
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"chunking-config.js","sourceRoot":"","sources":["../../use-cases/document-processing/chunking-config.ts"],"names":[],"mappings":";AAAA,qEAAqE;AACrE,sCAAsC","sourcesContent":["// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Chunking strategy options for PDF document processing.\n *\n * Choose the strategy based on your document characteristics:\n *\n * - **fixed-pages**: Legacy approach, splits by fixed page count.\n *   - Pros: Fast, simple, predictable chunk sizes\n *   - Cons: May exceed token limits for dense documents\n *   - Best for: Uniform density documents, simple text\n *\n * - **token-based**: Splits based on token count to respect model limits.\n *   - Pros: Respects model token limits, handles variable density\n *   - Cons: Slower analysis, variable chunk sizes\n *   - Best for: Variable density documents, technical content\n *\n * - **hybrid**: RECOMMENDED - Balances token count and page limits.\n *   - Pros: Best of both worlds, reliable, flexible\n *   - Cons: Slightly more complex configuration\n *   - Best for: Most documents, general-purpose processing\n */\nexport type ChunkingStrategy = 'fixed-pages' | 'token-based' | 'hybrid';\n\n/**\n * Processing mode for chunked documents.\n *\n * - **sequential**: Process chunks one at a time (cost-optimized)\n * - **parallel**: Process multiple chunks simultaneously (speed-optimized)\n */\nexport type ProcessingMode = 'sequential' | 'parallel';\n\n/**\n * Aggregation strategy for combining chunk results.\n *\n * - **majority-vote**: Select most frequent classification across chunks\n * - **weighted-vote**: Weight early chunks higher than later chunks\n * - **first-chunk**: Use classification from first chunk only\n */\nexport type AggregationStrategy = 'majority-vote' | 'weighted-vote' | 'first-chunk';\n\n/**\n * Configuration for fixed-pages chunking strategy.\n * 
Splits documents by fixed page count (legacy approach).\n */\nexport interface FixedPagesConfig {\n  /**\n   * Threshold for triggering chunking based on page count.\n   * Documents with pages > threshold will be chunked.\n   * @default 100\n   */\n  readonly pageThreshold?: number;\n\n  /**\n   * Number of pages per chunk.\n   * @default 50\n   */\n  readonly chunkSize?: number;\n\n  /**\n   * Number of overlapping pages between consecutive chunks.\n   * Must be less than chunkSize.\n   * @default 5\n   */\n  readonly overlapPages?: number;\n}\n\n/**\n * Configuration for token-based chunking strategy.\n * Splits documents based on estimated token count to respect model limits.\n */\nexport interface TokenBasedConfig {\n  /**\n   * Threshold for triggering chunking based on token count.\n   * Documents with tokens > threshold will be chunked.\n   * @default 150000\n   */\n  readonly tokenThreshold?: number;\n\n  /**\n   * Maximum tokens per chunk.\n   * Ensures no chunk exceeds model token limits.\n   * @default 100000\n   */\n  readonly maxTokensPerChunk?: number;\n\n  /**\n   * Number of overlapping tokens between consecutive chunks.\n   * Provides context continuity across chunks.\n   * @default 5000\n   */\n  readonly overlapTokens?: number;\n}\n\n/**\n * Configuration for hybrid chunking strategy (RECOMMENDED).\n * Balances token count and page limits for optimal chunking.\n */\nexport interface HybridConfig {\n  /**\n   * Hard limit on pages per chunk.\n   * Prevents very large chunks even if token count is low.\n   * Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99\n   * to provide a safety margin.\n   * @default 99\n   */\n  readonly maxPagesPerChunk?: number;\n\n  /**\n   * Soft target for tokens per chunk.\n   * Chunks aim for this token count but respect maxPagesPerChunk.\n   * @default 80000\n   */\n  readonly targetTokensPerChunk?: number;\n\n  /**\n   * Threshold for triggering chunking based on page count.\n   * Documents 
with pages > threshold will be chunked.\n   * @default 100\n   */\n  readonly pageThreshold?: number;\n\n  /**\n   * Threshold for triggering chunking based on token count.\n   * Documents with tokens > threshold will be chunked.\n   * @default 150000\n   */\n  readonly tokenThreshold?: number;\n\n  /**\n   * Number of overlapping tokens between consecutive chunks.\n   * Provides context continuity across chunks.\n   * @default 5000\n   */\n  readonly overlapTokens?: number;\n}\n\n/**\n * Comprehensive configuration for PDF chunking behavior.\n *\n * This interface provides fine-grained control over how large PDF documents are\n * split into manageable chunks for processing. The chunking system supports three\n * strategies, each optimized for different document types and use cases.\n *\n * ## Chunking Strategies\n *\n * ### 1. Hybrid Strategy (RECOMMENDED)\n * Balances both token count and page limits for optimal chunking. Best for most\n * documents as it respects model token limits while preventing excessively large chunks.\n *\n * ### 2. Token-Based Strategy\n * Splits documents based on estimated token count. Best for documents with variable\n * content density (e.g., mixed text and images, tables, charts).\n *\n * ### 3. Fixed-Pages Strategy (Legacy)\n * Simple page-based splitting. 
Fast but may exceed token limits for dense documents.\n * Use only for documents with uniform content density.\n *\n * ## Processing Modes\n *\n * - **parallel**: Process multiple chunks simultaneously (faster, higher cost)\n * - **sequential**: Process chunks one at a time (slower, lower cost)\n *\n * ## Aggregation Strategies\n *\n * - **majority-vote**: Most frequent classification wins (recommended)\n * - **weighted-vote**: Early chunks weighted higher\n * - **first-chunk**: Use first chunk's classification only\n *\n * ## Default Values\n *\n * | Parameter | Default | Description |\n * |-----------|---------|-------------|\n * | strategy | 'hybrid' | Chunking strategy |\n * | pageThreshold | 100 | Pages to trigger chunking |\n * | tokenThreshold | 150000 | Tokens to trigger chunking |\n * | chunkSize | 50 | Pages per chunk (fixed-pages) |\n * | overlapPages | 5 | Overlap pages (fixed-pages) |\n * | maxTokensPerChunk | 100000 | Max tokens per chunk (token-based) |\n * | overlapTokens | 5000 | Overlap tokens (token-based, hybrid) |\n * | targetTokensPerChunk | 80000 | Target tokens per chunk (hybrid) |\n * | maxPagesPerChunk | 99 | Max pages per chunk (hybrid) |\n * | processingMode | 'parallel' | Processing mode |\n * | maxConcurrency | 10 | Max parallel chunks |\n * | aggregationStrategy | 'majority-vote' | Result aggregation |\n * | minSuccessThreshold | 0.5 | Min success rate for valid result |\n */\nexport interface ChunkingConfig {\n  /**\n   * Chunking strategy to use.\n   *\n   * - **hybrid** (RECOMMENDED): Balances token count and page limits\n   * - **token-based**: Respects model token limits, good for variable density\n   * - **fixed-pages**: Simple page-based splitting (legacy, not recommended)\n   *\n   * @default 'hybrid'\n   */\n  readonly strategy?: ChunkingStrategy;\n\n  // Fixed-pages strategy configuration\n  /**\n   * Threshold for triggering chunking based on page count (fixed-pages strategy).\n   * @default 100\n   */\n  readonly 
pageThreshold?: number;\n\n  /**\n   * Number of pages per chunk (fixed-pages strategy).\n   * @default 50\n   */\n  readonly chunkSize?: number;\n\n  /**\n   * Number of overlapping pages between chunks (fixed-pages strategy).\n   * @default 5\n   */\n  readonly overlapPages?: number;\n\n  // Token-based strategy configuration\n  /**\n   * Threshold for triggering chunking based on token count (token-based strategy).\n   * @default 150000\n   */\n  readonly tokenThreshold?: number;\n\n  /**\n   * Maximum tokens per chunk (token-based strategy).\n   * @default 100000\n   */\n  readonly maxTokensPerChunk?: number;\n\n  /**\n   * Number of overlapping tokens between chunks (token-based and hybrid strategies).\n   * @default 5000\n   */\n  readonly overlapTokens?: number;\n\n  // Hybrid strategy configuration\n  /**\n   * Hard limit on pages per chunk (hybrid strategy).\n   * Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99\n   * to provide a safety margin.\n   * @default 99\n   */\n  readonly maxPagesPerChunk?: number;\n\n  /**\n   * Soft target for tokens per chunk (hybrid strategy).\n   * @default 80000\n   */\n  readonly targetTokensPerChunk?: number;\n\n  // Common settings\n  /**\n   * Processing mode for chunks.\n   *\n   * - **parallel**: Process multiple chunks simultaneously (faster, higher cost)\n   * - **sequential**: Process chunks one at a time (slower, lower cost)\n   *\n   * @default 'parallel'\n   */\n  readonly processingMode?: ProcessingMode;\n\n  /**\n   * Maximum number of chunks to process concurrently (parallel mode only).\n   * Higher values increase speed but also cost.\n   *\n   * @default 10\n   */\n  readonly maxConcurrency?: number;\n\n  /**\n   * Strategy for aggregating results from multiple chunks.\n   *\n   * - **majority-vote**: Most frequent classification wins\n   * - **weighted-vote**: Early chunks weighted higher\n   * - **first-chunk**: Use first chunk's classification\n   *\n   * @default 
'majority-vote'\n   */\n  readonly aggregationStrategy?: AggregationStrategy;\n\n  /**\n   * Minimum percentage of chunks that must succeed for aggregation.\n   * If fewer chunks succeed, the result is marked as partial failure.\n   *\n   * @default 0.5 (50%)\n   */\n  readonly minSuccessThreshold?: number;\n}\n\n/**\n * Metadata about a single chunk of a document.\n * Contains information about the chunk's position, size, and S3 location.\n */\nexport interface ChunkMetadata {\n  /**\n   * Unique identifier for this chunk.\n   * Format: {documentId}_chunk_{index}\n   */\n  readonly chunkId: string;\n\n  /**\n   * Zero-based index of this chunk in the document.\n   */\n  readonly chunkIndex: number;\n\n  /**\n   * Total number of chunks in the document.\n   */\n  readonly totalChunks: number;\n\n  /**\n   * Starting page number (zero-based) of this chunk.\n   */\n  readonly startPage: number;\n\n  /**\n   * Ending page number (zero-based, inclusive) of this chunk.\n   */\n  readonly endPage: number;\n\n  /**\n   * Number of pages in this chunk.\n   */\n  readonly pageCount: number;\n\n  /**\n   * Estimated token count for this chunk.\n   * Based on word-count heuristic (1.3 tokens per word).\n   */\n  readonly estimatedTokens: number;\n\n  /**\n   * S3 bucket containing the chunk file.\n   */\n  readonly bucket: string;\n\n  /**\n   * S3 key for the chunk file.\n   * Typically in chunks/ prefix.\n   */\n  readonly key: string;\n}\n\n/**\n * Document content location information.\n */\nexport interface DocumentContent {\n  /**\n   * Storage location type (e.g., 's3').\n   */\n  readonly location: string;\n\n  /**\n   * S3 bucket containing the document.\n   */\n  readonly bucket: string;\n\n  /**\n   * S3 key for the document.\n   */\n  readonly key: string;\n\n  /**\n   * Original filename of the document.\n   */\n  readonly filename: string;\n}\n\n/**\n * Request payload for PDF analysis and chunking Lambda.\n * Contains document information and chunking 
configuration.\n */\nexport interface ChunkingRequest {\n  /**\n   * Unique identifier for the document.\n   */\n  readonly documentId: string;\n\n  /**\n   * Content type of the document.\n   * Typically 'file' for S3-based documents.\n   */\n  readonly contentType: string;\n\n  /**\n   * Document content location information.\n   */\n  readonly content: DocumentContent;\n\n  /**\n   * Optional chunking configuration.\n   * If not provided, uses default configuration.\n   */\n  readonly config?: ChunkingConfig;\n}\n\n/**\n * Token analysis results from PDF analysis.\n * Provides information about document size and token distribution.\n */\nexport interface TokenAnalysis {\n  /**\n   * Total estimated tokens in the document.\n   */\n  readonly totalTokens: number;\n\n  /**\n   * Total number of pages in the document.\n   */\n  readonly totalPages: number;\n\n  /**\n   * Average tokens per page across the document.\n   */\n  readonly avgTokensPerPage: number;\n\n  /**\n   * Optional detailed token count for each page.\n   * Used for token-based and hybrid chunking strategies.\n   */\n  readonly tokensPerPage?: number[];\n}\n\n/**\n * Response when chunking is NOT required.\n * Document is below thresholds and will be processed without chunking.\n */\nexport interface NoChunkingResponse {\n  /**\n   * Document identifier.\n   */\n  readonly documentId: string;\n\n  /**\n   * Indicates chunking is not required.\n   */\n  readonly requiresChunking: false;\n\n  /**\n   * Token analysis results.\n   */\n  readonly tokenAnalysis: TokenAnalysis;\n\n  /**\n   * Human-readable reason why chunking was not applied.\n   * Example: \"Document has 50 pages, below threshold of 100\"\n   */\n  readonly reason: string;\n}\n\n/**\n * Chunking configuration used for processing.\n * Includes both user-provided and default values.\n */\nexport interface ChunkingConfigUsed {\n  readonly strategy: ChunkingStrategy;\n  readonly totalPages: number;\n  readonly totalTokens: number;\n  
readonly chunkSize?: number;\n  readonly overlapPages?: number;\n  readonly maxTokensPerChunk?: number;\n  readonly overlapTokens?: number;\n  readonly targetTokensPerChunk?: number;\n  readonly maxPagesPerChunk?: number;\n  readonly processingMode?: string;\n}\n\n/**\n * Response when chunking IS required.\n * Document exceeds thresholds and has been split into chunks.\n */\nexport interface ChunkingResponse {\n  /**\n   * Document identifier.\n   */\n  readonly documentId: string;\n\n  /**\n   * Indicates chunking is required.\n   */\n  readonly requiresChunking: true;\n\n  /**\n   * Token analysis results with detailed per-page information.\n   */\n  readonly tokenAnalysis: TokenAnalysis;\n\n  /**\n   * Strategy used for chunking.\n   */\n  readonly strategy: ChunkingStrategy;\n\n  /**\n   * Array of chunk metadata for all created chunks.\n   */\n  readonly chunks: ChunkMetadata[];\n\n  /**\n   * Configuration used for chunking.\n   * Includes both user-provided and default values.\n   */\n  readonly config: ChunkingConfigUsed;\n}\n\n/**\n * Union type for chunking Lambda response.\n * Either chunking is required or not.\n */\nexport type ChunkingLambdaResponse = NoChunkingResponse | ChunkingResponse;\n\n/**\n * Classification result for a chunk.\n */\nexport interface ChunkClassificationResult {\n  readonly documentClassification: string;\n  readonly confidence?: number;\n}\n\n/**\n * Processing result for a chunk.\n */\nexport interface ChunkProcessingResult {\n  readonly entities: Entity[];\n}\n\n/**\n * Result from processing a single chunk.\n * Contains classification and extraction results, or error information.\n */\nexport interface ChunkResult {\n  /**\n   * Chunk identifier.\n   */\n  readonly chunkId: string;\n\n  /**\n   * Zero-based chunk index.\n   */\n  readonly chunkIndex: number;\n\n  /**\n   * Optional classification result for this chunk.\n   */\n  readonly classificationResult?: ChunkClassificationResult;\n\n  /**\n   * Optional extraction 
result for this chunk.\n   */\n  readonly processingResult?: ChunkProcessingResult;\n\n  /**\n   * Error message if chunk processing failed.\n   */\n  readonly error?: string;\n}\n\n/**\n * Extracted entity from document processing.\n */\nexport interface Entity {\n  /**\n   * Type of entity (e.g., 'NAME', 'DATE', 'AMOUNT', 'ADDRESS').\n   */\n  readonly type: string;\n\n  /**\n   * Value of the entity.\n   */\n  readonly value: string;\n\n  /**\n   * Optional page number where entity was found.\n   * Entities with page numbers are preserved even if duplicated.\n   */\n  readonly page?: number;\n\n  /**\n   * Optional chunk index where entity was found.\n   */\n  readonly chunkIndex?: number;\n}\n\n/**\n * Request payload for aggregation Lambda.\n * Contains results from all processed chunks.\n */\nexport interface AggregationRequest {\n  /**\n   * Document identifier.\n   */\n  readonly documentId: string;\n\n  /**\n   * Results from all processed chunks.\n   */\n  readonly chunkResults: ChunkResult[];\n\n  /**\n   * Strategy to use for aggregation.\n   * @default 'majority-vote'\n   */\n  readonly aggregationStrategy?: AggregationStrategy;\n}\n\n/**\n * Summary of chunk processing results.\n */\nexport interface ChunksSummary {\n  /**\n   * Total number of chunks created.\n   */\n  readonly totalChunks: number;\n\n  /**\n   * Number of chunks that processed successfully.\n   */\n  readonly successfulChunks: number;\n\n  /**\n   * Number of chunks that failed processing.\n   */\n  readonly failedChunks: number;\n\n  /**\n   * Optional total tokens processed across all chunks.\n   */\n  readonly totalTokensProcessed?: number;\n}\n\n/**\n * Aggregated result from processing all chunks.\n * Combines classification and extraction results into final output.\n */\nexport interface AggregatedResult {\n  /**\n   * Document identifier.\n   */\n  readonly documentId: string;\n\n  /**\n   * Final document classification (from majority vote or other strategy).\n   */\n  
readonly classification: string;\n\n  /**\n   * Confidence score for the classification (0-1).\n   * For majority vote: (count of majority / total chunks)\n   */\n  readonly classificationConfidence: number;\n\n  /**\n   * Deduplicated entities from all chunks.\n   * Entities without page numbers are deduplicated by (type, value).\n   * Entities with page numbers are preserved even if duplicated.\n   */\n  readonly entities: Entity[];\n\n  /**\n   * Summary of chunk processing results.\n   */\n  readonly chunksSummary: ChunksSummary;\n\n  /**\n   * Indicates if result is partial due to chunk failures.\n   * True if fewer than minSuccessThreshold chunks succeeded.\n   */\n  readonly partialResult: boolean;\n}\n\n/**\n * Request payload for cleanup Lambda.\n * Contains information about chunks to delete.\n */\nexport interface CleanupRequest {\n  /**\n   * Document identifier.\n   */\n  readonly documentId: string;\n\n  /**\n   * Array of chunk metadata for chunks to delete.\n   */\n  readonly chunks: ChunkMetadata[];\n}\n\n/**\n * Response from cleanup Lambda.\n * Reports success and any errors encountered.\n */\nexport interface CleanupResponse {\n  /**\n   * Document identifier.\n   */\n  readonly documentId: string;\n\n  /**\n   * Number of chunks successfully deleted.\n   */\n  readonly deletedChunks: number;\n\n  /**\n   * Array of error messages for failed deletions.\n   * Empty if all deletions succeeded.\n   */\n  readonly errors: string[];\n}\n"]}
|
|
@@ -9,6 +9,6 @@ class DefaultDocumentProcessingConfig {
|
|
|
9
9
|
}
|
|
10
10
|
exports.DefaultDocumentProcessingConfig = DefaultDocumentProcessingConfig;
|
|
11
11
|
_a = JSII_RTTI_SYMBOL_1;
|
|
12
|
-
DefaultDocumentProcessingConfig[_a] = { fqn: "@cdklabs/cdk-appmod-catalog-blueprints.DefaultDocumentProcessingConfig", version: "1.5.0" };
|
|
12
|
+
DefaultDocumentProcessingConfig[_a] = { fqn: "@cdklabs/cdk-appmod-catalog-blueprints.DefaultDocumentProcessingConfig", version: "1.6.0" };
|
|
13
13
|
DefaultDocumentProcessingConfig.DEFAULT_OBSERVABILITY_METRIC_SVC_NAME = 'document-processing';
|
|
14
14
|
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZGVmYXVsdC1kb2N1bWVudC1wcm9jZXNzaW5nLWNvbmZpZy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3VzZS1jYXNlcy9kb2N1bWVudC1wcm9jZXNzaW5nL2RlZmF1bHQtZG9jdW1lbnQtcHJvY2Vzc2luZy1jb25maWcudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7QUFBQSxxRUFBcUU7QUFDckUsc0NBQXNDO0FBRXRDLE1BQWEsK0JBQStCOztBQUE1QywwRUFFQzs7O0FBRHdCLHFFQUFxQyxHQUFHLHFCQUFxQixDQUFDIiwic291cmNlc0NvbnRlbnQiOlsiLy8gQ29weXJpZ2h0IEFtYXpvbi5jb20sIEluYy4gb3IgaXRzIGFmZmlsaWF0ZXMuIEFsbCBSaWdodHMgUmVzZXJ2ZWQuXG4vLyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMFxuXG5leHBvcnQgY2xhc3MgRGVmYXVsdERvY3VtZW50UHJvY2Vzc2luZ0NvbmZpZyB7XG4gIHB1YmxpYyBzdGF0aWMgcmVhZG9ubHkgREVGQVVMVF9PQlNFUlZBQklMSVRZX01FVFJJQ19TVkNfTkFNRSA9ICdkb2N1bWVudC1wcm9jZXNzaW5nJztcbn0iXX0=
|