@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,420 @@
1
+ """
2
+ Strategy selection module for PDF chunking.
3
+
4
+ This module provides explicit threshold checks for each chunking strategy
5
+ and determines whether chunking is required based on document characteristics.
6
+
7
+ Strategies:
8
+ - fixed-pages: Chunks based on page count threshold only
9
+ - token-based: Chunks based on token count threshold only
10
+ - hybrid: Chunks if either page OR token threshold is exceeded (RECOMMENDED)
11
+
12
+ Requirements: 7.5
13
+ """
14
+
15
+ import logging
16
+ import os
17
+ from typing import Dict, Any, Optional, Tuple
18
+
19
+ # Try to import structured logging
20
+ try:
21
+ from structured_logging import get_logger, log_strategy_selection, is_observability_enabled
22
+ structured_logger = get_logger(__name__)
23
+ USE_STRUCTURED_LOGGING = True
24
+ except ImportError:
25
+ USE_STRUCTURED_LOGGING = False
26
+ structured_logger = None
27
+
28
+ # Configure module logger as fallback
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class StrategySelectionResult:
    """
    Outcome of a strategy evaluation together with the inputs that produced it.

    Attributes:
        requires_chunking: Whether the document requires chunking
        strategy: The strategy used for the decision
        reason: Human-readable explanation of the decision
        document_pages: Number of pages in the document
        document_tokens: Total tokens in the document
        page_threshold: Page threshold used for comparison
        token_threshold: Token threshold used for comparison
        page_threshold_exceeded: Whether page threshold was exceeded
        token_threshold_exceeded: Whether token threshold was exceeded
    """

    # Single source of truth for the attribute names (and their order) used by
    # to_dict() and __repr__(), so the two cannot drift apart.
    _FIELDS = (
        'requires_chunking',
        'strategy',
        'reason',
        'document_pages',
        'document_tokens',
        'page_threshold',
        'token_threshold',
        'page_threshold_exceeded',
        'token_threshold_exceeded',
    )

    def __init__(
        self,
        requires_chunking: bool,
        strategy: str,
        reason: str,
        document_pages: int,
        document_tokens: int,
        page_threshold: int,
        token_threshold: int,
        page_threshold_exceeded: bool,
        token_threshold_exceeded: bool
    ):
        self.requires_chunking = requires_chunking
        self.strategy = strategy
        self.reason = reason
        self.document_pages = document_pages
        self.document_tokens = document_tokens
        self.page_threshold = page_threshold
        self.token_threshold = token_threshold
        self.page_threshold_exceeded = page_threshold_exceeded
        self.token_threshold_exceeded = token_threshold_exceeded

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for serialization."""
        return {name: getattr(self, name) for name in self._FIELDS}

    def __repr__(self) -> str:
        """Return a constructor-style representation listing every field."""
        rendered = ', '.join(
            f"{name}={getattr(self, name)!r}" for name in self._FIELDS
        )
        return f"{type(self).__name__}({rendered})"
83
+
84
+
85
def check_fixed_pages_threshold(
    total_pages: int,
    page_threshold: int
) -> Tuple[bool, bool]:
    """
    Evaluate the fixed-pages strategy for a document.

    Only the page count is compared; token counts play no part in this
    strategy. Chunking is required when the page count is strictly greater
    than the threshold, so a document exactly at the threshold is not chunked.

    Args:
        total_pages: Total number of pages in the document
        page_threshold: Maximum pages before chunking is required

    Returns:
        Tuple of (requires_chunking, page_threshold_exceeded)

    Example:
        >>> check_fixed_pages_threshold(150, 100)
        (True, True)
        >>> check_fixed_pages_threshold(50, 100)
        (False, False)
    """
    over_page_limit = total_pages > page_threshold

    # For this strategy the chunking decision is the page comparison itself,
    # so both positions of the tuple carry the same value.
    return over_page_limit, over_page_limit
113
+
114
+
115
def check_token_based_threshold(
    total_tokens: int,
    token_threshold: int
) -> Tuple[bool, bool]:
    """
    Evaluate the token-based strategy for a document.

    Only the total token count is compared; page counts play no part in this
    strategy. Chunking is required when the token count is strictly greater
    than the threshold, so a document exactly at the threshold is not chunked.

    Args:
        total_tokens: Total estimated tokens in the document
        token_threshold: Maximum tokens before chunking is required

    Returns:
        Tuple of (requires_chunking, token_threshold_exceeded)

    Example:
        >>> check_token_based_threshold(200000, 150000)
        (True, True)
        >>> check_token_based_threshold(100000, 150000)
        (False, False)
    """
    over_token_limit = total_tokens > token_threshold

    # For this strategy the chunking decision is the token comparison itself,
    # so both positions of the tuple carry the same value.
    return over_token_limit, over_token_limit
143
+
144
+
145
def check_hybrid_threshold(
    total_pages: int,
    total_tokens: int,
    page_threshold: int,
    token_threshold: int
) -> Tuple[bool, bool, bool]:
    """
    Evaluate the hybrid strategy (RECOMMENDED) for a document.

    Chunking is required when EITHER the page count OR the token count is
    strictly greater than its respective threshold. Each comparison is also
    reported individually so callers can tell which limit was crossed.

    Args:
        total_pages: Total number of pages in the document
        total_tokens: Total estimated tokens in the document
        page_threshold: Maximum pages before chunking is required
        token_threshold: Maximum tokens before chunking is required

    Returns:
        Tuple of (requires_chunking, page_threshold_exceeded, token_threshold_exceeded)

    Example:
        Page threshold exceeded:

        >>> check_hybrid_threshold(150, 100000, 100, 150000)
        (True, True, False)

        Token threshold exceeded:

        >>> check_hybrid_threshold(50, 200000, 100, 150000)
        (True, False, True)

        Both thresholds exceeded:

        >>> check_hybrid_threshold(150, 200000, 100, 150000)
        (True, True, True)

        Neither threshold exceeded:

        >>> check_hybrid_threshold(50, 100000, 100, 150000)
        (False, False, False)
    """
    over_page_limit = total_pages > page_threshold
    over_token_limit = total_tokens > token_threshold

    return (over_page_limit or over_token_limit), over_page_limit, over_token_limit
182
+
183
+
184
def select_strategy_and_check_thresholds(
    total_pages: int,
    total_tokens: int,
    config: Optional[Dict[str, Any]] = None
) -> StrategySelectionResult:
    """
    Select chunking strategy and check if document requires chunking.

    This is the main entry point for strategy selection. It reads the
    strategy and thresholds from the configuration, runs the matching
    threshold check, logs the decision, and returns it.

    Args:
        total_pages: Total number of pages in the document
        total_tokens: Total estimated tokens in the document
        config: Configuration dictionary with optional keys:
            - strategy (or chunkingStrategy): 'fixed-pages', 'token-based',
              or 'hybrid' (default: 'hybrid')
            - pageThreshold (or page_threshold): Maximum pages before
              chunking (default: 100)
            - tokenThreshold (or token_threshold): Maximum tokens before
              chunking (default: 150000)
            A key whose value is None is treated as if it were absent, so
            the alternate key and then the default are used instead.

    Returns:
        StrategySelectionResult containing the decision and reasoning.
        Any strategy value other than 'fixed-pages' or 'token-based' is
        evaluated with the hybrid check; the result's ``strategy`` attribute
        still carries the value that was configured.

    Example:
        >>> result = select_strategy_and_check_thresholds(150, 200000)
        >>> result.requires_chunking
        True
        >>> result.strategy
        'hybrid'
    """
    if config is None:
        config = {}

    def _configured(primary_key: str, alternate_key: str, default: Any) -> Any:
        # Return the first non-None value among the two accepted spellings.
        # A plain dict.get() default only applies when the key is absent, so
        # an explicit None (e.g. a JSON null) would otherwise reach the
        # numeric comparisons below and raise TypeError.
        for key in (primary_key, alternate_key):
            value = config.get(key)
            if value is not None:
                return value
        return default

    strategy = _configured('strategy', 'chunkingStrategy', 'hybrid')
    page_threshold = _configured('pageThreshold', 'page_threshold', 100)
    token_threshold = _configured('tokenThreshold', 'token_threshold', 150000)

    # Log document characteristics
    logger.info(
        f"Strategy selection: analyzing document with {total_pages} pages "
        f"and {total_tokens} tokens using '{strategy}' strategy"
    )
    logger.debug(
        f"Threshold values: page_threshold={page_threshold}, "
        f"token_threshold={token_threshold}"
    )

    # Unknown values are still evaluated as hybrid (unchanged behavior), but
    # the fallback is surfaced so a misspelled strategy does not go unnoticed.
    if strategy not in ('fixed-pages', 'token-based', 'hybrid'):
        logger.warning(
            f"Unrecognized chunking strategy {strategy!r}; "
            f"evaluating thresholds with the 'hybrid' strategy"
        )

    # Check thresholds based on strategy
    if strategy == 'fixed-pages':
        requires_chunking, page_exceeded = check_fixed_pages_threshold(
            total_pages, page_threshold
        )
        token_exceeded = False  # Not checked in fixed-pages strategy
        reason = _build_fixed_pages_reason(
            total_pages, page_threshold, requires_chunking
        )

    elif strategy == 'token-based':
        requires_chunking, token_exceeded = check_token_based_threshold(
            total_tokens, token_threshold
        )
        page_exceeded = False  # Not checked in token-based strategy
        reason = _build_token_based_reason(
            total_tokens, token_threshold, requires_chunking
        )

    else:  # hybrid (default)
        requires_chunking, page_exceeded, token_exceeded = check_hybrid_threshold(
            total_pages, total_tokens, page_threshold, token_threshold
        )
        reason = _build_hybrid_reason(
            total_pages, total_tokens, page_threshold, token_threshold,
            page_exceeded, token_exceeded, requires_chunking
        )

    # Log the decision with reasoning
    _log_strategy_decision(
        strategy=strategy,
        requires_chunking=requires_chunking,
        reason=reason,
        total_pages=total_pages,
        total_tokens=total_tokens,
        page_threshold=page_threshold,
        token_threshold=token_threshold,
        page_exceeded=page_exceeded,
        token_exceeded=token_exceeded
    )

    return StrategySelectionResult(
        requires_chunking=requires_chunking,
        strategy=strategy,
        reason=reason,
        document_pages=total_pages,
        document_tokens=total_tokens,
        page_threshold=page_threshold,
        token_threshold=token_threshold,
        page_threshold_exceeded=page_exceeded,
        token_threshold_exceeded=token_exceeded
    )
284
+
285
+
286
+ def _build_fixed_pages_reason(
287
+ total_pages: int,
288
+ page_threshold: int,
289
+ requires_chunking: bool
290
+ ) -> str:
291
+ """Build human-readable reason for fixed-pages strategy decision."""
292
+ if requires_chunking:
293
+ return (
294
+ f"Document has {total_pages} pages, exceeding threshold of "
295
+ f"{page_threshold} pages (fixed-pages strategy)"
296
+ )
297
+ else:
298
+ return (
299
+ f"Document has {total_pages} pages, below threshold of "
300
+ f"{page_threshold} pages (fixed-pages strategy)"
301
+ )
302
+
303
+
304
+ def _build_token_based_reason(
305
+ total_tokens: int,
306
+ token_threshold: int,
307
+ requires_chunking: bool
308
+ ) -> str:
309
+ """Build human-readable reason for token-based strategy decision."""
310
+ if requires_chunking:
311
+ return (
312
+ f"Document has {total_tokens:,} tokens, exceeding threshold of "
313
+ f"{token_threshold:,} tokens (token-based strategy)"
314
+ )
315
+ else:
316
+ return (
317
+ f"Document has {total_tokens:,} tokens, below threshold of "
318
+ f"{token_threshold:,} tokens (token-based strategy)"
319
+ )
320
+
321
+
322
+ def _build_hybrid_reason(
323
+ total_pages: int,
324
+ total_tokens: int,
325
+ page_threshold: int,
326
+ token_threshold: int,
327
+ page_exceeded: bool,
328
+ token_exceeded: bool,
329
+ requires_chunking: bool
330
+ ) -> str:
331
+ """Build human-readable reason for hybrid strategy decision."""
332
+ if requires_chunking:
333
+ if page_exceeded and token_exceeded:
334
+ return (
335
+ f"Document has {total_pages} pages (threshold: {page_threshold}) "
336
+ f"and {total_tokens:,} tokens (threshold: {token_threshold:,}), "
337
+ f"both thresholds exceeded (hybrid strategy)"
338
+ )
339
+ elif page_exceeded:
340
+ return (
341
+ f"Document has {total_pages} pages, exceeding threshold of "
342
+ f"{page_threshold} pages; {total_tokens:,} tokens below "
343
+ f"threshold of {token_threshold:,} (hybrid strategy)"
344
+ )
345
+ else: # token_exceeded
346
+ return (
347
+ f"Document has {total_tokens:,} tokens, exceeding threshold of "
348
+ f"{token_threshold:,} tokens; {total_pages} pages below "
349
+ f"threshold of {page_threshold} (hybrid strategy)"
350
+ )
351
+ else:
352
+ return (
353
+ f"Document has {total_pages} pages and {total_tokens:,} tokens, "
354
+ f"below thresholds of {page_threshold} pages and "
355
+ f"{token_threshold:,} tokens (hybrid strategy)"
356
+ )
357
+
358
+
359
def _log_strategy_decision(
    strategy: str,
    requires_chunking: bool,
    reason: str,
    total_pages: int,
    total_tokens: int,
    page_threshold: int,
    token_threshold: int,
    page_exceeded: bool,
    token_exceeded: bool
) -> None:
    """
    Emit log records describing a strategy selection decision.

    One summary record is written: through ``log_strategy_selection`` when
    the structured logging module was imported, a structured logger exists,
    and ``is_observability_enabled()`` returns a truthy value; otherwise
    through the module logger at INFO level with the decision fields passed
    as ``extra``. In both cases a multi-line breakdown is then written to
    the module logger at DEBUG level.

    Requirements: 7.5
    """
    if requires_chunking:
        verdict = "CHUNKING REQUIRED"
    else:
        verdict = "NO CHUNKING NEEDED"

    # Order matters: is_observability_enabled only exists when the structured
    # logging import succeeded, so it must be evaluated last.
    use_structured = (
        USE_STRUCTURED_LOGGING and structured_logger and is_observability_enabled()
    )

    if use_structured:
        log_strategy_selection(
            logger=structured_logger,
            strategy=strategy,
            requires_chunking=requires_chunking,
            reason=reason,
            document_pages=total_pages,
            document_tokens=total_tokens,
            page_threshold=page_threshold,
            token_threshold=token_threshold,
            page_threshold_exceeded=page_exceeded,
            token_threshold_exceeded=token_exceeded
        )
    else:
        # Standard-library fallback: attach the decision fields to the record.
        record_fields = {
            'strategy': strategy,
            'requires_chunking': requires_chunking,
            'reason': reason,
            'document_pages': total_pages,
            'document_tokens': total_tokens,
            'page_threshold': page_threshold,
            'token_threshold': token_threshold,
            'page_threshold_exceeded': page_exceeded,
            'token_threshold_exceeded': token_exceeded
        }
        logger.info(f"Strategy selection result: {verdict}", extra=record_fields)

    # Detailed breakdown, one field per line, for debugging.
    breakdown = "\n".join((
        "Strategy selection details:",
        f" Strategy: {strategy}",
        f" Document: {total_pages} pages, {total_tokens:,} tokens",
        f" Page threshold: {page_threshold} (exceeded: {page_exceeded})",
        f" Token threshold: {token_threshold:,} (exceeded: {token_exceeded})",
        f" Decision: {verdict}",
        f" Reason: {reason}",
    ))
    logger.debug(breakdown)