@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,457 @@
1
+ """
2
+ Structured Logging Module for PDF Chunking.
3
+
4
+ This module provides consistent JSON-formatted logging across all Lambda functions
5
+ in the PDF chunking workflow. Structured logging is only enabled when observability
6
+ is enabled (POWERTOOLS_METRICS_DISABLED != 'true').
7
+
8
+ Features:
9
+ - Consistent JSON log format with timestamp, level, message, and context
10
+ - Document ID, chunk index, and correlation ID in all log entries
11
+ - Configurable log levels via LOG_LEVEL environment variable
12
+ - Strategy selection reasoning logging
13
+
14
+ Requirements: 7.5
15
+ """
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ import sys
21
+ import uuid
22
+ from datetime import datetime, timezone
23
+ from typing import Any, Dict, Optional
24
+ from functools import wraps
25
+
26
+
27
+ # Check if observability is enabled
28
+ # When POWERTOOLS_METRICS_DISABLED is 'true', observability is disabled
29
def is_observability_enabled() -> bool:
    """
    Report whether observability features should be active.

    The decision depends only on the POWERTOOLS_METRICS_DISABLED environment
    variable. The value is compared case-insensitively against 'true'; an
    unset variable is treated as 'false'.

    Returns:
        False when POWERTOOLS_METRICS_DISABLED is 'true', True otherwise
    """
    raw_flag = os.environ.get('POWERTOOLS_METRICS_DISABLED', 'false')
    disabled = raw_flag.lower() == 'true'
    return not disabled
41
+
42
+
43
class StructuredLogFormatter(logging.Formatter):
    """
    Custom log formatter that renders each record as one JSON object.

    Emitted fields:
    - timestamp: ISO 8601 UTC time at which the record was created
    - level: Log level name (INFO, ERROR, WARNING, DEBUG, ...)
    - message: Fully interpolated log message
    - logger: Logger name
    - service: Service name (constructor argument or POWERTOOLS_SERVICE_NAME)
    - location: file / line / function, only for ERROR and above
    - exception: exception type and message, only when exc_info is attached
    - context: any attributes attached via logger.info(..., extra={...})

    Requirements: 7.5
    """

    # Attribute names that the logging machinery itself puts on a record.
    # They are derived from a blank LogRecord rather than hard-coded so that
    # attributes added by newer Python versions (for example 'taskName' in
    # 3.12) are not mistaken for user-supplied context. 'message' and
    # 'asctime' are only set during formatting, so they are added explicitly.
    _RESERVED_ATTRS = frozenset(
        logging.LogRecord('', 0, '', 0, '', (), None).__dict__
    ) | {'message', 'asctime'}

    def __init__(self, service_name: Optional[str] = None):
        """
        Initialize the structured log formatter.

        Args:
            service_name: Service name to include in logs. When omitted or
                empty, POWERTOOLS_SERVICE_NAME is used, falling back to
                'pdf-chunking'.
        """
        super().__init__()
        self.service_name = service_name or os.environ.get(
            'POWERTOOLS_SERVICE_NAME', 'pdf-chunking'
        )

    def format(self, record: logging.LogRecord) -> str:
        """
        Format the log record as JSON.

        Args:
            record: Log record to format

        Returns:
            JSON-formatted log string. Values that json cannot serialize
            natively are converted with str().
        """
        # Use the record's own creation time so the timestamp reflects when
        # the event happened, not when this formatter happened to run.
        created_at = datetime.fromtimestamp(record.created, tz=timezone.utc)

        log_entry: Dict[str, Any] = {
            'timestamp': created_at.isoformat(),
            'level': record.levelname,
            'message': record.getMessage(),
            'logger': record.name,
            'service': self.service_name,
        }

        # Source location is only attached to errors and above
        if record.levelno >= logging.ERROR:
            log_entry['location'] = {
                'file': record.filename,
                'line': record.lineno,
                'function': record.funcName
            }

        # Summarize the attached exception (type and message only)
        if record.exc_info:
            exc_type, exc_value = record.exc_info[0], record.exc_info[1]
            log_entry['exception'] = {
                'type': exc_type.__name__ if exc_type else None,
                'message': str(exc_value) if exc_value else None,
            }

        # Everything left on the record was supplied by the caller via extra
        context = {
            key: value
            for key, value in record.__dict__.items()
            if key not in self._RESERVED_ATTRS
        }
        if context:
            log_entry['context'] = context

        return json.dumps(log_entry, default=str)
120
+
121
+
122
class StandardLogFormatter(logging.Formatter):
    """
    Plain-text formatter used when observability is disabled.

    Each record is rendered as: [LEVEL] message
    """

    def __init__(self):
        """Set up the base formatter with the fixed '[LEVEL] message' layout."""
        layout = '[%(levelname)s] %(message)s'
        super().__init__(layout)
131
+
132
+
133
class StructuredLogger:
    """
    Wrapper around a standard library logger with shared request context.

    This class wraps the standard Python logger and adds:
    - Correlation ID generation and propagation
    - Document ID and chunk index context on every entry
    - Structured JSON output when observability is enabled
    - Plain '[LEVEL] message' output when observability is disabled

    The context values are stored on the class, not on the instance, so every
    StructuredLogger in the process sees the same correlation ID, document ID
    and chunk index.

    Requirements: 7.5
    """

    # Shared request context. All methods read and write these through the
    # class itself, which is what makes the context common to all instances.
    _correlation_id: Optional[str] = None
    _document_id: Optional[str] = None
    _chunk_index: Optional[int] = None

    def __init__(self, name: str = __name__):
        """
        Initialize the structured logger.

        Args:
            name: Logger name (typically __name__)
        """
        self.logger = logging.getLogger(name)
        self._configure_logger()

    def _configure_logger(self) -> None:
        """Configure the wrapped logger with a single stdout handler.

        The level comes from the LOG_LEVEL environment variable; unknown
        values fall back to INFO. Any handlers already attached to the
        wrapped logger are discarded.
        """
        log_level = os.environ.get('LOG_LEVEL', 'INFO').upper()

        # Reject anything that is not a standard level name
        valid_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
        if log_level not in valid_levels:
            log_level = 'INFO'

        numeric_level = getattr(logging, log_level)
        self.logger.setLevel(numeric_level)

        # Remove existing handlers to avoid duplicates
        self.logger.handlers = []

        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(numeric_level)

        # Use structured formatter only when observability is enabled
        if is_observability_enabled():
            formatter = StructuredLogFormatter()
        else:
            formatter = StandardLogFormatter()

        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        # Prevent propagation to root logger
        self.logger.propagate = False

    def set_correlation_id(self, correlation_id: Optional[str] = None) -> str:
        """
        Set or generate a correlation ID for request tracing.

        Args:
            correlation_id: Existing correlation ID to use. When None or
                empty, a new UUID4 string is generated.

        Returns:
            The correlation ID being used
        """
        StructuredLogger._correlation_id = correlation_id or str(uuid.uuid4())
        return StructuredLogger._correlation_id

    def get_correlation_id(self) -> Optional[str]:
        """Get the current correlation ID, or None when none is set."""
        return StructuredLogger._correlation_id

    def set_document_context(
        self,
        document_id: Optional[str] = None,
        chunk_index: Optional[int] = None
    ) -> None:
        """
        Set document context for all subsequent log entries.

        Both values are overwritten on every call, so omitting an argument
        resets that value to None.

        Args:
            document_id: Document identifier
            chunk_index: Chunk index (for chunk-specific operations)
        """
        StructuredLogger._document_id = document_id
        StructuredLogger._chunk_index = chunk_index

    def clear_context(self) -> None:
        """Clear all context (correlation ID, document ID, chunk index)."""
        StructuredLogger._correlation_id = None
        StructuredLogger._document_id = None
        StructuredLogger._chunk_index = None

    def _build_extra(self, extra: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Build the extra dictionary passed to the wrapped logger.

        Args:
            extra: Additional context to include; its keys override the
                shared context keys on collision

        Returns:
            New dictionary combining the shared context with ``extra``
        """
        result: Dict[str, Any] = {}

        if StructuredLogger._correlation_id:
            result['correlationId'] = StructuredLogger._correlation_id

        if StructuredLogger._document_id:
            result['documentId'] = StructuredLogger._document_id

        # Chunk index 0 is valid, so compare against None instead of truthiness
        if StructuredLogger._chunk_index is not None:
            result['chunkIndex'] = StructuredLogger._chunk_index

        if extra:
            result.update(extra)

        return result

    # NOTE: every level method passes stacklevel=2 so that the record's
    # filename / lineno / funcName describe the code that called this wrapper
    # rather than the wrapper method itself.

    def debug(self, message: str, extra: Optional[Dict[str, Any]] = None) -> None:
        """Log a debug message."""
        self.logger.debug(message, extra=self._build_extra(extra), stacklevel=2)

    def info(self, message: str, extra: Optional[Dict[str, Any]] = None) -> None:
        """Log an info message."""
        self.logger.info(message, extra=self._build_extra(extra), stacklevel=2)

    def warning(self, message: str, extra: Optional[Dict[str, Any]] = None) -> None:
        """Log a warning message."""
        self.logger.warning(message, extra=self._build_extra(extra), stacklevel=2)

    def error(
        self,
        message: str,
        extra: Optional[Dict[str, Any]] = None,
        exc_info: bool = False
    ) -> None:
        """Log an error message, optionally attaching the active exception."""
        self.logger.error(
            message,
            extra=self._build_extra(extra),
            exc_info=exc_info,
            stacklevel=2,
        )

    def critical(
        self,
        message: str,
        extra: Optional[Dict[str, Any]] = None,
        exc_info: bool = False
    ) -> None:
        """Log a critical message, optionally attaching the active exception."""
        self.logger.critical(
            message,
            extra=self._build_extra(extra),
            exc_info=exc_info,
            stacklevel=2,
        )
291
+
292
+
293
def get_logger(name: str = __name__) -> StructuredLogger:
    """
    Build a StructuredLogger for the given logger name.

    Args:
        name: Logger name (typically __name__)

    Returns:
        A newly constructed StructuredLogger wrapping the named logger
    """
    structured = StructuredLogger(name)
    return structured
304
+
305
+
306
def log_strategy_selection(
    logger: StructuredLogger,
    strategy: str,
    requires_chunking: bool,
    reason: str,
    document_pages: int,
    document_tokens: int,
    page_threshold: int,
    token_threshold: int,
    page_threshold_exceeded: bool,
    token_threshold_exceeded: bool
) -> None:
    """
    Emit one INFO entry describing a strategy selection decision.

    The entry carries the chosen strategy, the reason, the document's page
    and token counts, and the thresholds with their exceeded flags.

    Args:
        logger: StructuredLogger instance
        strategy: Selected strategy name
        requires_chunking: Whether chunking is required
        reason: Human-readable reason for the decision
        document_pages: Number of pages in the document
        document_tokens: Total tokens in the document
        page_threshold: Page threshold used
        token_threshold: Token threshold used
        page_threshold_exceeded: Whether page threshold was exceeded
        token_threshold_exceeded: Whether token threshold was exceeded

    Requirements: 7.5
    """
    if requires_chunking:
        decision = "CHUNKING_REQUIRED"
    else:
        decision = "NO_CHUNKING_NEEDED"

    characteristics = {
        'pages': document_pages,
        'tokens': document_tokens
    }
    threshold_details = {
        'pageThreshold': page_threshold,
        'tokenThreshold': token_threshold,
        'pageThresholdExceeded': page_threshold_exceeded,
        'tokenThresholdExceeded': token_threshold_exceeded
    }
    payload = {
        'event': 'strategy_selection',
        'strategy': strategy,
        'requiresChunking': requires_chunking,
        'reason': reason,
        'documentCharacteristics': characteristics,
        'thresholds': threshold_details
    }

    logger.info(f"Strategy selection: {decision}", extra=payload)
359
+
360
+
361
def log_chunking_operation(
    logger: StructuredLogger,
    operation: str,
    document_id: str,
    chunk_count: Optional[int] = None,
    chunk_index: Optional[int] = None,
    success: bool = True,
    error_message: Optional[str] = None,
    duration_ms: Optional[float] = None,
    extra: Optional[Dict[str, Any]] = None
) -> None:
    """
    Emit one log entry describing a chunking operation.

    Successful operations are logged at INFO level and failed ones at ERROR
    level. Optional fields are only included when they were supplied.

    Args:
        logger: StructuredLogger instance
        operation: Operation name (e.g., 'analyze', 'split', 'upload')
        document_id: Document identifier
        chunk_count: Total number of chunks (if applicable)
        chunk_index: Current chunk index (if applicable)
        success: Whether the operation succeeded
        error_message: Error message if operation failed
        duration_ms: Operation duration in milliseconds
        extra: Additional context; its keys override the standard fields

    Requirements: 7.5
    """
    payload: Dict[str, Any] = {
        'event': 'chunking_operation',
        'operation': operation,
        'documentId': document_id,
        'success': success
    }

    # Optional fields, in output order; zero is a legitimate value so only
    # None counts as "not supplied".
    optional_fields = (
        ('chunkCount', chunk_count),
        ('chunkIndex', chunk_index),
        ('durationMs', duration_ms),
    )
    for field_name, field_value in optional_fields:
        if field_value is not None:
            payload[field_name] = field_value

    if error_message:
        payload['errorMessage'] = error_message

    if extra:
        payload.update(extra)

    if not success:
        logger.error(f"Chunking operation '{operation}' failed", extra=payload)
        return

    logger.info(f"Chunking operation '{operation}' completed", extra=payload)
414
+
415
+
416
def with_correlation_id(func):
    """
    Decorator that sets logging context from the event before the handler runs.

    The correlation ID is looked up in this order:
    1. event['correlationId']
    2. event['headers']['x-correlation-id'], then 'X-Correlation-Id',
       then any other casing of that header name
    3. A newly generated UUID if none of the above is found

    If the event has a truthy 'documentId', it is set as the document
    context. All context is cleared after the handler returns or raises.

    Events that are not dictionaries, and events whose 'headers' entry is
    missing, null or not a dictionary, are handled as carrying no identifiers.

    Args:
        func: Lambda handler function taking (event, context)

    Returns:
        Wrapped function with correlation ID handling
    """
    @wraps(func)
    def wrapper(event: Dict[str, Any], context: Any) -> Any:
        logger = get_logger()

        # Only dictionaries can carry the fields read below; anything else
        # is treated as an empty payload instead of failing on .get().
        payload = event if isinstance(event, dict) else {}

        correlation_id = payload.get('correlationId')

        if not correlation_id:
            # 'headers' can be present with a null value, in which case the
            # default passed to .get() would not apply.
            headers = payload.get('headers')
            if isinstance(headers, dict):
                # Exact spellings first, preserving the original precedence
                correlation_id = (
                    headers.get('x-correlation-id')
                    or headers.get('X-Correlation-Id')
                )
                if not correlation_id:
                    # Header names are case-insensitive; accept any casing
                    correlation_id = next(
                        (
                            value
                            for key, value in headers.items()
                            if isinstance(key, str)
                            and key.lower() == 'x-correlation-id'
                            and value
                        ),
                        None,
                    )

        # Set or generate correlation ID
        logger.set_correlation_id(correlation_id)

        # Set document context if available
        document_id = payload.get('documentId')
        if document_id:
            logger.set_document_context(document_id=document_id)

        try:
            return func(event, context)
        finally:
            # Clear context after the request, even when the handler raises
            logger.clear_context()

    return wrapper