@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/.jsii +2537 -204
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.js +1 -1
  66. package/lib/framework/tests/access-log.test.js +5 -2
  67. package/lib/framework/tests/batch-agent.test.js +5 -2
  68. package/lib/framework/tests/bedrock.test.js +5 -2
  69. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  70. package/lib/framework/tests/framework-nag.test.js +16 -8
  71. package/lib/framework/tests/network.test.js +9 -4
  72. package/lib/tsconfig.tsbuildinfo +1 -1
  73. package/lib/utilities/data-loader.js +1 -1
  74. package/lib/utilities/lambda-iam-utils.js +1 -1
  75. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  76. package/lib/utilities/observability/default-observability-config.js +1 -1
  77. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  78. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  79. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  80. package/lib/utilities/observability/powertools-config.js +19 -3
  81. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  82. package/lib/utilities/test-utils.d.ts +43 -0
  83. package/lib/utilities/test-utils.js +56 -0
  84. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  85. package/lib/utilities/tests/data-loader.test.js +3 -2
  86. package/lib/webapp/frontend-construct.js +1 -1
  87. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  88. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  89. package/package.json +6 -5
  90. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  91. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,491 @@
1
+ """
2
+ Error handling module for PDF chunking Lambda.
3
+
4
+ This module provides centralized error handling, classification, and response
5
+ generation for the PDF chunking Lambda function.
6
+
7
+ Requirements: 1.5, 7.1, 7.2, 7.3
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ import random
13
+ from typing import Dict, Any, Optional, Callable, TypeVar
14
+ from functools import wraps
15
+ from botocore.exceptions import ClientError
16
+
17
+ # Configure module logger
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Type variable for generic retry decorator
21
+ T = TypeVar('T')
22
+
23
+
class PDFChunkingError(Exception):
    """
    Root of the PDF chunking exception hierarchy.

    Besides the human-readable message, each instance carries structured
    context so that handlers can build responses and log records without
    parsing the message text.

    Attributes:
        message: Human-readable description (also passed to ``Exception``).
        error_type: Short machine-readable category label.
        document_id: Identifier of the affected document, if known.
        recoverable: Flag marking the failure as recoverable or not.
        details: Extra context; a new empty dict when none (or an empty
            one) was supplied.
    """

    def __init__(
        self,
        message: str,
        error_type: str,
        document_id: Optional[str] = None,
        recoverable: bool = False,
        details: Optional[Dict[str, Any]] = None
    ):
        super().__init__(message)

        # Context supplied by the caller.
        self.document_id = document_id
        self.details = details if details else {}

        # Classification of the failure.
        self.error_type = error_type
        self.recoverable = recoverable
        self.message = message
41
+
42
+
class InvalidPDFFormatError(PDFChunkingError):
    """
    Signals that the supplied file does not have a valid PDF format.

    Always reported with error type ``'InvalidPDFFormat'`` and marked as
    non-recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'InvalidPDFFormat', document_id, False, details)
59
+
60
+
class CorruptedPDFError(PDFChunkingError):
    """
    Signals that the PDF file is corrupted and cannot be read.

    Always reported with error type ``'CorruptedPDF'`` and marked as
    non-recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'CorruptedPDF', document_id, False, details)
77
+
78
+
class EncryptedPDFError(PDFChunkingError):
    """
    Signals that the PDF file is encrypted and cannot be processed.

    Always reported with error type ``'EncryptedPDF'`` and marked as
    non-recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'EncryptedPDF', document_id, False, details)
95
+
96
+
class S3AccessDeniedError(PDFChunkingError):
    """
    Raised when S3 access is denied.

    The bucket and key are recorded in ``details`` under the ``'bucket'``
    and ``'key'`` entries (overriding entries of the same name supplied by
    the caller). Reported with error type ``'S3AccessDenied'`` and marked
    as non-recoverable.

    Args:
        message: Human-readable description of the failure
        document_id: Document identifier, if known
        bucket: S3 bucket name
        key: S3 object key
        details: Optional extra context; it is copied, never modified
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        bucket: Optional[str] = None,
        key: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Build a fresh dict rather than calling update() on ``details``:
        # updating in place would add 'bucket'/'key' to a dict the caller
        # still owns (and may reuse for other errors).
        error_details = {**(details or {}), 'bucket': bucket, 'key': key}
        super().__init__(
            message=message,
            error_type='S3AccessDenied',
            document_id=document_id,
            recoverable=False,
            details=error_details
        )
120
+
121
+
class S3NotFoundError(PDFChunkingError):
    """
    Raised when S3 object is not found.

    The bucket and key are recorded in ``details`` under the ``'bucket'``
    and ``'key'`` entries (overriding entries of the same name supplied by
    the caller). Reported with error type ``'S3NotFound'`` and marked as
    non-recoverable.

    Args:
        message: Human-readable description of the failure
        document_id: Document identifier, if known
        bucket: S3 bucket name
        key: S3 object key
        details: Optional extra context; it is copied, never modified
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        bucket: Optional[str] = None,
        key: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Build a fresh dict rather than calling update() on ``details``:
        # updating in place would add 'bucket'/'key' to a dict the caller
        # still owns (and may reuse for other errors).
        error_details = {**(details or {}), 'bucket': bucket, 'key': key}
        super().__init__(
            message=message,
            error_type='S3NotFound',
            document_id=document_id,
            recoverable=False,
            details=error_details
        )
145
+
146
+
class S3ThrottlingError(PDFChunkingError):
    """
    Signals that S3 requests were throttled.

    Always reported with error type ``'S3Throttling'`` and marked as
    recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'S3Throttling', document_id, True, details)
163
+
164
+
class DynamoDBWriteError(PDFChunkingError):
    """
    Signals that a DynamoDB write operation failed.

    Always reported with error type ``'DynamoDBWriteError'`` and marked as
    recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'DynamoDBWriteError', document_id, True, details)
181
+
182
+
class ChunkingTimeoutError(PDFChunkingError):
    """
    Signals that the chunking operation timed out.

    Always reported with error type ``'ChunkingTimeout'`` and marked as
    non-recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'ChunkingTimeout', document_id, False, details)
199
+
200
+
class ConfigurationError(PDFChunkingError):
    """
    Signals that the chunking configuration is invalid.

    Always reported with error type ``'ConfigurationError'`` and marked as
    non-recoverable.
    """

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        # Positional order follows the base signature:
        # (message, error_type, document_id, recoverable, details)
        super().__init__(message, 'ConfigurationError', document_id, False, details)
217
+
218
+
def classify_s3_error(
    error: ClientError,
    document_id: Optional[str] = None,
    bucket: Optional[str] = None,
    key: Optional[str] = None
) -> PDFChunkingError:
    """
    Classify an S3 ClientError into a specific PDFChunkingError.

    The error code is read from ``error.response['Error']['Code']`` and
    mapped as follows:

    * ``AccessDenied`` / ``403``                       -> S3AccessDeniedError
    * ``NoSuchKey`` / ``NoSuchBucket`` / ``404``       -> S3NotFoundError
    * ``SlowDown`` / ``Throttling`` /
      ``RequestLimitExceeded``                         -> S3ThrottlingError
    * anything else -> generic PDFChunkingError typed ``S3Error_<code>``,
      recoverable only for ``ServiceUnavailable`` and ``InternalError``

    Args:
        error: The boto3 ClientError from S3 operation
        document_id: Document identifier for logging
        bucket: S3 bucket name
        key: S3 object key

    Returns:
        Appropriate PDFChunkingError subclass (the error is returned,
        not raised)
    """
    error_info = error.response.get('Error', {})
    error_code = error_info.get('Code', 'Unknown')
    error_message = error_info.get('Message', str(error))

    # Responses without an error body (e.g. HEAD requests) surface the bare
    # HTTP status as the code, so '403' is treated like 'AccessDenied' in
    # the same way that '404' is treated like 'NoSuchKey' below.
    if error_code in ('AccessDenied', '403'):
        return S3AccessDeniedError(
            message=f"Access denied to S3 object s3://{bucket}/{key}: {error_message}",
            document_id=document_id,
            bucket=bucket,
            key=key
        )

    if error_code in ('NoSuchKey', 'NoSuchBucket', '404'):
        return S3NotFoundError(
            message=f"S3 object not found: s3://{bucket}/{key}",
            document_id=document_id,
            bucket=bucket,
            key=key
        )

    if error_code in ('SlowDown', 'Throttling', 'RequestLimitExceeded'):
        return S3ThrottlingError(
            message=f"S3 request throttled: {error_message}",
            document_id=document_id,
            details={'error_code': error_code}
        )

    # Return generic error for other S3 errors; only server-side failures
    # are flagged as recoverable.
    return PDFChunkingError(
        message=f"S3 error ({error_code}): {error_message}",
        error_type=f'S3Error_{error_code}',
        document_id=document_id,
        recoverable=error_code in ('ServiceUnavailable', 'InternalError'),
        details={'error_code': error_code, 'bucket': bucket, 'key': key}
    )
269
+
270
+
def classify_pdf_error(
    error: Exception,
    document_id: Optional[str] = None
) -> PDFChunkingError:
    """
    Map an exception from PDF processing onto the PDFChunkingError hierarchy.

    Classification is keyword based on the lower-cased exception text and
    is checked in this order: encryption, invalid format, corruption. If
    no keyword matches, an exception whose class name contains
    ``PdfReadError`` is treated as corruption; anything else becomes a
    generic, non-recoverable ``PDFProcessingError``.

    Args:
        error: The exception from PDF processing
        document_id: Document identifier for logging

    Returns:
        Appropriate PDFChunkingError subclass (returned, not raised)
    """
    original_text = str(error)
    lowered = original_text.lower()

    def mentions(*keywords: str) -> bool:
        # True when any keyword occurs in the lower-cased error text.
        return any(keyword in lowered for keyword in keywords)

    if mentions('encrypted', 'password'):
        return EncryptedPDFError(
            message=f"PDF is encrypted and cannot be processed: {original_text}",
            document_id=document_id
        )

    if mentions('invalid', 'not a pdf', 'not a valid pdf', 'magic'):
        return InvalidPDFFormatError(
            message=f"Invalid PDF format: {original_text}",
            document_id=document_id
        )

    if mentions('corrupt', 'damaged', 'malformed'):
        return CorruptedPDFError(
            message=f"PDF file is corrupted: {original_text}",
            document_id=document_id
        )

    # No keyword matched: fall back to the exception's class name
    # (covers PyPDF2's PdfReadError).
    exception_name = type(error).__name__
    if 'PdfReadError' in exception_name:
        return CorruptedPDFError(
            message=f"Failed to read PDF: {original_text}",
            document_id=document_id,
            details={'original_error_type': exception_name}
        )

    # Generic PDF error
    return PDFChunkingError(
        message=f"PDF processing error: {original_text}",
        error_type='PDFProcessingError',
        document_id=document_id,
        recoverable=False,
        details={'original_error_type': exception_name}
    )
320
+
321
+
def create_error_response(
    document_id: str,
    error: PDFChunkingError
) -> Dict[str, Any]:
    """
    Build the standardized error payload returned by the Lambda handler.

    Args:
        document_id: Document identifier
        error: The PDFChunkingError to convert to response

    Returns:
        Dictionary with ``documentId``, ``requiresChunking`` (always
        ``False``) and an ``error`` entry holding the error's type,
        message, recoverable flag and details
    """
    error_payload = {
        'type': error.error_type,
        'message': error.message,
        'recoverable': error.recoverable,
        'details': error.details,
    }

    response = {'documentId': document_id, 'requiresChunking': False}
    response['error'] = error_payload
    return response
346
+
347
+
def log_error(
    error: PDFChunkingError,
    include_stack_trace: bool = True
) -> None:
    """
    Emit an ERROR-level log record describing a PDFChunkingError.

    The error's fields are attached to the record through ``extra`` so
    that structured log formatters can pick them up.

    Args:
        error: The PDFChunkingError to log
        include_stack_trace: Whether to include stack trace
    """
    summary = f"Error processing document {error.document_id}: {error.message}"

    structured_fields = {
        'documentId': error.document_id or 'unknown',
        'errorType': error.error_type,
        'errorMessage': error.message,
        'recoverable': error.recoverable,
        'details': error.details
    }

    log_kwargs = {'extra': structured_fields}
    if include_stack_trace:
        # Ask the logging module to attach the current exception info.
        log_kwargs['exc_info'] = True

    logger.error(summary, **log_kwargs)
378
+
379
+
def retry_with_exponential_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 30.0,
    jitter: bool = True,
    retryable_exceptions: tuple = (S3ThrottlingError, DynamoDBWriteError)
):
    """
    Decorator for retrying operations with exponential backoff.

    The wrapped function is called up to ``max_retries + 1`` times. After
    a retryable failure on attempt ``n`` (0-based) the wrapper sleeps for
    ``min(base_delay * 2**n, max_delay)`` seconds, optionally scaled by a
    random factor in ``[0.5, 1.5)``. The jittered value is capped again so
    that a single sleep never exceeds ``max_delay``. Exceptions that are
    not listed in ``retryable_exceptions`` propagate immediately.

    Args:
        max_retries: Maximum number of retry attempts; negative values are
            treated as 0 so the wrapped function is always called once
        base_delay: Base delay in seconds between retries
        max_delay: Maximum delay in seconds
        jitter: Whether to add random jitter to delay
        retryable_exceptions: Tuple of exception types to retry

    Returns:
        Decorated function

    Raises:
        The last retryable exception once all attempts are exhausted.
    """
    # A negative budget would leave the attempt loop empty, so the wrapped
    # function would never run and the wrapper would silently return None.
    retry_budget = max(0, max_retries)
    total_attempts = retry_budget + 1

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        def wrapper(*args, **kwargs) -> T:
            for attempt in range(total_attempts):
                try:
                    return func(*args, **kwargs)
                except retryable_exceptions as e:
                    if attempt >= retry_budget:
                        logger.error(
                            f"Max retries ({total_attempts}) exceeded: {str(e)}",
                            extra={
                                'maxRetries': total_attempts,
                                'errorType': type(e).__name__
                            }
                        )
                        raise

                    # Calculate delay with exponential backoff
                    delay = min(base_delay * (2 ** attempt), max_delay)

                    # Add jitter if enabled; re-apply the cap because the
                    # jitter factor can be as large as 1.5
                    if jitter:
                        delay = min(delay * (0.5 + random.random()), max_delay)

                    logger.warning(
                        f"Retryable error on attempt {attempt + 1}/{total_attempts}: "
                        f"{str(e)}. Retrying in {delay:.2f}s",
                        extra={
                            'attempt': attempt + 1,
                            'maxRetries': total_attempts,
                            'delay': delay,
                            'errorType': type(e).__name__
                        }
                    )

                    time.sleep(delay)

            # Unreachable: the final attempt either returns or re-raises.
            raise AssertionError("retry loop ended without returning or raising")

        return wrapper
    return decorator
447
+
448
+
def _detect_non_pdf_type(data: bytes) -> str:
    """Return a best-effort label for non-PDF data based on its leading bytes."""
    # HTML markup is case-insensitive, so '<HTML' and '<!doctype' must be
    # recognised as well as '<html' and '<!DOC'.
    if data[:5].lower() in (b'<html', b'<!doc'):
        return 'HTML'

    known_signatures = (
        (b'PK\x03\x04', 'ZIP/Office document'),
        (b'RIFF', 'RIFF (audio/video)'),
        (b'\xff\xd8\xff', 'JPEG image'),
        (b'\x89PNG\r\n\x1a\n', 'PNG image'),
    )
    for signature, label in known_signatures:
        if data[:len(signature)] == signature:
            return label

    return 'unknown'


def validate_pdf_magic_bytes(data: bytes, document_id: Optional[str] = None) -> None:
    """
    Validate that the file data starts with PDF magic bytes.

    The check is strict: the first five bytes must be exactly ``%PDF-``.
    When they are not, the leading bytes are compared against a few
    well-known signatures so the error can name the likely file type.

    Args:
        data: File data bytes
        document_id: Document identifier for error messages

    Raises:
        InvalidPDFFormatError: If file is not a valid PDF (empty, shorter
            than five bytes, or not starting with ``%PDF-``)
    """
    if not data or len(data) < 5:
        raise InvalidPDFFormatError(
            message="File is empty or too small to be a valid PDF",
            document_id=document_id,
            details={'file_size': len(data) if data else 0}
        )

    # Check for PDF magic bytes: %PDF-
    if data[:5] == b'%PDF-':
        return

    # Try to identify what type of file it is
    file_type = _detect_non_pdf_type(data)

    raise InvalidPDFFormatError(
        message=f"File is not a valid PDF. Expected PDF magic bytes (%PDF-), "
                f"but found {file_type} format",
        document_id=document_id,
        details={
            'detected_type': file_type,
            # data holds at least five bytes here, so hex() is always safe
            'first_bytes': data[:10].hex()
        }
    )