@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +2537 -204
- package/lib/document-processing/adapter/adapter.d.ts +4 -2
- package/lib/document-processing/adapter/adapter.js +1 -1
- package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
- package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
- package/lib/document-processing/agentic-document-processing.d.ts +4 -0
- package/lib/document-processing/agentic-document-processing.js +20 -10
- package/lib/document-processing/base-document-processing.d.ts +54 -2
- package/lib/document-processing/base-document-processing.js +136 -82
- package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
- package/lib/document-processing/bedrock-document-processing.js +717 -77
- package/lib/document-processing/chunking-config.d.ts +614 -0
- package/lib/document-processing/chunking-config.js +5 -0
- package/lib/document-processing/default-document-processing-config.js +1 -1
- package/lib/document-processing/index.d.ts +1 -0
- package/lib/document-processing/index.js +2 -1
- package/lib/document-processing/resources/aggregation/handler.py +567 -0
- package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
- package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
- package/lib/document-processing/resources/cleanup/handler.py +276 -0
- package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
- package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
- package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
- package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
- package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
- package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
- package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
- package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
- package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
- package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
- package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
- package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
- package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
- package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
- package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
- package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
- package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
- package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
- package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
- package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
- package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
- package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
- package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
- package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
- package/lib/document-processing/tests/base-document-processing.test.js +114 -8
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
- package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
- package/lib/document-processing/tests/chunking-config.test.js +238 -0
- package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
- package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
- package/lib/framework/agents/base-agent.js +1 -1
- package/lib/framework/agents/batch-agent.js +1 -1
- package/lib/framework/agents/default-agent-config.js +1 -1
- package/lib/framework/bedrock/bedrock.js +1 -1
- package/lib/framework/custom-resource/default-runtimes.js +1 -1
- package/lib/framework/foundation/access-log.js +1 -1
- package/lib/framework/foundation/eventbridge-broker.js +1 -1
- package/lib/framework/foundation/network.js +1 -1
- package/lib/framework/tests/access-log.test.js +5 -2
- package/lib/framework/tests/batch-agent.test.js +5 -2
- package/lib/framework/tests/bedrock.test.js +5 -2
- package/lib/framework/tests/eventbridge-broker.test.js +5 -2
- package/lib/framework/tests/framework-nag.test.js +16 -8
- package/lib/framework/tests/network.test.js +9 -4
- package/lib/tsconfig.tsbuildinfo +1 -1
- package/lib/utilities/data-loader.js +1 -1
- package/lib/utilities/lambda-iam-utils.js +1 -1
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
- package/lib/utilities/observability/default-observability-config.js +1 -1
- package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
- package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
- package/lib/utilities/observability/powertools-config.d.ts +10 -1
- package/lib/utilities/observability/powertools-config.js +19 -3
- package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
- package/lib/utilities/test-utils.d.ts +43 -0
- package/lib/utilities/test-utils.js +56 -0
- package/lib/utilities/tests/data-loader-nag.test.js +3 -2
- package/lib/utilities/tests/data-loader.test.js +3 -2
- package/lib/webapp/frontend-construct.js +1 -1
- package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
- package/lib/webapp/tests/frontend-construct.test.js +3 -2
- package/package.json +6 -5
- package/lib/document-processing/resources/default-error-handler/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
|
@@ -0,0 +1,491 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Error handling module for PDF chunking Lambda.
|
|
3
|
+
|
|
4
|
+
This module provides centralized error handling, classification, and response
|
|
5
|
+
generation for the PDF chunking Lambda function.
|
|
6
|
+
|
|
7
|
+
Requirements: 1.5, 7.1, 7.2, 7.3
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import time
|
|
12
|
+
import random
|
|
13
|
+
from typing import Dict, Any, Optional, Callable, TypeVar
|
|
14
|
+
from functools import wraps
|
|
15
|
+
from botocore.exceptions import ClientError
|
|
16
|
+
|
|
17
|
+
# Configure module logger
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Type variable for generic retry decorator
|
|
21
|
+
T = TypeVar('T')
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PDFChunkingError(Exception):
    """Base exception for PDF chunking errors.

    Carries structured metadata so callers can build uniform error
    responses and structured log entries from any chunking failure.
    """

    def __init__(
        self,
        message: str,
        error_type: str,
        document_id: Optional[str] = None,
        recoverable: bool = False,
        details: Optional[Dict[str, Any]] = None
    ):
        """Initialize the error.

        Args:
            message: Human-readable description of the failure.
            error_type: Machine-readable error category string.
            document_id: Identifier of the document being processed, if known.
            recoverable: True when retrying the operation may succeed.
            details: Optional extra context for logs/responses.
        """
        super().__init__(message)
        self.message = message
        self.error_type = error_type
        self.document_id = document_id
        self.recoverable = recoverable
        # Normalize a missing/empty details mapping to a fresh dict.
        self.details = details or {}
class InvalidPDFFormatError(PDFChunkingError):
    """Raised when the file is not a valid PDF format."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Non-recoverable: retrying cannot turn a non-PDF file into a PDF.
        super().__init__(message, 'InvalidPDFFormat',
                         document_id=document_id, recoverable=False,
                         details=details)
class CorruptedPDFError(PDFChunkingError):
    """Raised when the PDF file is corrupted and cannot be read."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Non-recoverable: a damaged file will fail the same way on retry.
        super().__init__(message, 'CorruptedPDF',
                         document_id=document_id, recoverable=False,
                         details=details)
class EncryptedPDFError(PDFChunkingError):
    """Raised when the PDF file is encrypted and cannot be processed."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Non-recoverable: we have no password, so retries cannot help.
        super().__init__(message, 'EncryptedPDF',
                         document_id=document_id, recoverable=False,
                         details=details)
class S3AccessDeniedError(PDFChunkingError):
    """Raised when S3 access is denied."""

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        bucket: Optional[str] = None,
        key: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        """Initialize the error.

        Args:
            message: Human-readable description of the failure.
            document_id: Identifier of the document being processed.
            bucket: S3 bucket involved in the denied request.
            key: S3 object key involved in the denied request.
            details: Optional extra context; copied, never mutated.
        """
        # Copy the caller's dict before adding bucket/key: the previous
        # `details.update(...)` mutated the caller-supplied mapping in place.
        error_details = dict(details) if details else {}
        error_details['bucket'] = bucket
        error_details['key'] = key
        super().__init__(
            message=message,
            error_type='S3AccessDenied',
            document_id=document_id,
            recoverable=False,
            details=error_details
        )
class S3NotFoundError(PDFChunkingError):
    """Raised when S3 object is not found."""

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        bucket: Optional[str] = None,
        key: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        """Initialize the error.

        Args:
            message: Human-readable description of the failure.
            document_id: Identifier of the document being processed.
            bucket: S3 bucket that was queried.
            key: S3 object key that was not found.
            details: Optional extra context; copied, never mutated.
        """
        # Copy the caller's dict before adding bucket/key: the previous
        # `details.update(...)` mutated the caller-supplied mapping in place.
        error_details = dict(details) if details else {}
        error_details['bucket'] = bucket
        error_details['key'] = key
        super().__init__(
            message=message,
            error_type='S3NotFound',
            document_id=document_id,
            recoverable=False,
            details=error_details
        )
class S3ThrottlingError(PDFChunkingError):
    """Raised when S3 requests are throttled."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Recoverable: throttling is transient, so a backed-off retry may succeed.
        super().__init__(message, 'S3Throttling',
                         document_id=document_id, recoverable=True,
                         details=details)
class DynamoDBWriteError(PDFChunkingError):
    """Raised when DynamoDB write operation fails."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Recoverable: write failures are often transient and retryable.
        super().__init__(message, 'DynamoDBWriteError',
                         document_id=document_id, recoverable=True,
                         details=details)
class ChunkingTimeoutError(PDFChunkingError):
    """Raised when chunking operation times out."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Non-recoverable: the same document is expected to time out again.
        super().__init__(message, 'ChunkingTimeout',
                         document_id=document_id, recoverable=False,
                         details=details)
class ConfigurationError(PDFChunkingError):
    """Raised when chunking configuration is invalid."""

    def __init__(self, message: str, document_id: Optional[str] = None,
                 details: Optional[Dict[str, Any]] = None):
        # Non-recoverable: invalid configuration will not fix itself on retry.
        super().__init__(message, 'ConfigurationError',
                         document_id=document_id, recoverable=False,
                         details=details)
def classify_s3_error(
    error: ClientError,
    document_id: Optional[str] = None,
    bucket: Optional[str] = None,
    key: Optional[str] = None
) -> PDFChunkingError:
    """
    Map a boto3 S3 ClientError onto a specific PDFChunkingError.

    Args:
        error: The boto3 ClientError raised by an S3 operation.
        document_id: Document identifier for logging.
        bucket: S3 bucket name involved in the failed call.
        key: S3 object key involved in the failed call.

    Returns:
        The PDFChunkingError subclass that best matches the S3 error code.
    """
    err_info = error.response.get('Error', {})
    code = err_info.get('Code', 'Unknown')
    msg = err_info.get('Message', str(error))

    if code == 'AccessDenied':
        return S3AccessDeniedError(
            message=f"Access denied to S3 object s3://{bucket}/{key}: {msg}",
            document_id=document_id,
            bucket=bucket,
            key=key
        )

    if code in ('NoSuchKey', 'NoSuchBucket', '404'):
        return S3NotFoundError(
            message=f"S3 object not found: s3://{bucket}/{key}",
            document_id=document_id,
            bucket=bucket,
            key=key
        )

    if code in ('SlowDown', 'Throttling', 'RequestLimitExceeded'):
        return S3ThrottlingError(
            message=f"S3 request throttled: {msg}",
            document_id=document_id,
            details={'error_code': code}
        )

    # Everything else becomes a generic error; transient server-side
    # failures are the only ones flagged as recoverable.
    return PDFChunkingError(
        message=f"S3 error ({code}): {msg}",
        error_type=f'S3Error_{code}',
        document_id=document_id,
        recoverable=code in ['ServiceUnavailable', 'InternalError'],
        details={'error_code': code, 'bucket': bucket, 'key': key}
    )
def classify_pdf_error(
    error: Exception,
    document_id: Optional[str] = None
) -> PDFChunkingError:
    """
    Map a PDF processing exception onto a specific PDFChunkingError.

    Classification is keyword-based on the exception text, with a
    fallback on the exception's type name for PyPDF2 read errors.

    Args:
        error: The exception raised during PDF processing.
        document_id: Document identifier for logging.

    Returns:
        The PDFChunkingError subclass that best matches the failure.
    """
    text = str(error).lower()

    def matches(*needles: str) -> bool:
        # True when any keyword appears in the lowered exception text.
        return any(needle in text for needle in needles)

    if matches('encrypted', 'password'):
        return EncryptedPDFError(
            message=f"PDF is encrypted and cannot be processed: {str(error)}",
            document_id=document_id
        )

    if matches('invalid', 'not a pdf', 'not a valid pdf', 'magic'):
        return InvalidPDFFormatError(
            message=f"Invalid PDF format: {str(error)}",
            document_id=document_id
        )

    if matches('corrupt', 'damaged', 'malformed'):
        return CorruptedPDFError(
            message=f"PDF file is corrupted: {str(error)}",
            document_id=document_id
        )

    # Fall back to the exception's type name (catches PyPDF2 PdfReadError).
    original_type = type(error).__name__
    if 'PdfReadError' in original_type:
        return CorruptedPDFError(
            message=f"Failed to read PDF: {str(error)}",
            document_id=document_id,
            details={'original_error_type': original_type}
        )

    # Generic PDF error for anything we could not classify.
    return PDFChunkingError(
        message=f"PDF processing error: {str(error)}",
        error_type='PDFProcessingError',
        document_id=document_id,
        recoverable=False,
        details={'original_error_type': original_type}
    )
def create_error_response(
    document_id: str,
    error: PDFChunkingError
) -> Dict[str, Any]:
    """
    Build the standardized error payload returned by the Lambda handler.

    Args:
        document_id: Document identifier.
        error: The PDFChunkingError to serialize.

    Returns:
        Response dict with `requiresChunking` forced to False and a
        structured `error` object describing the failure.
    """
    error_payload = {
        'type': error.error_type,
        'message': error.message,
        'recoverable': error.recoverable,
        'details': error.details
    }
    return {
        'documentId': document_id,
        'requiresChunking': False,
        'error': error_payload
    }
def log_error(
    error: PDFChunkingError,
    include_stack_trace: bool = True
) -> None:
    """
    Emit a structured log record for a PDFChunkingError.

    Args:
        error: The PDFChunkingError to log.
        include_stack_trace: Whether to attach the current traceback.
    """
    structured_fields = {
        'documentId': error.document_id or 'unknown',
        'errorType': error.error_type,
        'errorMessage': error.message,
        'recoverable': error.recoverable,
        'details': error.details
    }

    # A falsy exc_info is equivalent to omitting it, so one call covers
    # both the with- and without-traceback cases.
    logger.error(
        f"Error processing document {error.document_id}: {error.message}",
        extra=structured_fields,
        exc_info=include_stack_trace
    )
def retry_with_exponential_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 30.0,
    jitter: bool = True,
    retryable_exceptions: tuple = (S3ThrottlingError, DynamoDBWriteError)
):
    """
    Decorator factory: retry the wrapped call with exponential backoff.

    Args:
        max_retries: Maximum number of retry attempts (total calls = max_retries + 1).
        base_delay: Base delay in seconds; doubled on each attempt.
        max_delay: Upper bound on the computed delay, in seconds.
        jitter: Whether to randomize the delay (50%-150% of nominal).
        retryable_exceptions: Exception types that trigger a retry.

    Returns:
        A decorator that wraps the target function with retry logic.
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        def wrapper(*args, **kwargs) -> T:
            final_error = None

            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except retryable_exceptions as exc:
                    final_error = exc

                    if attempt == max_retries:
                        # Retry budget exhausted: log and propagate.
                        logger.error(
                            f"Max retries ({max_retries + 1}) exceeded: {str(exc)}",
                            extra={
                                'maxRetries': max_retries + 1,
                                'errorType': type(exc).__name__
                            }
                        )
                        raise

                    # Exponential backoff, capped at max_delay.
                    wait = min(base_delay * (2 ** attempt), max_delay)
                    if jitter:
                        # Spread retries out to avoid thundering herds.
                        wait = wait * (0.5 + random.random())

                    logger.warning(
                        f"Retryable error on attempt {attempt + 1}/{max_retries + 1}: "
                        f"{str(exc)}. Retrying in {wait:.2f}s",
                        extra={
                            'attempt': attempt + 1,
                            'maxRetries': max_retries + 1,
                            'delay': wait,
                            'errorType': type(exc).__name__
                        }
                    )
                    time.sleep(wait)

            # Defensive: unreachable, but re-raise if we ever fall through.
            if final_error:
                raise final_error

        return wrapper
    return decorator
def validate_pdf_magic_bytes(data: bytes, document_id: Optional[str] = None) -> None:
    """
    Validate that the file data begins with the PDF signature (%PDF-).

    Args:
        data: Raw file bytes.
        document_id: Document identifier used in error reporting.

    Raises:
        InvalidPDFFormatError: If data is empty, too short, or does not
            start with the PDF magic bytes.
    """
    if not data or len(data) < 5:
        raise InvalidPDFFormatError(
            message="File is empty or too small to be a valid PDF",
            document_id=document_id,
            details={'file_size': len(data) if data else 0}
        )

    if data.startswith(b'%PDF-'):
        return

    # Best-effort identification of what the file actually is, to make
    # the error message and details more actionable.
    known_signatures = (
        (b'PK\x03\x04', 'ZIP/Office document'),
        (b'<html', 'HTML'),
        (b'<!DOC', 'HTML'),
        (b'RIFF', 'RIFF (audio/video)'),
        (b'\xff\xd8\xff', 'JPEG image'),
        (b'\x89PNG\r\n\x1a\n', 'PNG image'),
    )
    file_type = 'unknown'
    for signature, label in known_signatures:
        if data.startswith(signature):
            file_type = label
            break

    raise InvalidPDFFormatError(
        message=f"File is not a valid PDF. Expected PDF magic bytes (%PDF-), "
                f"but found {file_type} format",
        document_id=document_id,
        details={
            'detected_type': file_type,
            'first_bytes': data[:10].hex() if data else None
        }
    )