@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +2537 -204
- package/lib/document-processing/adapter/adapter.d.ts +4 -2
- package/lib/document-processing/adapter/adapter.js +1 -1
- package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
- package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
- package/lib/document-processing/agentic-document-processing.d.ts +4 -0
- package/lib/document-processing/agentic-document-processing.js +20 -10
- package/lib/document-processing/base-document-processing.d.ts +54 -2
- package/lib/document-processing/base-document-processing.js +136 -82
- package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
- package/lib/document-processing/bedrock-document-processing.js +717 -77
- package/lib/document-processing/chunking-config.d.ts +614 -0
- package/lib/document-processing/chunking-config.js +5 -0
- package/lib/document-processing/default-document-processing-config.js +1 -1
- package/lib/document-processing/index.d.ts +1 -0
- package/lib/document-processing/index.js +2 -1
- package/lib/document-processing/resources/aggregation/handler.py +567 -0
- package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
- package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
- package/lib/document-processing/resources/cleanup/handler.py +276 -0
- package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
- package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
- package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
- package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
- package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
- package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
- package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
- package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
- package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
- package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
- package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
- package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
- package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
- package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
- package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
- package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
- package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
- package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
- package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
- package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
- package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
- package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
- package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
- package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
- package/lib/document-processing/tests/base-document-processing.test.js +114 -8
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
- package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
- package/lib/document-processing/tests/chunking-config.test.js +238 -0
- package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
- package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
- package/lib/framework/agents/base-agent.js +1 -1
- package/lib/framework/agents/batch-agent.js +1 -1
- package/lib/framework/agents/default-agent-config.js +1 -1
- package/lib/framework/bedrock/bedrock.js +1 -1
- package/lib/framework/custom-resource/default-runtimes.js +1 -1
- package/lib/framework/foundation/access-log.js +1 -1
- package/lib/framework/foundation/eventbridge-broker.js +1 -1
- package/lib/framework/foundation/network.js +1 -1
- package/lib/framework/tests/access-log.test.js +5 -2
- package/lib/framework/tests/batch-agent.test.js +5 -2
- package/lib/framework/tests/bedrock.test.js +5 -2
- package/lib/framework/tests/eventbridge-broker.test.js +5 -2
- package/lib/framework/tests/framework-nag.test.js +16 -8
- package/lib/framework/tests/network.test.js +9 -4
- package/lib/tsconfig.tsbuildinfo +1 -1
- package/lib/utilities/data-loader.js +1 -1
- package/lib/utilities/lambda-iam-utils.js +1 -1
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
- package/lib/utilities/observability/default-observability-config.js +1 -1
- package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
- package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
- package/lib/utilities/observability/powertools-config.d.ts +10 -1
- package/lib/utilities/observability/powertools-config.js +19 -3
- package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
- package/lib/utilities/test-utils.d.ts +43 -0
- package/lib/utilities/test-utils.js +56 -0
- package/lib/utilities/tests/data-loader-nag.test.js +3 -2
- package/lib/utilities/tests/data-loader.test.js +3 -2
- package/lib/webapp/frontend-construct.js +1 -1
- package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
- package/lib/webapp/tests/frontend-construct.test.js +3 -2
- package/package.json +6 -5
- package/lib/document-processing/resources/default-error-handler/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,958 @@
+"""
+PDF Analysis and Chunking Lambda Handler.
+
+This Lambda function is the first step in the Step Functions workflow for chunked
+document processing. It analyzes PDFs to determine if chunking is needed, and if so,
+splits the PDF into chunks and uploads them to S3.
+
+This is a single Lambda that does both analysis and chunking to avoid downloading
+the PDF twice.
+"""
+
+import json
+import logging
+import os
+import time
+from typing import Dict, Any, Optional
+import boto3
+from botocore.exceptions import ClientError
+
+# Import local modules
+from token_estimation import analyze_pdf_tokens, estimate_tokens_fast
+from chunking_strategies import (
+    calculate_chunks_fixed_pages,
+    calculate_chunks_token_based,
+    calculate_chunks_hybrid,
+    validate_configuration,
+    ConfigurationError
+)
+from error_handling import (
+    PDFChunkingError,
+    InvalidPDFFormatError,
+    CorruptedPDFError,
+    EncryptedPDFError,
+    S3AccessDeniedError,
+    S3NotFoundError,
+    S3ThrottlingError,
+    DynamoDBWriteError,
+    ChunkingTimeoutError,
+    classify_s3_error,
+    classify_pdf_error,
+    create_error_response as create_typed_error_response,
+    log_error,
+    retry_with_exponential_backoff,
+    validate_pdf_magic_bytes
+)
+from metrics import (
+    get_metrics,
+    emit_chunking_metrics,
+    emit_chunking_operation,
+    emit_chunk_count,
+    emit_tokens_per_chunk,
+    emit_chunk_processing_time,
+    emit_strategy_usage,
+    timed_operation
+)
+from structured_logging import (
+    get_logger,
+    log_strategy_selection,
+    log_chunking_operation,
+    with_correlation_id,
+    is_observability_enabled
+)
+
+# Configure structured logging
+structured_logger = get_logger(__name__)
+
+# Keep standard logger for backward compatibility
+logger = logging.getLogger()
+logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
+
+# Initialize AWS clients
+s3_client = boto3.client('s3')
+
+# Get Powertools metrics instance
+metrics = get_metrics()
+
+
+@metrics.log_metrics
+@with_correlation_id
+def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    Lambda handler for PDF analysis and chunking.
+
+    This function:
+    1. Parses the event for document metadata and configuration
+    2. Analyzes the PDF to determine token count and page count
+    3. Determines if chunking is required based on strategy and thresholds
+    4. If no chunking needed: returns analysis metadata only
+    5. If chunking needed: splits PDF and uploads chunks to S3
+
+    Args:
+        event: Step Functions event with:
+            - documentId: Unique document identifier
+            - contentType: Type of content (should be 'file')
+            - content: Object with bucket, key, location, filename
+            - config: Optional chunking configuration
+        context: Lambda context object
+
+    Returns:
+        NoChunkingResponse or ChunkingResponse based on analysis
+
+    Raises:
+        Exception: For any processing errors (caught and returned as error response)
+    """
+    start_time = time.time()
+
+    try:
+        # Parse event
+        document_id = event.get('documentId')
+        content_type = event.get('contentType')
+        content = event.get('content', {})
+        bucket = content.get('bucket')
+        key = content.get('key')
+        config = event.get('config', {})
+
+        # Validate required fields
+        if not document_id:
+            raise ValueError("Missing required field: documentId")
+        if not bucket or not key:
+            raise ValueError("Missing required fields: content.bucket or content.key")
+        if content_type and content_type != 'file':
+            raise ValueError(f"Unsupported contentType: {content_type}. Only 'file' is supported.")
+
+        # Set document context for structured logging
+        structured_logger.set_document_context(document_id=document_id)
+
+        # Validate file extension (should be PDF)
+        if not key.lower().endswith('.pdf'):
+            structured_logger.warning(
+                f"File {key} does not have .pdf extension. Will validate PDF format using magic bytes.",
+                extra={'bucket': bucket, 'key': key}
+            )
+
+        structured_logger.info(
+            f"Processing document {document_id} from s3://{bucket}/{key}",
+            extra={
+                'bucket': bucket,
+                'key': key,
+                'strategy': config.get('strategy', 'hybrid'),
+                'source': event.get('source', 'unknown'),
+                'event': 'processing_started'
+            }
+        )
+
+        # Merge configuration with environment variables and defaults
+        merged_config = _merge_configuration(config)
+        strategy = merged_config['strategy']
+        processing_mode = merged_config.get('processingMode', 'parallel')
+
+        # Add metrics dimension for strategy
+        metrics.add_dimension(name="Strategy", value=strategy)
+
+        # Validate configuration
+        if not validate_configuration(merged_config):
+            raise ConfigurationError(f"Invalid chunking configuration: {merged_config}")
+
+        # Analyze PDF tokens
+        structured_logger.info(
+            f"Analyzing PDF tokens for document {document_id}",
+            extra={'event': 'token_analysis_started'}
+        )
+        token_analysis = analyze_pdf_tokens(bucket, key, merged_config)
+
+        # Log strategy selection with full context
+        log_strategy_selection(
+            logger=structured_logger,
+            strategy=strategy,
+            requires_chunking=token_analysis['requires_chunking'],
+            reason=_get_no_chunking_reason(token_analysis, merged_config) if not token_analysis['requires_chunking'] else f"Document exceeds thresholds for {strategy} strategy",
+            document_pages=token_analysis['total_pages'],
+            document_tokens=token_analysis['total_tokens'],
+            page_threshold=merged_config.get('pageThreshold', 100),
+            token_threshold=merged_config.get('tokenThreshold', 150000),
+            page_threshold_exceeded=token_analysis['total_pages'] > merged_config.get('pageThreshold', 100),
+            token_threshold_exceeded=token_analysis['total_tokens'] > merged_config.get('tokenThreshold', 150000)
+        )
+
+        # Check if chunking is required
+        if not token_analysis['requires_chunking']:
+            # No chunking needed - return analysis only
+            structured_logger.info(
+                f"Document {document_id} does not require chunking",
+                extra={
+                    'event': 'chunking_not_required',
+                    'totalPages': token_analysis['total_pages'],
+                    'totalTokens': token_analysis['total_tokens']
+                }
+            )
+
+            # Emit metrics for non-chunked document
+            processing_time_ms = (time.time() - start_time) * 1000
+            emit_chunking_metrics(
+                document_id=document_id,
+                strategy=strategy,
+                requires_chunking=False,
+                processing_time_ms=processing_time_ms,
+                processing_mode=processing_mode
+            )
+
+            log_chunking_operation(
+                logger=structured_logger,
+                operation='analyze',
+                document_id=document_id,
+                success=True,
+                duration_ms=processing_time_ms,
+                extra={'requiresChunking': False}
+            )
+
+            return {
+                'documentId': document_id,
+                'requiresChunking': False,
+                'tokenAnalysis': {
+                    'totalTokens': token_analysis['total_tokens'],
+                    'totalPages': token_analysis['total_pages'],
+                    'avgTokensPerPage': token_analysis['avg_tokens_per_page']
+                },
+                'reason': _get_no_chunking_reason(token_analysis, merged_config)
+            }
+
+        # Chunking is required - proceed to split PDF
+        structured_logger.info(
+            f"Document {document_id} requires chunking",
+            extra={
+                'event': 'chunking_required',
+                'totalPages': token_analysis['total_pages'],
+                'totalTokens': token_analysis['total_tokens']
+            }
+        )
+
+        # Calculate chunk boundaries
+        chunks_metadata = _calculate_chunk_boundaries(
+            token_analysis,
+            merged_config
+        )
+
+        # Split PDF and upload chunks
+        chunk_results = _split_and_upload_pdf(
+            document_id,
+            bucket,
+            key,
+            chunks_metadata,
+            token_analysis
+        )
+
+        # Calculate tokens per chunk for metrics
+        tokens_per_chunk = [chunk.get('estimatedTokens', 0) for chunk in chunk_results]
+
+        # Emit metrics for chunked document
+        processing_time_ms = (time.time() - start_time) * 1000
+        emit_chunking_metrics(
+            document_id=document_id,
+            strategy=strategy,
+            requires_chunking=True,
+            chunk_count=len(chunk_results),
+            tokens_per_chunk=tokens_per_chunk,
+            processing_time_ms=processing_time_ms,
+            processing_mode=processing_mode
+        )
+
+        # Log successful chunking operation
+        log_chunking_operation(
+            logger=structured_logger,
+            operation='split',
+            document_id=document_id,
+            chunk_count=len(chunk_results),
+            success=True,
+            duration_ms=processing_time_ms,
+            extra={
+                'strategy': strategy,
+                'totalPages': token_analysis['total_pages'],
+                'totalTokens': token_analysis['total_tokens']
+            }
+        )
+
+        # Return chunking response
+        return {
+            'documentId': document_id,
+            'requiresChunking': True,
+            'tokenAnalysis': {
+                'totalTokens': token_analysis['total_tokens'],
+                'totalPages': token_analysis['total_pages'],
+                'avgTokensPerPage': token_analysis['avg_tokens_per_page'],
+                'tokensPerPage': token_analysis['tokens_per_page']
+            },
+            'strategy': merged_config['strategy'],
+            'chunks': chunk_results,
+            'config': {
+                'strategy': merged_config['strategy'],
+                'totalPages': token_analysis['total_pages'],
+                'totalTokens': token_analysis['total_tokens'],
+                **_get_strategy_config(merged_config)
+            }
+        }
+
+    except ConfigurationError as e:
+        error = PDFChunkingError(
+            message=str(e),
+            error_type='ConfigurationError',
+            document_id=event.get('documentId'),
+            recoverable=False
+        )
+        log_error(error, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            error
+        )
+
+    except InvalidPDFFormatError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except CorruptedPDFError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except EncryptedPDFError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except S3AccessDeniedError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except S3NotFoundError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except S3ThrottlingError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except ClientError as e:
+        # Classify the S3 error into a specific type
+        classified_error = classify_s3_error(
+            e,
+            document_id=event.get('documentId'),
+            bucket=event.get('content', {}).get('bucket'),
+            key=event.get('content', {}).get('key')
+        )
+        log_error(classified_error, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            classified_error
+        )
+
+    except PDFChunkingError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except Exception as e:
+        # Classify unknown errors
+        document_id = event.get('documentId', 'unknown')
+
+        # Check if it's a PDF-related error
+        error_str = str(e).lower()
+        if any(keyword in error_str for keyword in ['pdf', 'pypdf', 'page', 'reader']):
+            classified_error = classify_pdf_error(e, document_id)
+        else:
+            classified_error = PDFChunkingError(
+                message=str(e),
+                error_type='UnexpectedError',
+                document_id=document_id,
+                recoverable=False,
+                details={'original_error_type': type(e).__name__}
+            )
+
+        log_error(classified_error, include_stack_trace=True)
+        return create_typed_error_response(
+            document_id,
+            classified_error
+        )
+
+
+def _merge_configuration(config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Merge configuration from event, environment variables, and defaults.
+
+    Precedence (highest to lowest):
+    1. Event configuration (per-document)
+    2. Environment variables
+    3. Default values
+
+    Args:
+        config: Configuration from event
+
+    Returns:
+        Merged configuration dictionary
+    """
+    # Default configuration
+    # Note: maxPagesPerChunk is 99 (not 100) because Bedrock has a hard limit of 100 pages
+    # per PDF, and we need a safety margin to avoid hitting that limit exactly
+    merged = {
+        'strategy': 'hybrid',
+        'pageThreshold': 100,
+        'tokenThreshold': 150000,
+        'chunkSize': 50,
+        'overlapPages': 5,
+        'maxTokensPerChunk': 100000,
+        'overlapTokens': 5000,
+        'targetTokensPerChunk': 80000,
+        'maxPagesPerChunk': 99,
+        'processingMode': 'parallel',
+        'maxConcurrency': 10
+    }
+
+    # Override with environment variables
+    env_mapping = {
+        'CHUNKING_STRATEGY': 'strategy',
+        'PAGE_THRESHOLD': 'pageThreshold',
+        'TOKEN_THRESHOLD': 'tokenThreshold',
+        'CHUNK_SIZE': 'chunkSize',
+        'OVERLAP_PAGES': 'overlapPages',
+        'MAX_TOKENS_PER_CHUNK': 'maxTokensPerChunk',
+        'OVERLAP_TOKENS': 'overlapTokens',
+        'TARGET_TOKENS_PER_CHUNK': 'targetTokensPerChunk',
+        'MAX_PAGES_PER_CHUNK': 'maxPagesPerChunk',
+        'PROCESSING_MODE': 'processingMode',
+        'MAX_CONCURRENCY': 'maxConcurrency'
+    }
+
+    for env_var, config_key in env_mapping.items():
+        env_value = os.environ.get(env_var)
+        if env_value is not None:
+            # Convert to appropriate type
+            if config_key in ['strategy', 'processingMode']:
+                merged[config_key] = env_value
+            else:
+                merged[config_key] = int(env_value)
+
+    # Override with event configuration (highest precedence)
+    for key, value in config.items():
+        if value is not None:
+            merged[key] = value
+
+    # Normalize key names (support both camelCase and snake_case)
+    normalized = {}
+    key_mapping = {
+        'chunkingStrategy': 'strategy',
+        'chunking_strategy': 'strategy',
+        'page_threshold': 'pageThreshold',
+        'token_threshold': 'tokenThreshold',
+        'chunk_size': 'chunkSize',
+        'overlap_pages': 'overlapPages',
+        'max_tokens_per_chunk': 'maxTokensPerChunk',
+        'overlap_tokens': 'overlapTokens',
+        'target_tokens_per_chunk': 'targetTokensPerChunk',
+        'max_pages_per_chunk': 'maxPagesPerChunk',
+        'processing_mode': 'processingMode',
+        'max_concurrency': 'maxConcurrency'
+    }
+
+    for key, value in merged.items():
+        normalized_key = key_mapping.get(key, key)
+        normalized[normalized_key] = value
+
+    return normalized
+
+
+def _get_no_chunking_reason(
+    token_analysis: Dict[str, Any],
+    config: Dict[str, Any]
+) -> str:
+    """
+    Generate human-readable reason for not chunking.
+
+    Args:
+        token_analysis: Token analysis results
+        config: Chunking configuration
+
+    Returns:
+        Reason string
+    """
+    strategy = config.get('strategy', 'hybrid')
+    total_pages = token_analysis['total_pages']
+    total_tokens = token_analysis['total_tokens']
+    page_threshold = config.get('pageThreshold', 100)
+    token_threshold = config.get('tokenThreshold', 150000)
+
+    if strategy == 'fixed-pages':
+        return (
+            f"Document has {total_pages} pages, "
+            f"below threshold of {page_threshold} (fixed-pages strategy)"
+        )
+    elif strategy == 'token-based':
+        return (
+            f"Document has {total_tokens} tokens, "
+            f"below threshold of {token_threshold} (token-based strategy)"
+        )
+    else:  # hybrid
+        return (
+            f"Document has {total_pages} pages and {total_tokens} tokens, "
+            f"below thresholds of {page_threshold} pages and {token_threshold} tokens (hybrid strategy)"
+        )
+
+
+def _calculate_chunk_boundaries(
+    token_analysis: Dict[str, Any],
+    config: Dict[str, Any]
+) -> list:
+    """
+    Calculate chunk boundaries based on strategy.
+
+    Args:
+        token_analysis: Token analysis results
+        config: Chunking configuration
+
+    Returns:
+        List of chunk metadata dictionaries
+    """
+    strategy = config['strategy']
+    total_pages = token_analysis['total_pages']
+    tokens_per_page = token_analysis['tokens_per_page']
+
+    if strategy == 'fixed-pages':
+        return calculate_chunks_fixed_pages(
+            total_pages,
+            config['chunkSize'],
+            config['overlapPages']
+        )
+    elif strategy == 'token-based':
+        return calculate_chunks_token_based(
+            tokens_per_page,
+            config['maxTokensPerChunk'],
+            config['overlapTokens']
+        )
+    else:  # hybrid
+        return calculate_chunks_hybrid(
+            tokens_per_page,
+            config['targetTokensPerChunk'],
+            config['maxPagesPerChunk'],
+            config['overlapTokens']
+        )
+
+
+def _get_strategy_config(config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Extract strategy-specific configuration.
+
+    Args:
+        config: Full configuration
+
+    Returns:
+        Strategy-specific configuration
+    """
+    strategy = config['strategy']
+
+    if strategy == 'fixed-pages':
+        return {
+            'chunkSize': config['chunkSize'],
+            'overlapPages': config['overlapPages'],
+            'pageThreshold': config['pageThreshold']
+        }
+    elif strategy == 'token-based':
+        return {
+            'maxTokensPerChunk': config['maxTokensPerChunk'],
+            'overlapTokens': config['overlapTokens'],
+            'tokenThreshold': config['tokenThreshold']
+        }
+    else:  # hybrid
+        return {
+            'targetTokensPerChunk': config['targetTokensPerChunk'],
+            'maxPagesPerChunk': config['maxPagesPerChunk'],
+            'overlapTokens': config['overlapTokens'],
+            'pageThreshold': config['pageThreshold'],
+            'tokenThreshold': config['tokenThreshold']
+        }
+
+
+def _create_error_response(
+    document_id: str,
+    error_type: str,
+    error_message: str
+) -> Dict[str, Any]:
+    """
+    Create standardized error response.
+
+    Args:
+        document_id: Document identifier
+        error_type: Type of error
+        error_message: Error message
+
+    Returns:
+        Error response dictionary
+    """
+    return {
+        'documentId': document_id,
+        'requiresChunking': False,
+        'error': {
+            'type': error_type,
+            'message': error_message
+        }
+    }
+
+
+
+def _split_and_upload_pdf(
+    document_id: str,
+    bucket: str,
+    key: str,
+    chunks_metadata: list,
+    token_analysis: Dict[str, Any]
+) -> list:
+    """
+    Split PDF into chunks and upload to S3.
+
+    This function:
+    1. Downloads the PDF from S3 using streaming
+    2. Splits the PDF based on chunk boundaries
+    3. Generates chunk IDs: {documentId}_chunk_{index}
+    4. Uploads chunks to S3 chunks/{documentId}/ prefix
+    5. Generates ChunkMetadata for each chunk
+
+    Args:
+        document_id: Document identifier
+        bucket: S3 bucket name
+        key: S3 object key for source PDF
+        chunks_metadata: List of chunk boundary metadata
+        token_analysis: Token analysis results
+
+    Returns:
+        List of ChunkMetadata dictionaries
+
+    Raises:
+        InvalidPDFFormatError: If file is not a valid PDF
+        CorruptedPDFError: If PDF is corrupted
+        EncryptedPDFError: If PDF is encrypted
+        S3AccessDeniedError: If S3 access is denied
+        S3NotFoundError: If S3 object is not found
+    """
+    try:
+        import PyPDF2
+        import io
+
+        logger.info(f"Splitting PDF {document_id} into {len(chunks_metadata)} chunks")
+
+        # Download PDF from S3 using streaming
+        try:
+            pdf_obj = s3_client.get_object(Bucket=bucket, Key=key)
+            pdf_bytes = pdf_obj['Body'].read()
+
+            # Validate file is actually a PDF by checking magic bytes
+            validate_pdf_magic_bytes(pdf_bytes, document_id)
+
+        except ClientError as e:
+            raise classify_s3_error(e, document_id, bucket, key)
+
+        # Attempt to read PDF
+        try:
+            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+
+            # Validate PDF is not encrypted
+            if pdf_reader.is_encrypted:
+                raise EncryptedPDFError(
+                    message=f"PDF {document_id} is encrypted. Encrypted PDFs are not supported.",
+                    document_id=document_id
+                )
+
+            # Validate page count matches analysis
+            actual_pages = len(pdf_reader.pages)
+            expected_pages = token_analysis['total_pages']
+            if actual_pages != expected_pages:
+                logger.warning(
+                    f"Page count mismatch for {document_id}: "
+                    f"expected {expected_pages}, got {actual_pages}"
+                )
+
+        except PyPDF2.errors.PdfReadError as e:
+            raise CorruptedPDFError(
+                message=f"Invalid or corrupted PDF format for {document_id}: {str(e)}",
+                document_id=document_id,
+                details={'original_error': str(e)}
+            )
+        except EncryptedPDFError:
+            raise
+        except Exception as e:
+            if "encrypted" in str(e).lower():
+                raise EncryptedPDFError(
+                    message=f"PDF {document_id} is encrypted: {str(e)}",
+                    document_id=document_id
+                )
+            raise classify_pdf_error(e, document_id)
+
+        chunk_results = []
+        total_chunks = len(chunks_metadata)
+        corrupted_pages = []
+
+        for chunk_meta in chunks_metadata:
+            chunk_index = chunk_meta['chunk_index']
+            start_page = chunk_meta['start_page']
+            end_page = chunk_meta['end_page']
+            page_count = chunk_meta['page_count']
+
+            # Generate chunk ID
+            chunk_id = f"{document_id}_chunk_{chunk_index}"
+
+            logger.info(
+                f"Creating chunk {chunk_index + 1}/{total_chunks}: "
+                f"pages {start_page}-{end_page} ({page_count} pages)"
+            )
+
+            # Create new PDF for this chunk
+            pdf_writer = PyPDF2.PdfWriter()
+
+            # Add pages to chunk (end_page is inclusive)
+            pages_added = 0
+            for page_num in range(start_page, end_page + 1):
+                if page_num < len(pdf_reader.pages):
+                    try:
+                        pdf_writer.add_page(pdf_reader.pages[page_num])
+                        pages_added += 1
+                    except Exception as e:
+                        # Handle corrupted pages - skip and log warning
+                        logger.warning(
+                            f"Skipping corrupted page {page_num} in document {document_id}: {str(e)}"
+                        )
+                        corrupted_pages.append(page_num)
+                        continue
+
+            # Skip chunk if no pages were successfully added
+            if pages_added == 0:
+                logger.warning(
+                    f"Skipping chunk {chunk_index} for document {document_id}: "
+                    f"no valid pages in range {start_page}-{end_page}"
+                )
+                continue
+
+            # Write chunk to bytes
+            try:
+                chunk_bytes = io.BytesIO()
+                pdf_writer.write(chunk_bytes)
+                chunk_bytes.seek(0)
+            except Exception as e:
+                logger.error(
+                    f"Failed to write chunk {chunk_index} for document {document_id}: {str(e)}"
+                )
+                raise CorruptedPDFError(
+                    message=f"Failed to create chunk {chunk_index}: {str(e)}",
+                    document_id=document_id,
+                    details={'chunk_index': chunk_index}
+                )
+
+            # Upload chunk to S3 with retry logic
+            # Chunks are stored in a folder named after the document ID for organization
+            chunk_key = f"chunks/{document_id}/{chunk_id}.pdf"
+            _upload_chunk_with_retry(
+                bucket,
+                chunk_key,
+                chunk_bytes.getvalue(),
+                document_id,
+                chunk_index
+            )
+
+            # Calculate estimated tokens for this chunk
+            estimated_tokens = sum(
+                token_analysis['tokens_per_page'][i]
+                for i in range(start_page, min(end_page + 1, len(token_analysis['tokens_per_page'])))
+            )
+
+            # Create chunk metadata
+            chunk_result = {
+                'chunkId': chunk_id,
+                'chunkIndex': chunk_index,
+                'totalChunks': total_chunks,
+                'startPage': start_page,
+                'endPage': end_page,
+                'pageCount': pages_added,  # Use actual pages added
+                'estimatedTokens': estimated_tokens,
+                'bucket': bucket,
+                'key': chunk_key
+            }
+
+            chunk_results.append(chunk_result)
+
+            logger.info(
+                f"Successfully created chunk {chunk_index + 1}/{total_chunks}: "
+                f"{chunk_id} with {estimated_tokens} tokens"
+            )
+
+        # Log summary of corrupted pages
+        if corrupted_pages:
+            logger.warning(
+                f"Document {document_id} had {len(corrupted_pages)} corrupted pages: "
+                f"{corrupted_pages[:10]}{'...' if len(corrupted_pages) > 10 else ''}"
+            )
+
+        # Ensure at least one chunk was created
+        if not chunk_results:
+            raise CorruptedPDFError(
+                message=f"Failed to create any valid chunks for document {document_id}. "
+                        f"All pages may be corrupted.",
+                document_id=document_id,
+                details={'corrupted_pages': corrupted_pages}
+            )
+
+        logger.info(
+            f"Successfully split document {document_id} into {len(chunk_results)} chunks"
+        )
+
+        return chunk_results
+
+    except ImportError as e:
+        logger.error(f"PyPDF2 not available: {str(e)}")
+        raise PDFChunkingError(
+            message="PyPDF2 is required for PDF processing",
+            error_type='DependencyError',
+            document_id=document_id,
+            recoverable=False
+        )
+
+    except (InvalidPDFFormatError, CorruptedPDFError, EncryptedPDFError,
+            S3AccessDeniedError, S3NotFoundError, S3ThrottlingError, PDFChunkingError):
+        # Re-raise our custom errors
+        raise
+
+    except Exception as e:
+        logger.error(
+            f"Failed to split PDF {document_id}: {str(e)}",
+            exc_info=True
+        )
+        raise classify_pdf_error(e, document_id)
+
+
+def _upload_chunk_with_retry(
+    bucket: str,
+    key: str,
+    data: bytes,
+    document_id: str,
+    chunk_index: int,
+    max_retries: int = 3
+) -> None:
+    """
+    Upload chunk to S3 with exponential backoff retry.
+
+    Args:
+        bucket: S3 bucket name
+        key: S3 object key
+        data: Chunk data bytes
+        document_id: Document identifier (for logging)
+        chunk_index: Chunk index (for logging)
+        max_retries: Maximum number of retry attempts
+
+    Raises:
+        S3AccessDeniedError: If access is denied
+        S3ThrottlingError: If throttled after all retries
+        PDFChunkingError: For other S3 errors
+    """
+    import time
+    import random
+
+    last_error = None
+
+    for attempt in range(max_retries):
+        try:
+            s3_client.put_object(
+                Bucket=bucket,
+                Key=key,
+                Body=data,
+                ContentType='application/pdf',
+                Metadata={
+                    'documentId': document_id,
+                    'chunkIndex': str(chunk_index)
+                }
+            )
+
+            if attempt > 0:
+                logger.info(
+                    f"Successfully uploaded chunk {chunk_index} for document {document_id} "
+                    f"on attempt {attempt + 1}"
+                )
+
+            return
+
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            last_error = e
+
+            # Don't retry on access denied or invalid bucket
+            if error_code in ['AccessDenied', 'NoSuchBucket', 'InvalidBucketName']:
+                logger.error(
+                    f"Non-retryable S3 error uploading chunk {chunk_index} "
+                    f"for document {document_id}: {error_code}"
+                )
+                raise classify_s3_error(e, document_id, bucket, key)
+
+            # Retry on throttling or server errors
+            if attempt < max_retries - 1:
+                # Exponential backoff with jitter: 1s, 2s, 4s + random jitter
+                base_wait = 2 ** attempt
+                jitter = random.uniform(0, 0.5)
+                wait_time = base_wait + jitter
+
+                logger.warning(
+                    f"S3 error uploading chunk {chunk_index} for document {document_id}: "
+                    f"{error_code}. Retrying in {wait_time:.2f}s (attempt {attempt + 1}/{max_retries})"
+                )
+                time.sleep(wait_time)
+            else:
+                logger.error(
+                    f"Failed to upload chunk {chunk_index} for document {document_id} "
+                    f"after {max_retries} attempts: {error_code}"
+                )
+                raise classify_s3_error(e, document_id, bucket, key)
+
+    # Should not reach here, but handle edge case
+    if last_error:
+        raise classify_s3_error(last_error, document_id, bucket, key)
+
+
+
+def _is_valid_pdf(data: bytes) -> bool:
+    """
+    Validate that the file is actually a PDF by checking magic bytes.
+
+    PDF files must start with the magic bytes "%PDF-" (hex: 25 50 44 46 2D).
+    This is a quick check before attempting to parse the file with PyPDF2.
+
+    Args:
+        data: File data bytes
+
+    Returns:
+        True if file starts with PDF magic bytes, False otherwise
+
+    Examples:
+        >>> _is_valid_pdf(b'%PDF-1.4\\n...')
+        True
+        >>> _is_valid_pdf(b'<html>...</html>')
+        False
+        >>> _is_valid_pdf(b'')
+        False
+    """
+    if not data or len(data) < 5:
+        return False
+
+    # Check for PDF magic bytes: %PDF-
+    # This is the standard PDF file signature
+    pdf_magic = b'%PDF-'
+    return data[:5] == pdf_magic