@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
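The headline change in this release is the new PDF chunking pipeline under package/lib/document-processing/resources/pdf-chunking/ (items 25-41 above). For orientation, here is a minimal sketch of the Step Functions input event that the handler.py shown in the diff below expects. The field names follow the handler's own docstring; the bucket, key, and config values are hypothetical placeholders, not values from the package.

# Hypothetical input event for the pdf-chunking handler shown in the diff below.
# Field names follow the handler docstring; all concrete values are placeholders.
sample_event = {
    "documentId": "doc-123",                # unique document identifier
    "contentType": "file",                  # only 'file' is supported
    "content": {
        "bucket": "example-ingest-bucket",  # hypothetical bucket
        "key": "uploads/contract.pdf",      # hypothetical key
        "location": "s3://example-ingest-bucket/uploads/contract.pdf",
        "filename": "contract.pdf",
    },
    "config": {                             # optional; overrides env vars and defaults
        "strategy": "hybrid",               # 'fixed-pages' | 'token-based' | 'hybrid'
        "pageThreshold": 100,
        "tokenThreshold": 150000,
    },
}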
package/lib/document-processing/resources/pdf-chunking/handler.py (new file, +958 -0)
@@ -0,0 +1,958 @@
+"""
+PDF Analysis and Chunking Lambda Handler.
+
+This Lambda function is the first step in the Step Functions workflow for chunked
+document processing. It analyzes PDFs to determine if chunking is needed, and if so,
+splits the PDF into chunks and uploads them to S3.
+
+This is a single Lambda that does both analysis and chunking to avoid downloading
+the PDF twice.
+"""
+
+import json
+import logging
+import os
+import time
+from typing import Dict, Any, Optional
+import boto3
+from botocore.exceptions import ClientError
+
+# Import local modules
+from token_estimation import analyze_pdf_tokens, estimate_tokens_fast
+from chunking_strategies import (
+    calculate_chunks_fixed_pages,
+    calculate_chunks_token_based,
+    calculate_chunks_hybrid,
+    validate_configuration,
+    ConfigurationError
+)
+from error_handling import (
+    PDFChunkingError,
+    InvalidPDFFormatError,
+    CorruptedPDFError,
+    EncryptedPDFError,
+    S3AccessDeniedError,
+    S3NotFoundError,
+    S3ThrottlingError,
+    DynamoDBWriteError,
+    ChunkingTimeoutError,
+    classify_s3_error,
+    classify_pdf_error,
+    create_error_response as create_typed_error_response,
+    log_error,
+    retry_with_exponential_backoff,
+    validate_pdf_magic_bytes
+)
+from metrics import (
+    get_metrics,
+    emit_chunking_metrics,
+    emit_chunking_operation,
+    emit_chunk_count,
+    emit_tokens_per_chunk,
+    emit_chunk_processing_time,
+    emit_strategy_usage,
+    timed_operation
+)
+from structured_logging import (
+    get_logger,
+    log_strategy_selection,
+    log_chunking_operation,
+    with_correlation_id,
+    is_observability_enabled
+)
+
+# Configure structured logging
+structured_logger = get_logger(__name__)
+
+# Keep standard logger for backward compatibility
+logger = logging.getLogger()
+logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
+
+# Initialize AWS clients
+s3_client = boto3.client('s3')
+
+# Get Powertools metrics instance
+metrics = get_metrics()
+
+
+@metrics.log_metrics
+@with_correlation_id
+def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    Lambda handler for PDF analysis and chunking.
+
+    This function:
+    1. Parses the event for document metadata and configuration
+    2. Analyzes the PDF to determine token count and page count
+    3. Determines if chunking is required based on strategy and thresholds
+    4. If no chunking needed: returns analysis metadata only
+    5. If chunking needed: splits PDF and uploads chunks to S3
+
+    Args:
+        event: Step Functions event with:
+            - documentId: Unique document identifier
+            - contentType: Type of content (should be 'file')
+            - content: Object with bucket, key, location, filename
+            - config: Optional chunking configuration
+        context: Lambda context object
+
+    Returns:
+        NoChunkingResponse or ChunkingResponse based on analysis
+
+    Raises:
+        Exception: For any processing errors (caught and returned as error response)
+    """
+    start_time = time.time()
+
+    try:
+        # Parse event
+        document_id = event.get('documentId')
+        content_type = event.get('contentType')
+        content = event.get('content', {})
+        bucket = content.get('bucket')
+        key = content.get('key')
+        config = event.get('config', {})
+
+        # Validate required fields
+        if not document_id:
+            raise ValueError("Missing required field: documentId")
+        if not bucket or not key:
+            raise ValueError("Missing required fields: content.bucket or content.key")
+        if content_type and content_type != 'file':
+            raise ValueError(f"Unsupported contentType: {content_type}. Only 'file' is supported.")
+
+        # Set document context for structured logging
+        structured_logger.set_document_context(document_id=document_id)
+
+        # Validate file extension (should be PDF)
+        if not key.lower().endswith('.pdf'):
+            structured_logger.warning(
+                f"File {key} does not have .pdf extension. Will validate PDF format using magic bytes.",
+                extra={'bucket': bucket, 'key': key}
+            )
+
+        structured_logger.info(
+            f"Processing document {document_id} from s3://{bucket}/{key}",
+            extra={
+                'bucket': bucket,
+                'key': key,
+                'strategy': config.get('strategy', 'hybrid'),
+                'source': event.get('source', 'unknown'),
+                'event': 'processing_started'
+            }
+        )
+
+        # Merge configuration with environment variables and defaults
+        merged_config = _merge_configuration(config)
+        strategy = merged_config['strategy']
+        processing_mode = merged_config.get('processingMode', 'parallel')
+
+        # Add metrics dimension for strategy
+        metrics.add_dimension(name="Strategy", value=strategy)
+
+        # Validate configuration
+        if not validate_configuration(merged_config):
+            raise ConfigurationError(f"Invalid chunking configuration: {merged_config}")
+
+        # Analyze PDF tokens
+        structured_logger.info(
+            f"Analyzing PDF tokens for document {document_id}",
+            extra={'event': 'token_analysis_started'}
+        )
+        token_analysis = analyze_pdf_tokens(bucket, key, merged_config)
+
+        # Log strategy selection with full context
+        log_strategy_selection(
+            logger=structured_logger,
+            strategy=strategy,
+            requires_chunking=token_analysis['requires_chunking'],
+            reason=_get_no_chunking_reason(token_analysis, merged_config) if not token_analysis['requires_chunking'] else f"Document exceeds thresholds for {strategy} strategy",
+            document_pages=token_analysis['total_pages'],
+            document_tokens=token_analysis['total_tokens'],
+            page_threshold=merged_config.get('pageThreshold', 100),
+            token_threshold=merged_config.get('tokenThreshold', 150000),
+            page_threshold_exceeded=token_analysis['total_pages'] > merged_config.get('pageThreshold', 100),
+            token_threshold_exceeded=token_analysis['total_tokens'] > merged_config.get('tokenThreshold', 150000)
+        )
+
+        # Check if chunking is required
+        if not token_analysis['requires_chunking']:
+            # No chunking needed - return analysis only
+            structured_logger.info(
+                f"Document {document_id} does not require chunking",
+                extra={
+                    'event': 'chunking_not_required',
+                    'totalPages': token_analysis['total_pages'],
+                    'totalTokens': token_analysis['total_tokens']
+                }
+            )
+
+            # Emit metrics for non-chunked document
+            processing_time_ms = (time.time() - start_time) * 1000
+            emit_chunking_metrics(
+                document_id=document_id,
+                strategy=strategy,
+                requires_chunking=False,
+                processing_time_ms=processing_time_ms,
+                processing_mode=processing_mode
+            )
+
+            log_chunking_operation(
+                logger=structured_logger,
+                operation='analyze',
+                document_id=document_id,
+                success=True,
+                duration_ms=processing_time_ms,
+                extra={'requiresChunking': False}
+            )
+
+            return {
+                'documentId': document_id,
+                'requiresChunking': False,
+                'tokenAnalysis': {
+                    'totalTokens': token_analysis['total_tokens'],
+                    'totalPages': token_analysis['total_pages'],
+                    'avgTokensPerPage': token_analysis['avg_tokens_per_page']
+                },
+                'reason': _get_no_chunking_reason(token_analysis, merged_config)
+            }
+
+        # Chunking is required - proceed to split PDF
+        structured_logger.info(
+            f"Document {document_id} requires chunking",
+            extra={
+                'event': 'chunking_required',
+                'totalPages': token_analysis['total_pages'],
+                'totalTokens': token_analysis['total_tokens']
+            }
+        )
+
+        # Calculate chunk boundaries
+        chunks_metadata = _calculate_chunk_boundaries(
+            token_analysis,
+            merged_config
+        )
+
+        # Split PDF and upload chunks
+        chunk_results = _split_and_upload_pdf(
+            document_id,
+            bucket,
+            key,
+            chunks_metadata,
+            token_analysis
+        )
+
+        # Calculate tokens per chunk for metrics
+        tokens_per_chunk = [chunk.get('estimatedTokens', 0) for chunk in chunk_results]
+
+        # Emit metrics for chunked document
+        processing_time_ms = (time.time() - start_time) * 1000
+        emit_chunking_metrics(
+            document_id=document_id,
+            strategy=strategy,
+            requires_chunking=True,
+            chunk_count=len(chunk_results),
+            tokens_per_chunk=tokens_per_chunk,
+            processing_time_ms=processing_time_ms,
+            processing_mode=processing_mode
+        )
+
+        # Log successful chunking operation
+        log_chunking_operation(
+            logger=structured_logger,
+            operation='split',
+            document_id=document_id,
+            chunk_count=len(chunk_results),
+            success=True,
+            duration_ms=processing_time_ms,
+            extra={
+                'strategy': strategy,
+                'totalPages': token_analysis['total_pages'],
+                'totalTokens': token_analysis['total_tokens']
+            }
+        )
+
+        # Return chunking response
+        return {
+            'documentId': document_id,
+            'requiresChunking': True,
+            'tokenAnalysis': {
+                'totalTokens': token_analysis['total_tokens'],
+                'totalPages': token_analysis['total_pages'],
+                'avgTokensPerPage': token_analysis['avg_tokens_per_page'],
+                'tokensPerPage': token_analysis['tokens_per_page']
+            },
+            'strategy': merged_config['strategy'],
+            'chunks': chunk_results,
+            'config': {
+                'strategy': merged_config['strategy'],
+                'totalPages': token_analysis['total_pages'],
+                'totalTokens': token_analysis['total_tokens'],
+                **_get_strategy_config(merged_config)
+            }
+        }
+
+    except ConfigurationError as e:
+        error = PDFChunkingError(
+            message=str(e),
+            error_type='ConfigurationError',
+            document_id=event.get('documentId'),
+            recoverable=False
+        )
+        log_error(error, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            error
+        )
+
+    except InvalidPDFFormatError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except CorruptedPDFError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except EncryptedPDFError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except S3AccessDeniedError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except S3NotFoundError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except S3ThrottlingError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except ClientError as e:
+        # Classify the S3 error into a specific type
+        classified_error = classify_s3_error(
+            e,
+            document_id=event.get('documentId'),
+            bucket=event.get('content', {}).get('bucket'),
+            key=event.get('content', {}).get('key')
+        )
+        log_error(classified_error, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            classified_error
+        )
+
+    except PDFChunkingError as e:
+        log_error(e, include_stack_trace=True)
+        return create_typed_error_response(
+            event.get('documentId', 'unknown'),
+            e
+        )
+
+    except Exception as e:
+        # Classify unknown errors
+        document_id = event.get('documentId', 'unknown')
+
+        # Check if it's a PDF-related error
+        error_str = str(e).lower()
+        if any(keyword in error_str for keyword in ['pdf', 'pypdf', 'page', 'reader']):
+            classified_error = classify_pdf_error(e, document_id)
+        else:
+            classified_error = PDFChunkingError(
+                message=str(e),
+                error_type='UnexpectedError',
+                document_id=document_id,
+                recoverable=False,
+                details={'original_error_type': type(e).__name__}
+            )
+
+        log_error(classified_error, include_stack_trace=True)
+        return create_typed_error_response(
+            document_id,
+            classified_error
+        )
+
+
+def _merge_configuration(config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Merge configuration from event, environment variables, and defaults.
+
+    Precedence (highest to lowest):
+    1. Event configuration (per-document)
+    2. Environment variables
+    3. Default values
+
+    Args:
+        config: Configuration from event
+
+    Returns:
+        Merged configuration dictionary
+    """
+    # Default configuration
+    # Note: maxPagesPerChunk is 99 (not 100) because Bedrock has a hard limit of 100 pages
+    # per PDF, and we need a safety margin to avoid hitting that limit exactly
+    merged = {
+        'strategy': 'hybrid',
+        'pageThreshold': 100,
+        'tokenThreshold': 150000,
+        'chunkSize': 50,
+        'overlapPages': 5,
+        'maxTokensPerChunk': 100000,
+        'overlapTokens': 5000,
+        'targetTokensPerChunk': 80000,
+        'maxPagesPerChunk': 99,
+        'processingMode': 'parallel',
+        'maxConcurrency': 10
+    }
+
+    # Override with environment variables
+    env_mapping = {
+        'CHUNKING_STRATEGY': 'strategy',
+        'PAGE_THRESHOLD': 'pageThreshold',
+        'TOKEN_THRESHOLD': 'tokenThreshold',
+        'CHUNK_SIZE': 'chunkSize',
+        'OVERLAP_PAGES': 'overlapPages',
+        'MAX_TOKENS_PER_CHUNK': 'maxTokensPerChunk',
+        'OVERLAP_TOKENS': 'overlapTokens',
+        'TARGET_TOKENS_PER_CHUNK': 'targetTokensPerChunk',
+        'MAX_PAGES_PER_CHUNK': 'maxPagesPerChunk',
+        'PROCESSING_MODE': 'processingMode',
+        'MAX_CONCURRENCY': 'maxConcurrency'
+    }
+
+    for env_var, config_key in env_mapping.items():
+        env_value = os.environ.get(env_var)
+        if env_value is not None:
+            # Convert to appropriate type
+            if config_key in ['strategy', 'processingMode']:
+                merged[config_key] = env_value
+            else:
+                merged[config_key] = int(env_value)
+
+    # Override with event configuration (highest precedence)
+    for key, value in config.items():
+        if value is not None:
+            merged[key] = value
+
+    # Normalize key names (support both camelCase and snake_case)
+    normalized = {}
+    key_mapping = {
+        'chunkingStrategy': 'strategy',
+        'chunking_strategy': 'strategy',
+        'page_threshold': 'pageThreshold',
+        'token_threshold': 'tokenThreshold',
+        'chunk_size': 'chunkSize',
+        'overlap_pages': 'overlapPages',
+        'max_tokens_per_chunk': 'maxTokensPerChunk',
+        'overlap_tokens': 'overlapTokens',
+        'target_tokens_per_chunk': 'targetTokensPerChunk',
+        'max_pages_per_chunk': 'maxPagesPerChunk',
+        'processing_mode': 'processingMode',
+        'max_concurrency': 'maxConcurrency'
+    }
+
+    for key, value in merged.items():
+        normalized_key = key_mapping.get(key, key)
+        normalized[normalized_key] = value
+
+    return normalized
+
+
+def _get_no_chunking_reason(
+    token_analysis: Dict[str, Any],
+    config: Dict[str, Any]
+) -> str:
+    """
+    Generate human-readable reason for not chunking.
+
+    Args:
+        token_analysis: Token analysis results
+        config: Chunking configuration
+
+    Returns:
+        Reason string
+    """
+    strategy = config.get('strategy', 'hybrid')
+    total_pages = token_analysis['total_pages']
+    total_tokens = token_analysis['total_tokens']
+    page_threshold = config.get('pageThreshold', 100)
+    token_threshold = config.get('tokenThreshold', 150000)
+
+    if strategy == 'fixed-pages':
+        return (
+            f"Document has {total_pages} pages, "
+            f"below threshold of {page_threshold} (fixed-pages strategy)"
+        )
+    elif strategy == 'token-based':
+        return (
+            f"Document has {total_tokens} tokens, "
+            f"below threshold of {token_threshold} (token-based strategy)"
+        )
+    else: # hybrid
+        return (
+            f"Document has {total_pages} pages and {total_tokens} tokens, "
+            f"below thresholds of {page_threshold} pages and {token_threshold} tokens (hybrid strategy)"
+        )
+
+
+def _calculate_chunk_boundaries(
+    token_analysis: Dict[str, Any],
+    config: Dict[str, Any]
+) -> list:
+    """
+    Calculate chunk boundaries based on strategy.
+
+    Args:
+        token_analysis: Token analysis results
+        config: Chunking configuration
+
+    Returns:
+        List of chunk metadata dictionaries
+    """
+    strategy = config['strategy']
+    total_pages = token_analysis['total_pages']
+    tokens_per_page = token_analysis['tokens_per_page']
+
+    if strategy == 'fixed-pages':
+        return calculate_chunks_fixed_pages(
+            total_pages,
+            config['chunkSize'],
+            config['overlapPages']
+        )
+    elif strategy == 'token-based':
+        return calculate_chunks_token_based(
+            tokens_per_page,
+            config['maxTokensPerChunk'],
+            config['overlapTokens']
+        )
+    else: # hybrid
+        return calculate_chunks_hybrid(
+            tokens_per_page,
+            config['targetTokensPerChunk'],
+            config['maxPagesPerChunk'],
+            config['overlapTokens']
+        )
+
+
+def _get_strategy_config(config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Extract strategy-specific configuration.
+
+    Args:
+        config: Full configuration
+
+    Returns:
+        Strategy-specific configuration
+    """
+    strategy = config['strategy']
+
+    if strategy == 'fixed-pages':
+        return {
+            'chunkSize': config['chunkSize'],
+            'overlapPages': config['overlapPages'],
+            'pageThreshold': config['pageThreshold']
+        }
+    elif strategy == 'token-based':
+        return {
+            'maxTokensPerChunk': config['maxTokensPerChunk'],
+            'overlapTokens': config['overlapTokens'],
+            'tokenThreshold': config['tokenThreshold']
+        }
+    else: # hybrid
+        return {
+            'targetTokensPerChunk': config['targetTokensPerChunk'],
+            'maxPagesPerChunk': config['maxPagesPerChunk'],
+            'overlapTokens': config['overlapTokens'],
+            'pageThreshold': config['pageThreshold'],
+            'tokenThreshold': config['tokenThreshold']
+        }
+
+
+def _create_error_response(
+    document_id: str,
+    error_type: str,
+    error_message: str
+) -> Dict[str, Any]:
+    """
+    Create standardized error response.
+
+    Args:
+        document_id: Document identifier
+        error_type: Type of error
+        error_message: Error message
+
+    Returns:
+        Error response dictionary
+    """
+    return {
+        'documentId': document_id,
+        'requiresChunking': False,
+        'error': {
+            'type': error_type,
+            'message': error_message
+        }
+    }
+
+
+
+def _split_and_upload_pdf(
+    document_id: str,
+    bucket: str,
+    key: str,
+    chunks_metadata: list,
+    token_analysis: Dict[str, Any]
+) -> list:
+    """
+    Split PDF into chunks and upload to S3.
+
+    This function:
+    1. Downloads the PDF from S3 using streaming
+    2. Splits the PDF based on chunk boundaries
+    3. Generates chunk IDs: {documentId}_chunk_{index}
+    4. Uploads chunks to S3 chunks/{documentId}/ prefix
+    5. Generates ChunkMetadata for each chunk
+
+    Args:
+        document_id: Document identifier
+        bucket: S3 bucket name
+        key: S3 object key for source PDF
+        chunks_metadata: List of chunk boundary metadata
+        token_analysis: Token analysis results
+
+    Returns:
+        List of ChunkMetadata dictionaries
+
+    Raises:
+        InvalidPDFFormatError: If file is not a valid PDF
+        CorruptedPDFError: If PDF is corrupted
+        EncryptedPDFError: If PDF is encrypted
+        S3AccessDeniedError: If S3 access is denied
+        S3NotFoundError: If S3 object is not found
+    """
+    try:
+        import PyPDF2
+        import io
+
+        logger.info(f"Splitting PDF {document_id} into {len(chunks_metadata)} chunks")
+
+        # Download PDF from S3 using streaming
+        try:
+            pdf_obj = s3_client.get_object(Bucket=bucket, Key=key)
+            pdf_bytes = pdf_obj['Body'].read()
+
+            # Validate file is actually a PDF by checking magic bytes
+            validate_pdf_magic_bytes(pdf_bytes, document_id)
+
+        except ClientError as e:
+            raise classify_s3_error(e, document_id, bucket, key)
+
+        # Attempt to read PDF
+        try:
+            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+
+            # Validate PDF is not encrypted
+            if pdf_reader.is_encrypted:
+                raise EncryptedPDFError(
+                    message=f"PDF {document_id} is encrypted. Encrypted PDFs are not supported.",
+                    document_id=document_id
+                )
+
+            # Validate page count matches analysis
+            actual_pages = len(pdf_reader.pages)
+            expected_pages = token_analysis['total_pages']
+            if actual_pages != expected_pages:
+                logger.warning(
+                    f"Page count mismatch for {document_id}: "
+                    f"expected {expected_pages}, got {actual_pages}"
+                )
+
+        except PyPDF2.errors.PdfReadError as e:
+            raise CorruptedPDFError(
+                message=f"Invalid or corrupted PDF format for {document_id}: {str(e)}",
+                document_id=document_id,
+                details={'original_error': str(e)}
+            )
+        except EncryptedPDFError:
+            raise
+        except Exception as e:
+            if "encrypted" in str(e).lower():
+                raise EncryptedPDFError(
+                    message=f"PDF {document_id} is encrypted: {str(e)}",
+                    document_id=document_id
+                )
+            raise classify_pdf_error(e, document_id)
+
+        chunk_results = []
+        total_chunks = len(chunks_metadata)
+        corrupted_pages = []
+
+        for chunk_meta in chunks_metadata:
+            chunk_index = chunk_meta['chunk_index']
+            start_page = chunk_meta['start_page']
+            end_page = chunk_meta['end_page']
+            page_count = chunk_meta['page_count']
+
+            # Generate chunk ID
+            chunk_id = f"{document_id}_chunk_{chunk_index}"
+
+            logger.info(
+                f"Creating chunk {chunk_index + 1}/{total_chunks}: "
+                f"pages {start_page}-{end_page} ({page_count} pages)"
+            )
+
+            # Create new PDF for this chunk
+            pdf_writer = PyPDF2.PdfWriter()
+
+            # Add pages to chunk (end_page is inclusive)
+            pages_added = 0
+            for page_num in range(start_page, end_page + 1):
+                if page_num < len(pdf_reader.pages):
+                    try:
+                        pdf_writer.add_page(pdf_reader.pages[page_num])
+                        pages_added += 1
+                    except Exception as e:
+                        # Handle corrupted pages - skip and log warning
+                        logger.warning(
+                            f"Skipping corrupted page {page_num} in document {document_id}: {str(e)}"
+                        )
+                        corrupted_pages.append(page_num)
+                        continue
+
+            # Skip chunk if no pages were successfully added
+            if pages_added == 0:
+                logger.warning(
+                    f"Skipping chunk {chunk_index} for document {document_id}: "
+                    f"no valid pages in range {start_page}-{end_page}"
+                )
+                continue
+
+            # Write chunk to bytes
+            try:
+                chunk_bytes = io.BytesIO()
+                pdf_writer.write(chunk_bytes)
+                chunk_bytes.seek(0)
+            except Exception as e:
+                logger.error(
+                    f"Failed to write chunk {chunk_index} for document {document_id}: {str(e)}"
+                )
+                raise CorruptedPDFError(
+                    message=f"Failed to create chunk {chunk_index}: {str(e)}",
+                    document_id=document_id,
+                    details={'chunk_index': chunk_index}
+                )
+
+            # Upload chunk to S3 with retry logic
+            # Chunks are stored in a folder named after the document ID for organization
+            chunk_key = f"chunks/{document_id}/{chunk_id}.pdf"
+            _upload_chunk_with_retry(
+                bucket,
+                chunk_key,
+                chunk_bytes.getvalue(),
+                document_id,
+                chunk_index
+            )
+
+            # Calculate estimated tokens for this chunk
+            estimated_tokens = sum(
+                token_analysis['tokens_per_page'][i]
+                for i in range(start_page, min(end_page + 1, len(token_analysis['tokens_per_page'])))
+            )
+
+            # Create chunk metadata
+            chunk_result = {
+                'chunkId': chunk_id,
+                'chunkIndex': chunk_index,
+                'totalChunks': total_chunks,
+                'startPage': start_page,
+                'endPage': end_page,
+                'pageCount': pages_added, # Use actual pages added
+                'estimatedTokens': estimated_tokens,
+                'bucket': bucket,
+                'key': chunk_key
+            }
+
+            chunk_results.append(chunk_result)
+
+            logger.info(
+                f"Successfully created chunk {chunk_index + 1}/{total_chunks}: "
+                f"{chunk_id} with {estimated_tokens} tokens"
+            )
+
+        # Log summary of corrupted pages
+        if corrupted_pages:
+            logger.warning(
+                f"Document {document_id} had {len(corrupted_pages)} corrupted pages: "
+                f"{corrupted_pages[:10]}{'...' if len(corrupted_pages) > 10 else ''}"
+            )
+
+        # Ensure at least one chunk was created
+        if not chunk_results:
+            raise CorruptedPDFError(
+                message=f"Failed to create any valid chunks for document {document_id}. "
+                        f"All pages may be corrupted.",
+                document_id=document_id,
+                details={'corrupted_pages': corrupted_pages}
+            )
+
+        logger.info(
+            f"Successfully split document {document_id} into {len(chunk_results)} chunks"
+        )
+
+        return chunk_results
+
+    except ImportError as e:
+        logger.error(f"PyPDF2 not available: {str(e)}")
+        raise PDFChunkingError(
+            message="PyPDF2 is required for PDF processing",
+            error_type='DependencyError',
+            document_id=document_id,
+            recoverable=False
+        )
+
+    except (InvalidPDFFormatError, CorruptedPDFError, EncryptedPDFError,
+            S3AccessDeniedError, S3NotFoundError, S3ThrottlingError, PDFChunkingError):
+        # Re-raise our custom errors
+        raise
+
+    except Exception as e:
+        logger.error(
+            f"Failed to split PDF {document_id}: {str(e)}",
+            exc_info=True
+        )
+        raise classify_pdf_error(e, document_id)
+
+
+def _upload_chunk_with_retry(
+    bucket: str,
+    key: str,
+    data: bytes,
+    document_id: str,
+    chunk_index: int,
+    max_retries: int = 3
+) -> None:
+    """
+    Upload chunk to S3 with exponential backoff retry.
+
+    Args:
+        bucket: S3 bucket name
+        key: S3 object key
+        data: Chunk data bytes
+        document_id: Document identifier (for logging)
+        chunk_index: Chunk index (for logging)
+        max_retries: Maximum number of retry attempts
+
+    Raises:
+        S3AccessDeniedError: If access is denied
+        S3ThrottlingError: If throttled after all retries
+        PDFChunkingError: For other S3 errors
+    """
+    import time
+    import random
+
+    last_error = None
+
+    for attempt in range(max_retries):
+        try:
+            s3_client.put_object(
+                Bucket=bucket,
+                Key=key,
+                Body=data,
+                ContentType='application/pdf',
+                Metadata={
+                    'documentId': document_id,
+                    'chunkIndex': str(chunk_index)
+                }
+            )
+
+            if attempt > 0:
+                logger.info(
+                    f"Successfully uploaded chunk {chunk_index} for document {document_id} "
+                    f"on attempt {attempt + 1}"
+                )
+
+            return
+
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            last_error = e
+
+            # Don't retry on access denied or invalid bucket
+            if error_code in ['AccessDenied', 'NoSuchBucket', 'InvalidBucketName']:
+                logger.error(
+                    f"Non-retryable S3 error uploading chunk {chunk_index} "
+                    f"for document {document_id}: {error_code}"
+                )
+                raise classify_s3_error(e, document_id, bucket, key)
+
+            # Retry on throttling or server errors
+            if attempt < max_retries - 1:
+                # Exponential backoff with jitter: 1s, 2s, 4s + random jitter
+                base_wait = 2 ** attempt
+                jitter = random.uniform(0, 0.5)
+                wait_time = base_wait + jitter
+
+                logger.warning(
+                    f"S3 error uploading chunk {chunk_index} for document {document_id}: "
+                    f"{error_code}. Retrying in {wait_time:.2f}s (attempt {attempt + 1}/{max_retries})"
+                )
+                time.sleep(wait_time)
+            else:
+                logger.error(
+                    f"Failed to upload chunk {chunk_index} for document {document_id} "
+                    f"after {max_retries} attempts: {error_code}"
+                )
+                raise classify_s3_error(e, document_id, bucket, key)
+
+    # Should not reach here, but handle edge case
+    if last_error:
+        raise classify_s3_error(last_error, document_id, bucket, key)
+
+
+
+def _is_valid_pdf(data: bytes) -> bool:
+    """
+    Validate that the file is actually a PDF by checking magic bytes.
+
+    PDF files must start with the magic bytes "%PDF-" (hex: 25 50 44 46 2D).
+    This is a quick check before attempting to parse the file with PyPDF2.
+
+    Args:
+        data: File data bytes
+
+    Returns:
+        True if file starts with PDF magic bytes, False otherwise
+
+    Examples:
+        >>> _is_valid_pdf(b'%PDF-1.4\\n...')
+        True
+        >>> _is_valid_pdf(b'<html>...</html>')
+        False
+        >>> _is_valid_pdf(b'')
+        False
+    """
+    if not data or len(data) < 5:
+        return False
+
+    # Check for PDF magic bytes: %PDF-
+    # This is the standard PDF file signature
+    pdf_magic = b'%PDF-'
+    return data[:5] == pdf_magic
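
The handler above returns one of two shapes, keyed off requiresChunking, plus a typed error response on failure. A minimal sketch of how a downstream step might branch on those shapes follows; this consumer function is illustrative only and is not part of the package, and the error-response shape is assumed to mirror the _create_error_response helper shown above.

# Illustrative consumer of the handler's response; not part of the package.
from typing import Any, Dict, List

def plan_downstream_work(response: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the list of work items to fan out to the next processing step."""
    if response.get("error"):
        # Assumed error shape: documentId plus an error object with type/message,
        # mirroring _create_error_response in the handler above.
        raise RuntimeError(f"Chunking failed: {response['error']}")

    if not response["requiresChunking"]:
        # Single-pass processing: hand the original document straight through.
        return [{"documentId": response["documentId"]}]

    # Chunked processing: each entry carries the S3 location of one chunk PDF,
    # uploaded by the handler under chunks/{documentId}/{documentId}_chunk_{index}.pdf
    return [
        {
            "documentId": response["documentId"],
            "chunkId": chunk["chunkId"],
            "bucket": chunk["bucket"],
            "key": chunk["key"],
        }
        for chunk in response["chunks"]
    ]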