@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
package/lib/document-processing/resources/aggregation/test_handler.py
@@ -0,0 +1,362 @@
+"""
+Unit tests for aggregation Lambda handler
+"""
+
+import unittest
+from unittest.mock import MagicMock
+from handler import (
+    handler,
+    aggregate_classifications,
+    deduplicate_entities,
+    calculate_chunks_summary
+)
+
+
+class TestAggregationHandler(unittest.TestCase):
+    """Test cases for the main handler function"""
+
+    def test_handler_success(self):
+        """Test successful aggregation"""
+        event = {
+            'documentId': 'doc-123',
+            'chunkResults': [
+                {
+                    'chunkIndex': 0,
+                    'classificationResult': {
+                        'documentClassification': 'INVOICE'
+                    },
+                    'processingResult': {
+                        'entities': [
+                            {'type': 'AMOUNT', 'value': '100.00'},
+                            {'type': 'DATE', 'value': '2024-01-01', 'page': 1}
+                        ]
+                    }
+                },
+                {
+                    'chunkIndex': 1,
+                    'classificationResult': {
+                        'documentClassification': 'INVOICE'
+                    },
+                    'processingResult': {
+                        'entities': [
+                            {'type': 'AMOUNT', 'value': '100.00'},  # Duplicate without page
+                            {'type': 'VENDOR', 'value': 'Acme Corp'}
+                        ]
+                    }
+                }
+            ]
+        }
+
+        context = MagicMock()
+        result = handler(event, context)
+
+        self.assertEqual(result['documentId'], 'doc-123')
+        self.assertEqual(result['classification'], 'INVOICE')
+        self.assertEqual(result['classificationConfidence'], 1.0)
+        self.assertEqual(len(result['entities']), 3)  # Deduplicated AMOUNT
+        self.assertFalse(result['partialResult'])
+        self.assertEqual(result['chunksSummary']['totalChunks'], 2)
+        self.assertEqual(result['chunksSummary']['successfulChunks'], 2)
+        self.assertEqual(result['chunksSummary']['failedChunks'], 0)
+
+    def test_handler_missing_document_id(self):
+        """Test handler with missing documentId"""
+        event = {
+            'chunkResults': []
+        }
+
+        context = MagicMock()
+
+        with self.assertRaises(ValueError) as cm:
+            handler(event, context)
+
+        self.assertIn('documentId', str(cm.exception))
+
+    def test_handler_missing_chunk_results(self):
+        """Test handler with missing chunkResults"""
+        event = {
+            'documentId': 'doc-123'
+        }
+
+        context = MagicMock()
+
+        with self.assertRaises(ValueError) as cm:
+            handler(event, context)
+
+        self.assertIn('chunkResults', str(cm.exception))
+
+    def test_handler_partial_result(self):
+        """Test handler with partial results (< 50% success)"""
+        event = {
+            'documentId': 'doc-123',
+            'chunkResults': [
+                {
+                    'chunkIndex': 0,
+                    'classificationResult': {
+                        'documentClassification': 'INVOICE'
+                    },
+                    'processingResult': {
+                        'entities': []
+                    }
+                },
+                {
+                    'chunkIndex': 1,
+                    'error': 'Processing failed'
+                },
+                {
+                    'chunkIndex': 2,
+                    'error': 'Processing failed'
+                }
+            ]
+        }
+
+        context = MagicMock()
+        result = handler(event, context)
+
+        self.assertTrue(result['partialResult'])
+        self.assertEqual(result['chunksSummary']['successfulChunks'], 1)
+        self.assertEqual(result['chunksSummary']['failedChunks'], 2)
+
+
+class TestAggregateClassifications(unittest.TestCase):
+    """Test cases for classification aggregation"""
+
+    def test_majority_voting_clear_majority(self):
+        """Test majority voting with clear majority"""
+        chunk_results = [
+            {'classificationResult': {'documentClassification': 'INVOICE'}},
+            {'classificationResult': {'documentClassification': 'INVOICE'}},
+            {'classificationResult': {'documentClassification': 'RECEIPT'}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        self.assertEqual(classification, 'INVOICE')
+        self.assertAlmostEqual(confidence, 2/3, places=2)
+
+    def test_majority_voting_tie_alphabetical(self):
+        """Test majority voting with tie (alphabetical selection)"""
+        chunk_results = [
+            {'classificationResult': {'documentClassification': 'RECEIPT'}},
+            {'classificationResult': {'documentClassification': 'INVOICE'}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        # Should select 'INVOICE' (alphabetically first)
+        self.assertEqual(classification, 'INVOICE')
+        self.assertAlmostEqual(confidence, 0.5, places=2)
+
+    def test_no_classifications(self):
+        """Test with no classification results"""
+        chunk_results = [
+            {'error': 'Failed'},
+            {'processingResult': {'entities': []}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        self.assertIsNone(classification)
+        self.assertEqual(confidence, 0.0)
+
+    def test_skip_failed_chunks(self):
+        """Test that failed chunks are skipped"""
+        chunk_results = [
+            {'classificationResult': {'documentClassification': 'INVOICE'}},
+            {'error': 'Processing failed'},
+            {'classificationResult': {'documentClassification': 'INVOICE'}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        self.assertEqual(classification, 'INVOICE')
+        self.assertEqual(confidence, 1.0)  # 2/2 successful chunks
+
+
+class TestDeduplicateEntities(unittest.TestCase):
+    """Test cases for entity deduplication"""
+
+    def test_deduplicate_exact_duplicates(self):
+        """Test deduplication of exact duplicates without page numbers"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'},
+                        {'type': 'VENDOR', 'value': 'Acme Corp'}
+                    ]
+                }
+            },
+            {
+                'chunkIndex': 1,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'},  # Duplicate
+                        {'type': 'DATE', 'value': '2024-01-01'}
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Should have 3 unique entities (AMOUNT deduplicated)
+        self.assertEqual(len(entities), 3)
+
+        # Check that AMOUNT appears only once
+        amount_entities = [e for e in entities if e['type'] == 'AMOUNT']
+        self.assertEqual(len(amount_entities), 1)
+
+    def test_preserve_entities_with_page_numbers(self):
+        """Test that entities with page numbers are preserved"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'NAME', 'value': 'John Doe', 'page': 1},
+                        {'type': 'NAME', 'value': 'John Doe', 'page': 2}
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Both entities should be preserved
+        self.assertEqual(len(entities), 2)
+        self.assertEqual(entities[0]['page'], 1)
+        self.assertEqual(entities[1]['page'], 2)
+
+    def test_sort_by_chunk_and_page(self):
+        """Test that entities are sorted by chunk index and page number"""
+        chunk_results = [
+            {
+                'chunkIndex': 1,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'NAME', 'value': 'Jane Doe', 'page': 5}
+                    ]
+                }
+            },
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'NAME', 'value': 'John Doe', 'page': 2}
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Should be sorted by chunk index first
+        self.assertEqual(entities[0]['chunkIndex'], 0)
+        self.assertEqual(entities[1]['chunkIndex'], 1)
+
+    def test_skip_failed_chunks(self):
+        """Test that failed chunks are skipped"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'}
+                    ]
+                }
+            },
+            {
+                'chunkIndex': 1,
+                'error': 'Processing failed'
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        self.assertEqual(len(entities), 1)
+        self.assertEqual(entities[0]['type'], 'AMOUNT')
+
+    def test_skip_invalid_entities(self):
+        """Test that entities without type or value are skipped"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'},
+                        {'type': 'VENDOR'},  # Missing value
+                        {'value': 'Acme Corp'},  # Missing type
+                        {}  # Missing both
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Only the valid entity should be included
+        self.assertEqual(len(entities), 1)
+        self.assertEqual(entities[0]['type'], 'AMOUNT')
+
+
+class TestCalculateChunksSummary(unittest.TestCase):
+    """Test cases for chunks summary calculation"""
+
+    def test_all_successful(self):
+        """Test summary with all successful chunks"""
+        chunk_results = [
+            {'chunkIndex': 0, 'classificationResult': {}},
+            {'chunkIndex': 1, 'classificationResult': {}}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+
+        self.assertEqual(summary['totalChunks'], 2)
+        self.assertEqual(summary['successfulChunks'], 2)
+        self.assertEqual(summary['failedChunks'], 0)
+
+    def test_all_failed(self):
+        """Test summary with all failed chunks"""
+        chunk_results = [
+            {'chunkIndex': 0, 'error': 'Failed'},
+            {'chunkIndex': 1, 'error': 'Failed'}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+
+        self.assertEqual(summary['totalChunks'], 2)
+        self.assertEqual(summary['successfulChunks'], 0)
+        self.assertEqual(summary['failedChunks'], 2)
+
+    def test_mixed_results(self):
+        """Test summary with mixed success and failure"""
+        chunk_results = [
+            {'chunkIndex': 0, 'classificationResult': {}},
+            {'chunkIndex': 1, 'error': 'Failed'},
+            {'chunkIndex': 2, 'classificationResult': {}}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+
+        self.assertEqual(summary['totalChunks'], 3)
+        self.assertEqual(summary['successfulChunks'], 2)
+        self.assertEqual(summary['failedChunks'], 1)
+
+    def test_50_percent_threshold(self):
+        """Test partial results with 50% failure threshold"""
+        chunk_results = [
+            {'chunkIndex': 0, 'classificationResult': {}},
+            {'chunkIndex': 1, 'error': 'Failed'}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+        success_rate = summary['successfulChunks'] / summary['totalChunks']
+
+        # Exactly 50% success rate
+        self.assertEqual(success_rate, 0.5)
+
+
+if __name__ == '__main__':
+    unittest.main()
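Note: the aggregation handler.py these tests import is part of this release (+567 lines, file 17 in the list above) but its body is not reproduced in this excerpt. As orientation only, here is a minimal Python sketch of two of the imported helpers, inferred from the assertions above rather than taken from the published source; treat the exact logic and edge-case behavior as assumptions.

# Hypothetical reconstruction inferred from the tests above --
# NOT the published aggregation/handler.py implementation.
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple


def aggregate_classifications(
    chunk_results: List[Dict[str, Any]]
) -> Tuple[Optional[str], float]:
    """Majority-vote the per-chunk classifications, skipping failed chunks."""
    votes = [
        r['classificationResult']['documentClassification']
        for r in chunk_results
        if 'error' not in r and 'classificationResult' in r
    ]
    if not votes:
        return None, 0.0
    counts = Counter(votes)
    top = max(counts.values())
    # Ties break alphabetically, per test_majority_voting_tie_alphabetical.
    winner = sorted(label for label, n in counts.items() if n == top)[0]
    return winner, counts[winner] / len(votes)


def deduplicate_entities(chunk_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Merge entities across chunks; exact (type, value) duplicates without
    a page number collapse, while page-tagged entities are all preserved."""
    seen = set()
    merged = []
    for result in chunk_results:
        if 'error' in result:
            continue  # skip failed chunks
        for entity in result.get('processingResult', {}).get('entities', []):
            if 'type' not in entity or 'value' not in entity:
                continue  # skip invalid entities
            key = (entity['type'], entity['value'], entity.get('page'))
            if key in seen:
                continue
            seen.add(key)
            merged.append({**entity, 'chunkIndex': result.get('chunkIndex', 0)})
    # Sort by chunk index, then page, per test_sort_by_chunk_and_page.
    return sorted(merged, key=lambda e: (e['chunkIndex'], e.get('page', 0)))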
package/lib/document-processing/resources/cleanup/handler.py
@@ -0,0 +1,276 @@
+"""
+Cleanup Lambda Handler
+
+This Lambda function removes temporary chunk files from S3 after successful aggregation.
+It uses batch delete for efficiency and handles errors gracefully without failing the workflow.
+
+Requirements: 8.4, 7.5
+"""
+
+import json
+import logging
+import os
+import sys
+from typing import Any, Dict, List
+
+import boto3
+from botocore.exceptions import ClientError
+
+# Try to import structured logging from pdf-chunking module
+# Fall back to standard logging if not available
+try:
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'pdf-chunking'))
+    from structured_logging import (
+        get_logger,
+        log_chunking_operation,
+        with_correlation_id,
+        is_observability_enabled
+    )
+    structured_logger = get_logger(__name__)
+    USE_STRUCTURED_LOGGING = True
+except ImportError:
+    USE_STRUCTURED_LOGGING = False
+    structured_logger = None
+
+# Configure standard logging as fallback
+logger = logging.getLogger()
+logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
+
+# Initialize S3 client
+s3_client = boto3.client('s3')
+
+
+def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    Lambda handler for cleaning up temporary chunk files from S3.
+
+    Args:
+        event: Lambda event payload containing:
+            - documentId: Document identifier
+            - chunks: Array of chunk metadata with bucket and key information
+        context: Lambda context object
+
+    Returns:
+        CleanupResponse with deletedChunks count and errors array
+    """
+    # Set up structured logging context if available
+    if USE_STRUCTURED_LOGGING and structured_logger:
+        correlation_id = event.get('correlationId')
+        structured_logger.set_correlation_id(correlation_id)
+
+    try:
+        # Parse event
+        document_id = event.get('documentId')
+        chunks = event.get('chunks', [])
+
+        if not document_id:
+            error_msg = "Missing required field: documentId"
+            if USE_STRUCTURED_LOGGING and structured_logger:
+                structured_logger.error(error_msg, extra={'event': 'cleanup_error'})
+            else:
+                logger.error(error_msg)
+            return {
+                'documentId': None,
+                'deletedChunks': 0,
+                'errors': [error_msg]
+            }
+
+        # Set document context for structured logging
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.set_document_context(document_id=document_id)
+
+        if not chunks:
+            if USE_STRUCTURED_LOGGING and structured_logger:
+                structured_logger.info(
+                    f"No chunks to clean up for document {document_id}",
+                    extra={'event': 'cleanup_skipped', 'reason': 'no_chunks'}
+                )
+            else:
+                logger.info(f"No chunks to clean up for document {document_id}")
+            return {
+                'documentId': document_id,
+                'deletedChunks': 0,
+                'errors': []
+            }
+
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.info(
+                f"Starting cleanup for document {document_id}",
+                extra={
+                    'event': 'cleanup_started',
+                    'chunkCount': len(chunks)
+                }
+            )
+        else:
+            logger.info(f"Starting cleanup for document {document_id} with {len(chunks)} chunks")
+
+        # Extract S3 keys for all chunks
+        chunk_keys = []
+        for chunk in chunks:
+            bucket = chunk.get('bucket')
+            key = chunk.get('key')
+
+            if bucket and key:
+                chunk_keys.append({
+                    'bucket': bucket,
+                    'key': key,
+                    'chunkId': chunk.get('chunkId', 'unknown')
+                })
+            else:
+                if USE_STRUCTURED_LOGGING and structured_logger:
+                    structured_logger.warning(
+                        f"Chunk missing bucket or key",
+                        extra={'chunk': chunk, 'event': 'invalid_chunk'}
+                    )
+                else:
+                    logger.warning(f"Chunk missing bucket or key: {chunk}")
+
+        if not chunk_keys:
+            if USE_STRUCTURED_LOGGING and structured_logger:
+                structured_logger.warning(
+                    f"No valid chunk keys found for document {document_id}",
+                    extra={'event': 'cleanup_skipped', 'reason': 'no_valid_keys'}
+                )
+            else:
+                logger.warning(f"No valid chunk keys found for document {document_id}")
+            return {
+                'documentId': document_id,
+                'deletedChunks': 0,
+                'errors': ['No valid chunk keys found']
+            }
+
+        # Delete chunks using batch delete
+        deleted_count, errors = delete_chunks_batch(chunk_keys, document_id)
+
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.info(
+                f"Cleanup completed for document {document_id}",
+                extra={
+                    'event': 'cleanup_completed',
+                    'deletedChunks': deleted_count,
+                    'totalChunks': len(chunk_keys),
+                    'errorCount': len(errors)
+                }
+            )
+        else:
+            logger.info(
+                f"Cleanup completed for document {document_id}: "
+                f"deleted {deleted_count}/{len(chunk_keys)} chunks, "
+                f"{len(errors)} errors"
+            )
+
+        return {
+            'documentId': document_id,
+            'deletedChunks': deleted_count,
+            'errors': errors
+        }
+
+    except Exception as e:
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.error(
+                f"Unexpected error during cleanup: {str(e)}",
+                extra={'event': 'cleanup_error', 'errorType': type(e).__name__},
+                exc_info=True
+            )
+        else:
+            logger.error(f"Unexpected error during cleanup: {str(e)}", exc_info=True)
+        return {
+            'documentId': event.get('documentId'),
+            'deletedChunks': 0,
+            'errors': [f"Unexpected error: {str(e)}"]
+        }
+
+
+def delete_chunks_batch(
+    chunk_keys: List[Dict[str, str]],
+    document_id: str
+) -> tuple[int, List[str]]:
+    """
+    Delete chunks from S3 using batch delete operations.
+
+    S3 batch delete supports up to 1000 objects per request.
+    This function groups chunks by bucket and processes them in batches.
+
+    Args:
+        chunk_keys: List of dicts with 'bucket', 'key', and 'chunkId'
+        document_id: Document identifier for logging
+
+    Returns:
+        Tuple of (deleted_count, errors_list)
+    """
+    deleted_count = 0
+    errors = []
+
+    # Group chunks by bucket
+    chunks_by_bucket: Dict[str, List[Dict[str, str]]] = {}
+    for chunk in chunk_keys:
+        bucket = chunk['bucket']
+        if bucket not in chunks_by_bucket:
+            chunks_by_bucket[bucket] = []
+        chunks_by_bucket[bucket].append(chunk)
+
+    # Process each bucket
+    for bucket, chunks in chunks_by_bucket.items():
+        logger.info(f"Deleting {len(chunks)} chunks from bucket {bucket}")
+
+        # Process in batches of 1000 (S3 limit)
+        batch_size = 1000
+        for i in range(0, len(chunks), batch_size):
+            batch = chunks[i:i + batch_size]
+
+            # Prepare delete request
+            objects_to_delete = [{'Key': chunk['key']} for chunk in batch]
+
+            try:
+                response = s3_client.delete_objects(
+                    Bucket=bucket,
+                    Delete={'Objects': objects_to_delete}
+                )
+
+                # Count successful deletions
+                deleted = response.get('Deleted', [])
+                deleted_count += len(deleted)
+
+                # Log any errors from S3
+                s3_errors = response.get('Errors', [])
+                for error in s3_errors:
+                    error_key = error.get('Key', 'unknown')
+                    error_code = error.get('Code', 'unknown')
+                    error_message = error.get('Message', 'unknown')
+
+                    error_msg = (
+                        f"Failed to delete {error_key}: "
+                        f"{error_code} - {error_message}"
+                    )
+                    logger.warning(error_msg)
+                    errors.append(error_msg)
+
+                logger.info(
+                    f"Batch delete completed: {len(deleted)} deleted, "
+                    f"{len(s3_errors)} errors"
+                )
+
+            except ClientError as e:
+                error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+                error_message = e.response.get('Error', {}).get('Message', str(e))
+
+                error_msg = (
+                    f"S3 batch delete failed for bucket {bucket}: "
+                    f"{error_code} - {error_message}"
+                )
+                logger.error(error_msg)
+                errors.append(error_msg)
+
+                # Log which chunks failed
+                for chunk in batch:
+                    logger.error(
+                        f"Failed to delete chunk {chunk['chunkId']} "
+                        f"at s3://{bucket}/{chunk['key']}"
+                    )
+
+            except Exception as e:
+                error_msg = f"Unexpected error during batch delete: {str(e)}"
+                logger.error(error_msg, exc_info=True)
+                errors.append(error_msg)
+
+    return deleted_count, errors
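For a quick local smoke test, the cleanup handler can be invoked directly with an event of the shape it parses above. The bucket and keys below are placeholders, and the call issues a real S3 DeleteObjects request, so it needs AWS credentials and an existing bucket (or a stubbed client, e.g. via moto); this invocation pattern is an assumption, not documented usage.

# Hypothetical local invocation; bucket and keys are placeholders.
from handler import handler

event = {
    'documentId': 'doc-123',
    'correlationId': 'corr-abc',  # optional; only read when structured logging is importable
    'chunks': [
        {'chunkId': 'chunk-0', 'bucket': 'my-processing-bucket', 'key': 'chunks/doc-123/chunk-0.pdf'},
        {'chunkId': 'chunk-1', 'bucket': 'my-processing-bucket', 'key': 'chunks/doc-123/chunk-1.pdf'},
    ],
}

result = handler(event, None)  # the handler never dereferences the Lambda context
print(result)  # expected shape: {'documentId': 'doc-123', 'deletedChunks': 2, 'errors': []}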
package/lib/document-processing/resources/cleanup/requirements.txt
@@ -0,0 +1,5 @@
+# Cleanup Lambda Dependencies
+# Requirements: 8.4
+
+# AWS SDK for Python (S3 operations)
+boto3>=1.26.0