@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
package/lib/document-processing/resources/cleanup/test_handler.py

@@ -0,0 +1,436 @@
+ """
+ Unit tests for cleanup Lambda handler
+ 
+ Tests cover:
+ - Batch delete with multiple chunks
+ - Error handling for S3 delete failures
+ - Non-blocking error handling
+ 
+ Requirements: 8.4
+ """
+ 
+ import json
+ import unittest
+ from unittest.mock import MagicMock, patch, call
+ 
+ from handler import handler, delete_chunks_batch
+ 
+ 
+ class TestCleanupHandler(unittest.TestCase):
+     """Test cases for cleanup Lambda handler"""
+ 
+     def setUp(self):
+         """Set up test fixtures"""
+         self.document_id = "test-doc-123"
+         self.bucket = "test-bucket"
+ 
+         # Sample chunks
+         self.chunks = [
+             {
+                 'chunkId': f'{self.document_id}_chunk_0',
+                 'bucket': self.bucket,
+                 'key': f'chunks/{self.document_id}_chunk_0.pdf'
+             },
+             {
+                 'chunkId': f'{self.document_id}_chunk_1',
+                 'bucket': self.bucket,
+                 'key': f'chunks/{self.document_id}_chunk_1.pdf'
+             },
+             {
+                 'chunkId': f'{self.document_id}_chunk_2',
+                 'bucket': self.bucket,
+                 'key': f'chunks/{self.document_id}_chunk_2.pdf'
+             }
+         ]
+ 
+     @patch('handler.s3_client')
+     def test_successful_cleanup(self, mock_s3):
+         """Test successful cleanup of all chunks"""
+         # Mock successful S3 delete
+         mock_s3.delete_objects.return_value = {
+             'Deleted': [
+                 {'Key': chunk['key']} for chunk in self.chunks
+             ],
+             'Errors': []
+         }
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': self.chunks
+         }
+ 
+         # Call handler
+         result = handler(event, None)
+ 
+         # Verify response
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 3)
+         self.assertEqual(result['errors'], [])
+ 
+         # Verify S3 delete was called
+         mock_s3.delete_objects.assert_called_once()
+         call_args = mock_s3.delete_objects.call_args
+         self.assertEqual(call_args[1]['Bucket'], self.bucket)
+         self.assertEqual(len(call_args[1]['Delete']['Objects']), 3)
+ 
+     @patch('handler.s3_client')
+     def test_partial_delete_failure(self, mock_s3):
+         """Test cleanup with some S3 delete failures"""
+         # Mock partial S3 delete failure
+         mock_s3.delete_objects.return_value = {
+             'Deleted': [
+                 {'Key': self.chunks[0]['key']},
+                 {'Key': self.chunks[1]['key']}
+             ],
+             'Errors': [
+                 {
+                     'Key': self.chunks[2]['key'],
+                     'Code': 'AccessDenied',
+                     'Message': 'Access Denied'
+                 }
+             ]
+         }
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': self.chunks
+         }
+ 
+         # Call handler
+         result = handler(event, None)
+ 
+         # Verify response
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 2)
+         self.assertEqual(len(result['errors']), 1)
+         self.assertIn('AccessDenied', result['errors'][0])
+         self.assertIn(self.chunks[2]['key'], result['errors'][0])
+ 
+     @patch('handler.s3_client')
+     def test_complete_delete_failure(self, mock_s3):
+         """Test cleanup when all S3 deletes fail"""
+         # Mock complete S3 delete failure
+         from botocore.exceptions import ClientError
+         mock_s3.delete_objects.side_effect = ClientError(
+             {
+                 'Error': {
+                     'Code': 'NoSuchBucket',
+                     'Message': 'The specified bucket does not exist'
+                 }
+             },
+             'DeleteObjects'
+         )
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': self.chunks
+         }
+ 
+         # Call handler
+         result = handler(event, None)
+ 
+         # Verify response - should not fail workflow
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 0)
+         self.assertEqual(len(result['errors']), 1)
+         self.assertIn('NoSuchBucket', result['errors'][0])
+ 
+     @patch('handler.s3_client')
+     def test_batch_delete_with_multiple_chunks(self, mock_s3):
+         """Test batch delete with multiple chunks"""
+         # Create 5 chunks
+         chunks = [
+             {
+                 'chunkId': f'{self.document_id}_chunk_{i}',
+                 'bucket': self.bucket,
+                 'key': f'chunks/{self.document_id}_chunk_{i}.pdf'
+             }
+             for i in range(5)
+         ]
+ 
+         # Mock successful S3 delete
+         mock_s3.delete_objects.return_value = {
+             'Deleted': [{'Key': chunk['key']} for chunk in chunks],
+             'Errors': []
+         }
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': chunks
+         }
+ 
+         # Call handler
+         result = handler(event, None)
+ 
+         # Verify response
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 5)
+         self.assertEqual(result['errors'], [])
+ 
+     @patch('handler.s3_client')
+     def test_batch_delete_respects_1000_limit(self, mock_s3):
+         """Test that batch delete respects S3's 1000 object limit"""
+         # Create 1500 chunks (should require 2 batches)
+         chunks = [
+             {
+                 'chunkId': f'{self.document_id}_chunk_{i}',
+                 'bucket': self.bucket,
+                 'key': f'chunks/{self.document_id}_chunk_{i}.pdf'
+             }
+             for i in range(1500)
+         ]
+ 
+         # Mock successful S3 delete
+         def mock_delete_objects(**kwargs):
+             objects = kwargs['Delete']['Objects']
+             return {
+                 'Deleted': [{'Key': obj['Key']} for obj in objects],
+                 'Errors': []
+             }
+ 
+         mock_s3.delete_objects.side_effect = mock_delete_objects
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': chunks
+         }
+ 
+         # Call handler
+         result = handler(event, None)
+ 
+         # Verify response
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 1500)
+         self.assertEqual(result['errors'], [])
+ 
+         # Verify S3 delete was called twice (2 batches)
+         self.assertEqual(mock_s3.delete_objects.call_count, 2)
+ 
+         # Verify first batch has 1000 objects
+         first_call = mock_s3.delete_objects.call_args_list[0]
+         self.assertEqual(len(first_call[1]['Delete']['Objects']), 1000)
+ 
+         # Verify second batch has 500 objects
+         second_call = mock_s3.delete_objects.call_args_list[1]
+         self.assertEqual(len(second_call[1]['Delete']['Objects']), 500)
+ 
+     def test_missing_document_id(self):
+         """Test handler with missing documentId"""
+         event = {
+             'chunks': self.chunks
+         }
+ 
+         result = handler(event, None)
+ 
+         # Verify error response
+         self.assertIsNone(result['documentId'])
+         self.assertEqual(result['deletedChunks'], 0)
+         self.assertEqual(len(result['errors']), 1)
+         self.assertIn('documentId', result['errors'][0])
+ 
+     def test_empty_chunks_array(self):
+         """Test handler with empty chunks array"""
+         event = {
+             'documentId': self.document_id,
+             'chunks': []
+         }
+ 
+         result = handler(event, None)
+ 
+         # Verify response
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 0)
+         self.assertEqual(result['errors'], [])
+ 
+     def test_chunks_missing_bucket_or_key(self):
+         """Test handler with chunks missing bucket or key"""
+         chunks = [
+             {
+                 'chunkId': f'{self.document_id}_chunk_0',
+                 'bucket': self.bucket,
+                 # Missing 'key'
+             },
+             {
+                 'chunkId': f'{self.document_id}_chunk_1',
+                 # Missing 'bucket'
+                 'key': f'chunks/{self.document_id}_chunk_1.pdf'
+             },
+             {
+                 'chunkId': f'{self.document_id}_chunk_2',
+                 'bucket': self.bucket,
+                 'key': f'chunks/{self.document_id}_chunk_2.pdf'
+             }
+         ]
+ 
+         event = {
+             'documentId': self.document_id,
+             'chunks': chunks
+         }
+ 
+         # Mock S3 delete for the valid chunk
+         with patch('handler.s3_client') as mock_s3:
+             mock_s3.delete_objects.return_value = {
+                 'Deleted': [{'Key': chunks[2]['key']}],
+                 'Errors': []
+             }
+ 
+             result = handler(event, None)
+ 
+         # Verify only valid chunk was deleted
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 1)
+ 
+     @patch('handler.s3_client')
+     def test_multiple_buckets(self, mock_s3):
+         """Test cleanup with chunks in multiple buckets"""
+         chunks = [
+             {
+                 'chunkId': f'{self.document_id}_chunk_0',
+                 'bucket': 'bucket-1',
+                 'key': f'chunks/{self.document_id}_chunk_0.pdf'
+             },
+             {
+                 'chunkId': f'{self.document_id}_chunk_1',
+                 'bucket': 'bucket-2',
+                 'key': f'chunks/{self.document_id}_chunk_1.pdf'
+             },
+             {
+                 'chunkId': f'{self.document_id}_chunk_2',
+                 'bucket': 'bucket-1',
+                 'key': f'chunks/{self.document_id}_chunk_2.pdf'
+             }
+         ]
+ 
+         # Mock successful S3 delete
+         def mock_delete_objects(**kwargs):
+             objects = kwargs['Delete']['Objects']
+             return {
+                 'Deleted': [{'Key': obj['Key']} for obj in objects],
+                 'Errors': []
+             }
+ 
+         mock_s3.delete_objects.side_effect = mock_delete_objects
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': chunks
+         }
+ 
+         # Call handler
+         result = handler(event, None)
+ 
+         # Verify response
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 3)
+         self.assertEqual(result['errors'], [])
+ 
+         # Verify S3 delete was called twice (once per bucket)
+         self.assertEqual(mock_s3.delete_objects.call_count, 2)
+ 
+     @patch('handler.s3_client')
+     def test_non_blocking_error_handling(self, mock_s3):
+         """Test that errors don't block workflow completion"""
+         # Mock S3 delete failure
+         from botocore.exceptions import ClientError
+         mock_s3.delete_objects.side_effect = ClientError(
+             {
+                 'Error': {
+                     'Code': 'InternalError',
+                     'Message': 'Internal Server Error'
+                 }
+             },
+             'DeleteObjects'
+         )
+ 
+         # Create event
+         event = {
+             'documentId': self.document_id,
+             'chunks': self.chunks
+         }
+ 
+         # Call handler - should not raise exception
+         result = handler(event, None)
+ 
+         # Verify response contains error but doesn't fail
+         self.assertEqual(result['documentId'], self.document_id)
+         self.assertEqual(result['deletedChunks'], 0)
+         self.assertGreater(len(result['errors']), 0)
+         self.assertIn('InternalError', result['errors'][0])
+ 
+ 
+ class TestDeleteChunksBatch(unittest.TestCase):
+     """Test cases for delete_chunks_batch function"""
+ 
+     @patch('handler.s3_client')
+     def test_delete_chunks_batch_success(self, mock_s3):
+         """Test successful batch delete"""
+         chunk_keys = [
+             {
+                 'bucket': 'test-bucket',
+                 'key': 'chunks/chunk_0.pdf',
+                 'chunkId': 'chunk_0'
+             },
+             {
+                 'bucket': 'test-bucket',
+                 'key': 'chunks/chunk_1.pdf',
+                 'chunkId': 'chunk_1'
+             }
+         ]
+ 
+         mock_s3.delete_objects.return_value = {
+             'Deleted': [
+                 {'Key': 'chunks/chunk_0.pdf'},
+                 {'Key': 'chunks/chunk_1.pdf'}
+             ],
+             'Errors': []
+         }
+ 
+         deleted_count, errors = delete_chunks_batch(chunk_keys, 'test-doc')
+ 
+         self.assertEqual(deleted_count, 2)
+         self.assertEqual(errors, [])
+ 
+     @patch('handler.s3_client')
+     def test_delete_chunks_batch_with_errors(self, mock_s3):
+         """Test batch delete with some errors"""
+         chunk_keys = [
+             {
+                 'bucket': 'test-bucket',
+                 'key': 'chunks/chunk_0.pdf',
+                 'chunkId': 'chunk_0'
+             },
+             {
+                 'bucket': 'test-bucket',
+                 'key': 'chunks/chunk_1.pdf',
+                 'chunkId': 'chunk_1'
+             }
+         ]
+ 
+         mock_s3.delete_objects.return_value = {
+             'Deleted': [
+                 {'Key': 'chunks/chunk_0.pdf'}
+             ],
+             'Errors': [
+                 {
+                     'Key': 'chunks/chunk_1.pdf',
+                     'Code': 'AccessDenied',
+                     'Message': 'Access Denied'
+                 }
+             ]
+         }
+ 
+         deleted_count, errors = delete_chunks_batch(chunk_keys, 'test-doc')
+ 
+         self.assertEqual(deleted_count, 1)
+         self.assertEqual(len(errors), 1)
+         self.assertIn('AccessDenied', errors[0])
+ 
+ 
+ if __name__ == '__main__':
+     unittest.main()
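The cleanup handler itself (file 20 above) is not reproduced in this excerpt, but the tests pin down its contract: group chunks by bucket, delete in batches of at most 1000 keys (the S3 DeleteObjects limit), and report failures in the result rather than raising, so the workflow can finish. What follows is a minimal sketch consistent with those tests, not the package's actual code; every name beyond handler, delete_chunks_batch, and s3_client is inferred.

# Hypothetical reconstruction -- behavior inferred from test_handler.py above.
import boto3

s3_client = boto3.client('s3')

S3_BATCH_LIMIT = 1000  # DeleteObjects accepts at most 1000 keys per call


def delete_chunks_batch(chunk_keys, document_id):
    """Delete one bucket's chunk objects in batches; return (deleted_count, errors)."""
    deleted_count, errors = 0, []
    bucket = chunk_keys[0]['bucket']
    for i in range(0, len(chunk_keys), S3_BATCH_LIMIT):
        batch = chunk_keys[i:i + S3_BATCH_LIMIT]
        try:
            response = s3_client.delete_objects(
                Bucket=bucket,
                Delete={'Objects': [{'Key': c['key']} for c in batch]}
            )
            deleted_count += len(response.get('Deleted', []))
            for err in response.get('Errors', []):
                errors.append(f"{document_id}: {err['Code']} deleting {err['Key']}")
        except Exception as exc:  # non-blocking: record the error and continue
            errors.append(f"{document_id}: {exc}")
    return deleted_count, errors


def handler(event, context):
    document_id = event.get('documentId')
    chunks = event.get('chunks', [])
    if not document_id:
        return {'documentId': None, 'deletedChunks': 0,
                'errors': ['Missing required field: documentId']}

    # Skip chunks that lack a bucket or key, then group the rest per bucket
    by_bucket = {}
    for chunk in chunks:
        if chunk.get('bucket') and chunk.get('key'):
            by_bucket.setdefault(chunk['bucket'], []).append(chunk)

    deleted, errors = 0, []
    for bucket_chunks in by_bucket.values():
        count, errs = delete_chunks_batch(bucket_chunks, document_id)
        deleted += count
        errors.extend(errs)

    return {'documentId': document_id, 'deletedChunks': deleted, 'errors': errors}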
package/lib/document-processing/resources/default-bedrock-invoke/index.py

@@ -10,6 +10,67 @@ bedrock = boto3.client('bedrock-runtime')
  metrics = Metrics()
  tracer = Tracer()
 
+ 
+ def parse_chunk_metadata(event):
+     """
+     Parse optional chunk metadata from the event payload.
+ 
+     Returns a dictionary with chunk information if present, None otherwise.
+     Supports both direct chunk metadata and nested chunk object format.
+     """
+     # Check for direct chunkMetadata field
+     if 'chunkMetadata' in event:
+         return event['chunkMetadata']
+ 
+     # Check for chunk object (from Map State iteration)
+     if 'chunk' in event:
+         chunk = event['chunk']
+         return {
+             'chunkIndex': chunk.get('chunkIndex', event.get('chunkIndex', 0)),
+             'totalChunks': event.get('totalChunks', 1),
+             'startPage': chunk.get('startPage', 0),
+             'endPage': chunk.get('endPage', 0),
+             'pageCount': chunk.get('pageCount', 0),
+             'estimatedTokens': chunk.get('estimatedTokens', 0),
+             'overlapPages': chunk.get('overlapPages', 0),
+         }
+ 
+     return None
+ 
+ 
+ def build_chunk_context_prompt(chunk_metadata):
+     """
+     Build a context prompt for chunk-aware processing.
+ 
+     Args:
+         chunk_metadata: Dictionary containing chunk information
+ 
+     Returns:
+         String with chunk context to prepend to the main prompt
+     """
+     if not chunk_metadata:
+         return ""
+ 
+     chunk_index = chunk_metadata.get('chunkIndex', 0)
+     total_chunks = chunk_metadata.get('totalChunks', 1)
+     start_page = chunk_metadata.get('startPage', 0)
+     end_page = chunk_metadata.get('endPage', 0)
+     overlap_pages = chunk_metadata.get('overlapPages', 0)
+ 
+     # Build context string
+     context_parts = [
+         f"You are analyzing chunk {chunk_index + 1} of {total_chunks} from pages {start_page + 1} to {end_page + 1}."
+     ]
+ 
+     # Add overlap information if applicable
+     if overlap_pages > 0 and chunk_index > 0:
+         context_parts.append(
+             f"Note: This chunk includes {overlap_pages} overlapping pages from the previous chunk for context."
+         )
+ 
+     return "\n".join(context_parts) + "\n\n"
+ 
+ 
  @metrics.log_metrics
  @tracer.capture_lambda_handler
  def handler(event, context):
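Taken together, the two helpers turn a Step Functions Map State item into a short preamble for the model prompt. A hypothetical example follows; the event shape is inferred from the chunk.get(...) accessors above, not from package documentation.

# Hypothetical Map State item; field names follow the accessors in the diff above.
event = {
    'documentId': 'doc-42',
    'chunk': {
        'chunkIndex': 1,
        'startPage': 10,
        'endPage': 19,
        'pageCount': 10,
        'estimatedTokens': 7500,
        'overlapPages': 1,
    },
    'totalChunks': 5,
}

metadata = parse_chunk_metadata(event)
print(build_chunk_context_prompt(metadata))
# You are analyzing chunk 2 of 5 from pages 11 to 20.
# Note: This chunk includes 1 overlapping pages from the previous chunk for context.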
@@ -18,20 +79,40 @@ def handler(event, context):
      tracer.put_annotation(key="documentId", value=event["documentId"])
      metrics.add_dimension(name="invoke_type", value=invoke_type)
      content_type = event["contentType"]
+ 
+     # Parse optional chunk metadata
+     chunk_metadata = parse_chunk_metadata(event)
+     if chunk_metadata:
+         tracer.put_annotation(key="chunkIndex", value=str(chunk_metadata.get('chunkIndex', 0)))
+         tracer.put_annotation(key="totalChunks", value=str(chunk_metadata.get('totalChunks', 1)))
+         metrics.add_dimension(name="is_chunked", value="true")
+     else:
+         metrics.add_dimension(name="is_chunked", value="false")
+ 
      # Format prompt if classification result exists
      prompt = os.environ['PROMPT']
      if 'classificationResult' in event:
          classification = event['classificationResult']['documentClassification']
          prompt = prompt.replace("[ACTUAL_CLASSIFICATION]", classification)
 
+     # Add chunk context to prompt if processing a chunk
+     chunk_context = build_chunk_context_prompt(chunk_metadata)
+     if chunk_context:
+         prompt = chunk_context + prompt
+ 
      # Build content based on file type
      content = [{'type': 'text', 'text': prompt}]
      if content_type == 'file':
          content_location = event['content']['location']
 
          if content_location == 's3':
-             bucket = event['content']['bucket']
-             key = event['content']['key']
+             # Use chunk-specific S3 location if available, otherwise use original content
+             if chunk_metadata and 'bucket' in chunk_metadata and 'key' in chunk_metadata:
+                 bucket = chunk_metadata['bucket']
+                 key = chunk_metadata['key']
+             else:
+                 bucket = event['content']['bucket']
+                 key = event['content']['key']
 
              # Check file type
              ext = key.lower().split('.')[-1]
@@ -61,11 +142,12 @@
          })
 
      # Invoke Bedrock
+     max_tokens = int(os.getenv('INVOKE_MAX_TOKENS', '1000'))
      response = bedrock.invoke_model(
          modelId=os.environ['MODEL_ID'],
          body=json.dumps({
              'anthropic_version': 'bedrock-2023-05-31',
-             'max_tokens': os.getenv('INVOKE_MAX_TOKENS', 1000),
+             'max_tokens': max_tokens,
              'messages': [{'role': 'user', 'content': content}]
          })
      )
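The max_tokens change in the last hunk is more than cosmetic: os.getenv returns a string whenever the variable is set, so the old code would serialize max_tokens as a JSON string instead of a number, which the Anthropic messages schema expects to be an integer. A small standalone illustration (not package code):

# Why the int() cast matters: getenv yields str when INVOKE_MAX_TOKENS is set.
import json
import os

os.environ['INVOKE_MAX_TOKENS'] = '2000'

old_value = os.getenv('INVOKE_MAX_TOKENS', 1000)         # '2000' -- a str
new_value = int(os.getenv('INVOKE_MAX_TOKENS', '1000'))  # 2000 -- always an int

print(json.dumps({'max_tokens': old_value}))  # {"max_tokens": "2000"}
print(json.dumps({'max_tokens': new_value}))  # {"max_tokens": 2000}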