@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,622 @@
1
+ """
2
+ Unit tests for the default-bedrock-invoke Lambda handler.
3
+
4
+ Tests chunk-aware classification and processing functionality including:
5
+ - Parsing chunk metadata from events
6
+ - Building chunk context prompts
7
+ - Backward compatibility with non-chunked documents
8
+ - Processing with chunk metadata
9
+ - Processing without chunk metadata (backward compatibility)
10
+ """
11
+
12
+ import pytest
13
+ from unittest.mock import patch, MagicMock
14
+ import json
15
+ import os
16
+
17
# Required environment variables must be in place before the module under
# test is imported, so they are installed here in a single step.
os.environ.update({
    'INVOKE_TYPE': 'classification',
    'PROMPT': 'Test prompt',
    'MODEL_ID': 'test-model',
})
21
+
22
+ # Import after setting environment variables
23
+ from index import parse_chunk_metadata, build_chunk_context_prompt, handler
24
+
25
+
26
class TestParseChunkMetadata:
    """Exercise parse_chunk_metadata against the event shapes used in this suite."""

    def test_returns_none_when_no_chunk_metadata(self):
        """An event that carries no chunk information yields None."""
        plain_event = dict(
            documentId='doc-123',
            contentType='file',
            content=dict(bucket='test', key='test.pdf'),
        )

        assert parse_chunk_metadata(plain_event) is None

    def test_parses_direct_chunk_metadata(self):
        """Each field of a top-level chunkMetadata entry comes back unchanged."""
        supplied = {
            'chunkIndex': 2,
            'totalChunks': 5,
            'startPage': 100,
            'endPage': 149,
            'pageCount': 50,
            'estimatedTokens': 75000,
            'overlapPages': 5,
        }
        event = {'documentId': 'doc-123', 'chunkMetadata': dict(supplied)}

        parsed = parse_chunk_metadata(event)

        assert parsed is not None
        for field, value in supplied.items():
            assert parsed[field] == value

    def test_parses_chunk_object_from_map_state(self):
        """A Step Functions Map State iteration event (nested chunk object) is parsed."""
        map_item = {
            'chunkId': 'doc-123_chunk_1',
            'chunkIndex': 1,
            'startPage': 50,
            'endPage': 99,
            'pageCount': 50,
            'estimatedTokens': 80000,
            'bucket': 'test-bucket',
            'key': 'chunks/doc-123_chunk_1.pdf',
        }
        event = {
            'documentId': 'doc-123',
            'chunk': map_item,
            'chunkIndex': 1,
            'totalChunks': 3,
        }

        parsed = parse_chunk_metadata(event)

        assert parsed is not None
        checked_fields = ('chunkIndex', 'totalChunks', 'startPage', 'endPage')
        assert tuple(parsed[name] for name in checked_fields) == (1, 3, 50, 99)

    def test_handles_missing_optional_fields(self):
        """Optional fields absent from the event are reported as 0."""
        sparse_chunk = {
            'chunkId': 'doc-123_chunk_0',
            'startPage': 0,
            'endPage': 49,
        }
        event = {'documentId': 'doc-123', 'chunk': sparse_chunk, 'totalChunks': 2}

        parsed = parse_chunk_metadata(event)

        assert parsed is not None
        for defaulted in ('chunkIndex', 'overlapPages', 'estimatedTokens'):
            assert parsed[defaulted] == 0
106
+
107
+
108
class TestBuildChunkContextPrompt:
    """Check the wording produced by build_chunk_context_prompt.

    The metadata used here is zero-based (chunkIndex, startPage, endPage);
    the expected prompt text shows the same positions one-based.
    """

    def test_returns_empty_string_when_no_metadata(self):
        """Passing None produces an empty string."""
        assert build_chunk_context_prompt(None) == ""

    def test_builds_basic_chunk_context(self):
        """Without overlap, only the chunk position and page range appear."""
        prompt = build_chunk_context_prompt(dict(
            chunkIndex=0,
            totalChunks=3,
            startPage=0,
            endPage=49,
            overlapPages=0,
        ))

        for fragment in ("You are analyzing chunk 1 of 3", "from pages 1 to 50"):
            assert fragment in prompt
        assert "overlapping" not in prompt

    def test_builds_chunk_context_with_overlap(self):
        """A later chunk with overlapPages > 0 mentions the overlap."""
        prompt = build_chunk_context_prompt(dict(
            chunkIndex=1,
            totalChunks=3,
            startPage=45,
            endPage=99,
            overlapPages=5,
        ))

        required_fragments = (
            "You are analyzing chunk 2 of 3",
            "from pages 46 to 100",
            "5 overlapping pages",
        )
        for fragment in required_fragments:
            assert fragment in prompt

    def test_no_overlap_message_for_first_chunk(self):
        """The first chunk never mentions overlap, even when overlapPages > 0."""
        first_chunk = dict(
            chunkIndex=0,      # first chunk
            totalChunks=3,
            startPage=0,
            endPage=49,
            overlapPages=5,    # overlap is configured, but nothing precedes chunk 0
        )

        prompt = build_chunk_context_prompt(first_chunk)

        assert "You are analyzing chunk 1 of 3" in prompt
        assert "overlapping" not in prompt

    def test_format_matches_design_spec(self):
        """The sentence follows the design specification's template."""
        prompt = build_chunk_context_prompt(dict(
            chunkIndex=2,
            totalChunks=5,
            startPage=100,
            endPage=149,
            overlapPages=0,
        ))

        # Template: "You are analyzing chunk {N} of {total} from pages {start} to {end}"
        assert "You are analyzing chunk 3 of 5 from pages 101 to 150" in prompt
174
+
175
+
176
class TestHandlerChunkAwareness:
    """Tests for handler function chunk awareness.

    Every test runs the handler in classification mode against patched
    ``index.bedrock``, ``index.s3``, ``index.tracer`` and ``index.metrics``.
    The mock wiring, the fake Bedrock reply and the environment override are
    shared through private helpers instead of being repeated in each test.
    """

    @staticmethod
    def _bedrock_reply(model_text):
        """Build a fake ``invoke_model`` response.

        Args:
            model_text: The text placed in the single ``content`` entry.

        Returns:
            A dict with a ``body`` object whose ``read()`` returns the
            JSON-encoded bytes ``{"content": [{"text": model_text}]}``.
        """
        raw = json.dumps({'content': [{'text': model_text}]}).encode()
        return {'body': MagicMock(read=lambda: raw)}

    @staticmethod
    def _wire_mocks(mock_metrics, mock_tracer, mock_s3):
        """Install inert stand-ins on the patched S3, tracer and metrics objects."""
        mock_s3.download_file = MagicMock()
        mock_tracer.put_annotation = MagicMock()
        # The decorator-style hooks are replaced by pass-through callables.
        mock_tracer.capture_lambda_handler = lambda f: f
        mock_metrics.log_metrics = lambda f: f
        mock_metrics.add_dimension = MagicMock()
        mock_metrics.add_metric = MagicMock()

    @staticmethod
    def _run_classification(event, prompt):
        """Call the handler on *event* with classification environment values.

        Args:
            event: The event dict passed to the handler.
            prompt: Value used for the ``PROMPT`` environment variable.

        Returns:
            Whatever ``handler(event, None)`` returns.
        """
        env = {'INVOKE_TYPE': 'classification', 'PROMPT': prompt, 'MODEL_ID': 'test-model'}
        with patch.dict(os.environ, env):
            return handler(event, None)

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_handler_without_chunk_metadata(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test handler processes non-chunked documents correctly (backward compatibility)."""
        mock_bedrock.invoke_model.return_value = self._bedrock_reply(
            '{"documentClassification": "INVOICE"}'
        )
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        event = {
            'documentId': 'doc-123',
            'contentType': 'data',
            'content': {'data': 'Test document content'},
        }

        result = self._run_classification(event, 'Classify this:')

        assert result == {'documentClassification': 'INVOICE'}

        # A document without chunk metadata is reported as not chunked.
        mock_metrics.add_dimension.assert_any_call(name="is_chunked", value="false")

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_handler_with_chunk_metadata(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test handler processes chunked documents with context."""
        mock_bedrock.invoke_model.return_value = self._bedrock_reply(
            '{"documentClassification": "CONTRACT"}'
        )
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        event = {
            'documentId': 'doc-123',
            'contentType': 'data',
            'content': {'data': 'Test document content'},
            'chunkMetadata': {
                'chunkIndex': 1,
                'totalChunks': 3,
                'startPage': 50,
                'endPage': 99,
                'overlapPages': 5,
            },
        }

        result = self._run_classification(event, 'Classify this:')

        assert result == {'documentClassification': 'CONTRACT'}

        # A document with chunk metadata is reported as chunked ...
        mock_metrics.add_dimension.assert_any_call(name="is_chunked", value="true")

        # ... and its position is recorded as string-valued trace annotations.
        mock_tracer.put_annotation.assert_any_call(key="chunkIndex", value="1")
        mock_tracer.put_annotation.assert_any_call(key="totalChunks", value="3")

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_prompt_includes_chunk_context(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test that the prompt sent to Bedrock includes chunk context."""
        seen = {}

        def capture_invoke(modelId, body):
            # Record the decoded request so the prompt can be inspected below.
            seen['body'] = json.loads(body)
            return self._bedrock_reply('{"documentClassification": "INVOICE"}')

        mock_bedrock.invoke_model = capture_invoke
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        event = {
            'documentId': 'doc-123',
            'contentType': 'data',
            'content': {'data': 'Test document content'},
            'chunkMetadata': {
                'chunkIndex': 2,
                'totalChunks': 5,
                'startPage': 100,
                'endPage': 149,
                'overlapPages': 0,
            },
        }

        self._run_classification(event, 'Classify this document:')

        # The request must contain both the chunk context and the configured prompt.
        captured_body = seen.get('body')
        assert captured_body is not None
        prompt_text = captured_body['messages'][0]['content'][0]['text']
        assert "You are analyzing chunk 3 of 5" in prompt_text
        assert "from pages 101 to 150" in prompt_text
        assert "Classify this document:" in prompt_text
308
+
309
+
310
if __name__ == '__main__':
    # Allow running this file directly. pytest.main() returns the run's exit
    # status; pass it on so a failing run ends with a non-zero exit code.
    raise SystemExit(pytest.main([__file__, '-v']))
312
+
313
+
314
class TestProcessingChunkAwareness:
    """Tests for handler function chunk awareness in processing mode.

    Every test runs the handler with ``INVOKE_TYPE`` set to ``processing``
    against patched ``index.bedrock``, ``index.s3``, ``index.tracer`` and
    ``index.metrics``. The mock wiring, the fake Bedrock reply, the common
    event fields and the environment override are shared through private
    helpers instead of being repeated in each test.
    """

    # Prompt template used by most tests; the handler is expected to replace
    # the [ACTUAL_CLASSIFICATION] placeholder (see the assertions below).
    _DEFAULT_PROMPT = 'Extract entities from this [ACTUAL_CLASSIFICATION]:'

    @staticmethod
    def _bedrock_reply(model_text):
        """Build a fake ``invoke_model`` response.

        Args:
            model_text: The text placed in the single ``content`` entry.

        Returns:
            A dict with a ``body`` object whose ``read()`` returns the
            JSON-encoded bytes ``{"content": [{"text": model_text}]}``.
        """
        raw = json.dumps({'content': [{'text': model_text}]}).encode()
        return {'body': MagicMock(read=lambda: raw)}

    @staticmethod
    def _wire_mocks(mock_metrics, mock_tracer, mock_s3, download_file=None):
        """Install stand-ins on the patched S3, tracer and metrics objects.

        Args:
            mock_metrics: Patched ``index.metrics``.
            mock_tracer: Patched ``index.tracer``.
            mock_s3: Patched ``index.s3``.
            download_file: Optional replacement for ``s3.download_file``;
                a fresh ``MagicMock`` is used when omitted.
        """
        mock_s3.download_file = MagicMock() if download_file is None else download_file
        mock_tracer.put_annotation = MagicMock()
        # The decorator-style hooks are replaced by pass-through callables.
        mock_tracer.capture_lambda_handler = lambda f: f
        mock_metrics.log_metrics = lambda f: f
        mock_metrics.add_dimension = MagicMock()
        mock_metrics.add_metric = MagicMock()

    @staticmethod
    def _data_event(classification, chunk_metadata=None):
        """Build an inline-data event that already carries a classification.

        Args:
            classification: Value for ``classificationResult.documentClassification``.
            chunk_metadata: Optional dict stored under ``chunkMetadata``;
                the key is left out entirely when this is None.
        """
        event = {
            'documentId': 'doc-123',
            'contentType': 'data',
            'content': {'data': 'Test document content'},
            'classificationResult': {'documentClassification': classification},
        }
        if chunk_metadata is not None:
            event['chunkMetadata'] = chunk_metadata
        return event

    @classmethod
    def _run_processing(cls, event, prompt=None):
        """Call the handler on *event* with processing environment values.

        Args:
            event: The event dict passed to the handler.
            prompt: Value for the ``PROMPT`` environment variable; the
                class-level default template is used when omitted.

        Returns:
            Whatever ``handler(event, None)`` returns.
        """
        env = {
            'INVOKE_TYPE': 'processing',
            'PROMPT': cls._DEFAULT_PROMPT if prompt is None else prompt,
            'MODEL_ID': 'test-model',
        }
        with patch.dict(os.environ, env):
            return handler(event, None)

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_processing_without_chunk_metadata(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test processing handler processes non-chunked documents correctly (backward compatibility)."""
        mock_bedrock.invoke_model.return_value = self._bedrock_reply(
            '{"documentClassification": "INVOICE", "result": {"entities": [{"type": "AMOUNT", "value": "$100.00"}]}}'
        )
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        result = self._run_processing(self._data_event('INVOICE'))

        assert result['documentClassification'] == 'INVOICE'
        assert 'result' in result
        assert 'entities' in result['result']

        # A document without chunk metadata is reported as not chunked.
        mock_metrics.add_dimension.assert_any_call(name="is_chunked", value="false")

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_processing_with_chunk_metadata(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test processing handler processes chunked documents with context."""
        mock_bedrock.invoke_model.return_value = self._bedrock_reply(
            '{"documentClassification": "CONTRACT", "result": {"entities": [{"type": "PARTY", "value": "Acme Corp", "page": 51}]}}'
        )
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        event = self._data_event('CONTRACT', {
            'chunkIndex': 1,
            'totalChunks': 3,
            'startPage': 50,
            'endPage': 99,
            'overlapPages': 5,
        })

        result = self._run_processing(event)

        assert result['documentClassification'] == 'CONTRACT'
        assert 'result' in result

        # A document with chunk metadata is reported as chunked ...
        mock_metrics.add_dimension.assert_any_call(name="is_chunked", value="true")

        # ... and its position is recorded as string-valued trace annotations.
        mock_tracer.put_annotation.assert_any_call(key="chunkIndex", value="1")
        mock_tracer.put_annotation.assert_any_call(key="totalChunks", value="3")

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_processing_prompt_includes_chunk_context(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test that the processing prompt sent to Bedrock includes chunk context."""
        seen = {}

        def capture_invoke(modelId, body):
            # Record the decoded request so the prompt can be inspected below.
            seen['body'] = json.loads(body)
            return self._bedrock_reply(
                '{"documentClassification": "INVOICE", "result": {"entities": []}}'
            )

        mock_bedrock.invoke_model = capture_invoke
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        event = self._data_event('INVOICE', {
            'chunkIndex': 2,
            'totalChunks': 5,
            'startPage': 100,
            'endPage': 149,
            'overlapPages': 0,
        })

        self._run_processing(event)

        # The request must contain the chunk context and the filled-in prompt.
        captured_body = seen.get('body')
        assert captured_body is not None
        prompt_text = captured_body['messages'][0]['content'][0]['text']
        assert "You are analyzing chunk 3 of 5" in prompt_text
        assert "from pages 101 to 150" in prompt_text
        assert "Extract entities from this INVOICE:" in prompt_text

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_processing_prompt_includes_overlap_info(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test that the processing prompt includes overlap information when applicable."""
        seen = {}

        def capture_invoke(modelId, body):
            # Record the decoded request so the prompt can be inspected below.
            seen['body'] = json.loads(body)
            return self._bedrock_reply(
                '{"documentClassification": "CONTRACT", "result": {"entities": []}}'
            )

        mock_bedrock.invoke_model = capture_invoke
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        event = self._data_event('CONTRACT', {
            'chunkIndex': 2,  # Not first chunk
            'totalChunks': 4,
            'startPage': 95,
            'endPage': 144,
            'overlapPages': 5,
        })

        self._run_processing(event)

        # A non-first chunk with overlapPages > 0 must mention the overlap.
        captured_body = seen.get('body')
        assert captured_body is not None
        prompt_text = captured_body['messages'][0]['content'][0]['text']
        assert "You are analyzing chunk 3 of 4" in prompt_text
        assert "5 overlapping pages" in prompt_text

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_processing_uses_chunk_s3_location(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test that processing uses chunk-specific S3 location when available."""
        requested = {}

        def capture_download(bucket, key, local_path):
            # Remember which object was requested, then create a dummy file
            # at the destination path for the handler to read.
            requested['bucket'] = bucket
            requested['key'] = key
            with open(local_path, 'wb') as f:
                f.write(b'%PDF-1.4 dummy content')

        mock_bedrock.invoke_model.return_value = self._bedrock_reply(
            '{"documentClassification": "INVOICE", "result": {"entities": []}}'
        )
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3, download_file=capture_download)

        # The event points at the original object, while the chunk metadata
        # carries its own bucket/key for the chunk file.
        event = {
            'documentId': 'doc-123',
            'contentType': 'file',
            'content': {
                'location': 's3',
                'bucket': 'original-bucket',
                'key': 'raw/document.pdf',
            },
            'classificationResult': {'documentClassification': 'INVOICE'},
            'chunkMetadata': {
                'chunkIndex': 1,
                'totalChunks': 3,
                'startPage': 50,
                'endPage': 99,
                'bucket': 'chunk-bucket',
                'key': 'chunks/doc-123_chunk_1.pdf',
            },
        }

        self._run_processing(event)

        # The chunk-specific location must win over the original one.
        assert requested.get('bucket') == 'chunk-bucket'
        assert requested.get('key') == 'chunks/doc-123_chunk_1.pdf'

    @patch('index.bedrock')
    @patch('index.s3')
    @patch('index.tracer')
    @patch('index.metrics')
    def test_processing_classification_replacement(self, mock_metrics, mock_tracer, mock_s3, mock_bedrock):
        """Test that [ACTUAL_CLASSIFICATION] placeholder is replaced in processing prompt."""
        seen = {}

        def capture_invoke(modelId, body):
            # Record the decoded request so the prompt can be inspected below.
            seen['body'] = json.loads(body)
            return self._bedrock_reply(
                '{"documentClassification": "RECEIPT", "result": {"entities": []}}'
            )

        mock_bedrock.invoke_model = capture_invoke
        self._wire_mocks(mock_metrics, mock_tracer, mock_s3)

        self._run_processing(
            self._data_event('RECEIPT'),
            prompt='The document is classified as [ACTUAL_CLASSIFICATION]. Extract entities:',
        )

        # The placeholder must be gone and the classification present.
        captured_body = seen.get('body')
        assert captured_body is not None
        prompt_text = captured_body['messages'][0]['content'][0]['text']
        assert "[ACTUAL_CLASSIFICATION]" not in prompt_text
        assert "RECEIPT" in prompt_text