@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/.jsii +2537 -204
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.js +1 -1
  66. package/lib/framework/tests/access-log.test.js +5 -2
  67. package/lib/framework/tests/batch-agent.test.js +5 -2
  68. package/lib/framework/tests/bedrock.test.js +5 -2
  69. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  70. package/lib/framework/tests/framework-nag.test.js +16 -8
  71. package/lib/framework/tests/network.test.js +9 -4
  72. package/lib/tsconfig.tsbuildinfo +1 -1
  73. package/lib/utilities/data-loader.js +1 -1
  74. package/lib/utilities/lambda-iam-utils.js +1 -1
  75. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  76. package/lib/utilities/observability/default-observability-config.js +1 -1
  77. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  78. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  79. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  80. package/lib/utilities/observability/powertools-config.js +19 -3
  81. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  82. package/lib/utilities/test-utils.d.ts +43 -0
  83. package/lib/utilities/test-utils.js +56 -0
  84. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  85. package/lib/utilities/tests/data-loader.test.js +3 -2
  86. package/lib/webapp/frontend-construct.js +1 -1
  87. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  88. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  89. package/package.json +6 -5
  90. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  91. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,609 @@
1
+ """
2
+ Unit tests for PDF analysis and chunking Lambda handler.
3
+
4
+ Tests cover:
5
+ - Small documents (no chunking)
6
+ - Large documents (chunking required)
7
+ - Invalid PDF format
8
+ - S3 access denied
9
+ - Configuration validation
10
+ - Error handling
11
+ """
12
+
13
+ import unittest
14
+ from unittest.mock import Mock, patch, MagicMock
15
+ import json
16
+ import io
17
+ from botocore.exceptions import ClientError
18
+
19
+ # Import handler functions
20
+ from handler import (
21
+ lambda_handler,
22
+ _merge_configuration,
23
+ _get_no_chunking_reason,
24
+ _calculate_chunk_boundaries,
25
+ _get_strategy_config,
26
+ _create_error_response,
27
+ _split_and_upload_pdf,
28
+ _upload_chunk_with_retry
29
+ )
30
+
31
+
32
class TestLambdaHandler(unittest.TestCase):
    """End-to-end checks of ``lambda_handler`` with its collaborators mocked.

    Where a test needs them, token analysis, configuration validation,
    boundary calculation and chunk upload are replaced with mocks; the
    assertions only inspect the dictionary returned by the handler.
    """

    @staticmethod
    def _s3_pdf_event(document_id, key, filename):
        """Return a fresh S3 file event carrying the hybrid chunking config."""
        return {
            'documentId': document_id,
            'contentType': 'file',
            'content': {
                'bucket': 'test-bucket',
                'key': key,
                'location': 's3',
                'filename': filename,
            },
            'config': {
                'strategy': 'hybrid',
                'pageThreshold': 100,
                'tokenThreshold': 150000,
            },
        }

    @staticmethod
    def _uniform_token_analysis(total_tokens, page_count, tokens_each,
                                needs_chunking):
        """Return a mocked analysis result where every page weighs the same."""
        return {
            'total_tokens': total_tokens,
            'total_pages': page_count,
            'avg_tokens_per_page': tokens_each,
            'tokens_per_page': [tokens_each] * page_count,
            'requires_chunking': needs_chunking,
            'strategy': 'hybrid',
        }

    def _activate_patch(self, target):
        """Patch *target* for the rest of the current test and return the mock."""
        patcher = patch(target)
        replacement = patcher.start()
        # Undo the patch automatically when the test finishes.
        self.addCleanup(patcher.stop)
        return replacement

    def setUp(self):
        """Build one fixture event per document-size scenario."""
        self.small_doc_event = self._s3_pdf_event(
            'doc-123', 'raw/small-doc.pdf', 'small-doc.pdf'
        )
        self.large_doc_event = self._s3_pdf_event(
            'doc-456', 'raw/large-doc.pdf', 'large-doc.pdf'
        )

    def test_small_document_no_chunking(self):
        """A document reported as not needing chunking is passed through."""
        validate_mock = self._activate_patch('handler.validate_configuration')
        analyze_mock = self._activate_patch('handler.analyze_pdf_tokens')

        validate_mock.return_value = True
        analyze_mock.return_value = self._uniform_token_analysis(
            45000, 30, 1500, False
        )

        response = lambda_handler(self.small_doc_event, None)

        self.assertEqual(response['documentId'], 'doc-123')
        self.assertFalse(response['requiresChunking'])
        analysis = response['tokenAnalysis']
        self.assertEqual(analysis['totalTokens'], 45000)
        self.assertEqual(analysis['totalPages'], 30)
        self.assertIn('reason', response)
        self.assertIn('30 pages', response['reason'])

    def test_large_document_with_chunking(self):
        """A document reported as needing chunking returns its chunk list."""
        validate_mock = self._activate_patch('handler.validate_configuration')
        analyze_mock = self._activate_patch('handler.analyze_pdf_tokens')
        boundaries_mock = self._activate_patch(
            'handler._calculate_chunk_boundaries'
        )
        upload_mock = self._activate_patch('handler._split_and_upload_pdf')

        validate_mock.return_value = True
        analyze_mock.return_value = self._uniform_token_analysis(
            200000, 150, 1333, True
        )

        # (first page, last page, estimated tokens) for each mocked chunk;
        # the two chunks overlap on pages 70-74.
        spans = [(0, 74, 100000), (70, 149, 106640)]

        boundaries_mock.return_value = [
            {
                'chunk_index': idx,
                'start_page': first,
                'end_page': last,
                'page_count': last - first + 1,
            }
            for idx, (first, last, _tokens) in enumerate(spans)
        ]
        upload_mock.return_value = [
            {
                'chunkId': f'doc-456_chunk_{idx}',
                'chunkIndex': idx,
                'totalChunks': 2,
                'startPage': first,
                'endPage': last,
                'pageCount': last - first + 1,
                'estimatedTokens': tokens,
                'bucket': 'test-bucket',
                'key': f'chunks/doc-456/doc-456_chunk_{idx}.pdf',
            }
            for idx, (first, last, tokens) in enumerate(spans)
        ]

        response = lambda_handler(self.large_doc_event, None)

        self.assertEqual(response['documentId'], 'doc-456')
        self.assertTrue(response['requiresChunking'])
        analysis = response['tokenAnalysis']
        self.assertEqual(analysis['totalTokens'], 200000)
        self.assertEqual(analysis['totalPages'], 150)
        self.assertEqual(response['strategy'], 'hybrid')
        self.assertEqual(len(response['chunks']), 2)
        self.assertEqual(response['chunks'][0]['chunkId'], 'doc-456_chunk_0')
        self.assertEqual(response['chunks'][1]['chunkId'], 'doc-456_chunk_1')

    def test_invalid_pdf_format(self):
        """An ``InvalidPDFFormatError`` from analysis becomes an error response."""
        from error_handling import InvalidPDFFormatError

        validate_mock = self._activate_patch('handler.validate_configuration')
        analyze_mock = self._activate_patch('handler.analyze_pdf_tokens')

        validate_mock.return_value = True
        analyze_mock.side_effect = InvalidPDFFormatError(
            message="Invalid or corrupted PDF format",
            document_id="doc-123"
        )

        response = lambda_handler(self.small_doc_event, None)

        self.assertEqual(response['documentId'], 'doc-123')
        self.assertFalse(response['requiresChunking'])
        self.assertIn('error', response)
        failure = response['error']
        self.assertEqual(failure['type'], 'InvalidPDFFormat')
        self.assertIn('Invalid', failure['message'])

    def test_s3_access_denied(self):
        """An ``AccessDenied`` ``ClientError`` is reported as ``S3AccessDenied``."""
        validate_mock = self._activate_patch('handler.validate_configuration')
        analyze_mock = self._activate_patch('handler.analyze_pdf_tokens')

        validate_mock.return_value = True
        denied = {'Error': {'Code': 'AccessDenied', 'Message': 'Access Denied'}}
        analyze_mock.side_effect = ClientError(denied, 'GetObject')

        response = lambda_handler(self.small_doc_event, None)

        self.assertEqual(response['documentId'], 'doc-123')
        self.assertFalse(response['requiresChunking'])
        self.assertIn('error', response)
        self.assertEqual(response['error']['type'], 'S3AccessDenied')

    def test_missing_document_id(self):
        """An event without ``documentId`` yields an error naming that field."""
        payload = {
            'contentType': 'file',
            'content': {
                'bucket': 'test-bucket',
                'key': 'raw/doc.pdf',
            },
        }

        response = lambda_handler(payload, None)

        self.assertIn('error', response)
        self.assertIn('documentId', response['error']['message'])

    def test_missing_content_fields(self):
        """An empty ``content`` mapping yields an error mentioning ``bucket``."""
        payload = {
            'documentId': 'doc-789',
            'contentType': 'file',
            'content': {},
        }

        response = lambda_handler(payload, None)

        self.assertIn('error', response)
        self.assertIn('bucket', response['error']['message'])

    def test_actual_sqs_consumer_payload(self):
        """The payload shape sent by the SQS consumer is processed normally."""
        # Mirrors the format the SQS consumer sends, including the extra
        # event metadata fields and the absence of a 'config' entry.
        payload = {
            'documentId': 'invoice-2024-001-1705315800000',
            'contentType': 'file',
            'content': {
                'location': 's3',
                'bucket': 'my-document-bucket',
                'key': 'raw/invoice-2024-001.pdf',
                'filename': 'invoice-2024-001.pdf',
            },
            'eventTime': '2024-01-15T10:30:00.000Z',
            'eventName': 'ObjectCreated:Put',
            'source': 'sqs-consumer',
        }

        analyze_mock = self._activate_patch('handler.analyze_pdf_tokens')
        validate_mock = self._activate_patch('handler.validate_configuration')

        validate_mock.return_value = True
        analyze_mock.return_value = self._uniform_token_analysis(
            45000, 30, 1500, False
        )

        response = lambda_handler(payload, None)

        self.assertEqual(
            response['documentId'], 'invoice-2024-001-1705315800000'
        )
        self.assertFalse(response['requiresChunking'])
        self.assertIn('tokenAnalysis', response)

    def test_unsupported_content_type(self):
        """A ``contentType`` other than ``file`` yields an error naming it."""
        payload = {
            'documentId': 'doc-999',
            'contentType': 'url',  # Not supported
            'content': {
                'bucket': 'test-bucket',
                'key': 'raw/doc.pdf',
            },
        }

        response = lambda_handler(payload, None)

        self.assertIn('error', response)
        self.assertIn('contentType', response['error']['message'])

    def test_non_pdf_file_extension(self):
        """A ``.txt`` key whose analysis raises yields an error response."""
        payload = {
            'documentId': 'doc-888',
            'contentType': 'file',
            'content': {
                'bucket': 'test-bucket',
                'key': 'raw/document.txt',  # Not a PDF extension
                'filename': 'document.txt',
            },
        }

        analyze_mock = self._activate_patch('handler.analyze_pdf_tokens')
        validate_mock = self._activate_patch('handler.validate_configuration')

        validate_mock.return_value = True
        # Simulate the analysis step rejecting the file contents.
        analyze_mock.side_effect = Exception("File is not a valid PDF")

        response = lambda_handler(payload, None)

        self.assertIn('error', response)
319
+
320
+
321
class TestConfigurationMerging(unittest.TestCase):
    """Test cases for configuration merging logic.

    ``_merge_configuration`` consults ``os.environ`` when it is called, so
    every test here runs against an explicitly controlled environment
    (``clear=True``). Without that, a chunking variable exported in the
    developer's shell or CI job would silently change the "default" values
    these tests assert on.
    """

    @patch.dict('os.environ', {}, clear=True)
    def test_default_configuration(self):
        """With no event config and an empty environment, defaults apply."""
        config = _merge_configuration({})

        self.assertEqual(config['strategy'], 'hybrid')
        self.assertEqual(config['pageThreshold'], 100)
        self.assertEqual(config['tokenThreshold'], 150000)
        self.assertEqual(config['targetTokensPerChunk'], 80000)
        # maxPagesPerChunk is 99 (not 100) to stay under Bedrock's 100-page limit
        self.assertEqual(config['maxPagesPerChunk'], 99)

    @patch.dict('os.environ', {}, clear=True)
    def test_event_configuration_override(self):
        """Event configuration overrides defaults; unspecified keys keep them."""
        event_config = {
            'strategy': 'token-based',
            'tokenThreshold': 200000,
            'maxTokensPerChunk': 120000
        }

        config = _merge_configuration(event_config)

        self.assertEqual(config['strategy'], 'token-based')
        self.assertEqual(config['tokenThreshold'], 200000)
        self.assertEqual(config['maxTokensPerChunk'], 120000)
        # Defaults still apply for unspecified values; this only holds
        # because the environment was cleared for the test.
        self.assertEqual(config['pageThreshold'], 100)

    @patch.dict('os.environ', {
        'CHUNKING_STRATEGY': 'fixed-pages',
        'PAGE_THRESHOLD': '150',
        'CHUNK_SIZE': '75'
    }, clear=True)
    def test_environment_variable_configuration(self):
        """Environment variables override defaults and are parsed to ints."""
        config = _merge_configuration({})

        self.assertEqual(config['strategy'], 'fixed-pages')
        self.assertEqual(config['pageThreshold'], 150)
        self.assertEqual(config['chunkSize'], 75)

    @patch.dict('os.environ', {
        'CHUNKING_STRATEGY': 'fixed-pages',
        'PAGE_THRESHOLD': '150'
    }, clear=True)
    def test_event_overrides_environment(self):
        """Event configuration takes precedence over environment variables."""
        event_config = {
            'strategy': 'hybrid',
            'pageThreshold': 200
        }

        config = _merge_configuration(event_config)

        # Event config should override environment
        self.assertEqual(config['strategy'], 'hybrid')
        self.assertEqual(config['pageThreshold'], 200)
380
+
381
+
382
class TestChunkBoundaryCalculation(unittest.TestCase):
    """Checks on ``_calculate_chunk_boundaries`` for each chunking strategy."""

    def test_fixed_pages_strategy(self):
        """Fixed-pages: the first chunk starts at page 0 and spans chunkSize."""
        analysis = dict(total_pages=150, tokens_per_page=[1500] * 150)
        settings = dict(strategy='fixed-pages', chunkSize=50, overlapPages=5)

        boundaries = _calculate_chunk_boundaries(analysis, settings)

        self.assertGreater(len(boundaries), 0)
        first = boundaries[0]
        self.assertEqual(first['start_page'], 0)
        self.assertEqual(first['page_count'], 50)

    def test_token_based_strategy(self):
        """Token-based: no chunk exceeds the token cap plus the overlap."""
        analysis = dict(total_pages=100, tokens_per_page=[2000] * 100)
        settings = dict(
            strategy='token-based',
            maxTokensPerChunk=100000,
            overlapTokens=5000,
        )

        boundaries = _calculate_chunk_boundaries(analysis, settings)

        self.assertGreater(len(boundaries), 0)
        # The overlap is allowed on top of the per-chunk maximum.
        ceiling = 100000 + 5000
        for piece in boundaries:
            self.assertLessEqual(piece['token_count'], ceiling)

    def test_hybrid_strategy(self):
        """Hybrid: no chunk exceeds the configured page cap."""
        analysis = dict(total_pages=200, tokens_per_page=[1500] * 200)
        settings = dict(
            strategy='hybrid',
            targetTokensPerChunk=80000,
            maxPagesPerChunk=100,
            overlapTokens=5000,
        )

        boundaries = _calculate_chunk_boundaries(analysis, settings)

        self.assertGreater(len(boundaries), 0)
        for piece in boundaries:
            self.assertLessEqual(piece['page_count'], 100)
444
+
445
+
446
class TestErrorResponses(unittest.TestCase):
    """Checks on error-response construction and no-chunking reason text."""

    def test_create_error_response(self):
        """The error response echoes the id, type and message it was given."""
        payload = _create_error_response(
            'doc-123', 'InvalidPDF', 'PDF format is invalid'
        )

        self.assertEqual(payload['documentId'], 'doc-123')
        self.assertFalse(payload['requiresChunking'])
        details = payload['error']
        self.assertEqual(details['type'], 'InvalidPDF')
        self.assertEqual(details['message'], 'PDF format is invalid')

    def test_get_no_chunking_reason_fixed_pages(self):
        """Fixed-pages reason mentions page count, threshold and strategy."""
        analysis = dict(total_pages=50, total_tokens=75000)
        settings = dict(strategy='fixed-pages', pageThreshold=100)

        explanation = _get_no_chunking_reason(analysis, settings)

        self.assertIn('50 pages', explanation)
        self.assertIn('100', explanation)
        self.assertIn('fixed-pages', explanation)

    def test_get_no_chunking_reason_hybrid(self):
        """Hybrid reason mentions page count, token count and strategy."""
        analysis = dict(total_pages=80, total_tokens=120000)
        settings = dict(
            strategy='hybrid',
            pageThreshold=100,
            tokenThreshold=150000,
        )

        explanation = _get_no_chunking_reason(analysis, settings)

        self.assertIn('80 pages', explanation)
        self.assertIn('120000 tokens', explanation)
        self.assertIn('hybrid', explanation)
498
+
499
+
500
class TestPDFValidation(unittest.TestCase):
    """Checks on ``_is_valid_pdf`` against leading-byte signatures."""

    @staticmethod
    def _looks_like_pdf(blob):
        """Import the validator lazily and apply it to *blob*."""
        from handler import _is_valid_pdf

        return _is_valid_pdf(blob)

    def test_valid_pdf_magic_bytes(self):
        """Data beginning with ``%PDF-`` is accepted."""
        accepted_samples = (
            b'%PDF-1.4\n%\xe2\xe3\xcf\xd3\n',  # header plus binary marker line
            b'%PDF-1.7\n...',                  # a different PDF version
        )
        for sample in accepted_samples:
            self.assertTrue(self._looks_like_pdf(sample))

    def test_invalid_pdf_magic_bytes(self):
        """Data with a different leading signature is rejected."""
        rejected_samples = (
            b'<html><body>Not a PDF</body></html>',  # HTML file
            b'This is just a text file',             # text file
            b'RIFF....WAVE',                         # other binary format
        )
        for sample in rejected_samples:
            self.assertFalse(self._looks_like_pdf(sample))

    def test_empty_or_short_file(self):
        """Empty input and input shorter than five bytes are rejected."""
        rejected_samples = (
            b'',       # empty file
            b'%PDF',   # four bytes: one short of the full '%PDF-' marker
            b'%',
        )
        for sample in rejected_samples:
            self.assertFalse(self._looks_like_pdf(sample))

    def test_pdf_with_leading_whitespace(self):
        """A ``%PDF-`` marker preceded by whitespace is rejected."""
        # The validator is expected to require the marker at offset 0.
        rejected_samples = (
            b' %PDF-1.4\n',
            b'\n%PDF-1.4\n',
        )
        for sample in rejected_samples:
            self.assertFalse(self._looks_like_pdf(sample))
552
+
553
+
554
class TestStrategyConfig(unittest.TestCase):
    """Checks on ``_get_strategy_config`` key extraction per strategy."""

    def test_get_fixed_pages_config(self):
        """Fixed-pages extraction keeps chunkSize, overlapPages, pageThreshold."""
        full_settings = dict(
            strategy='fixed-pages',
            chunkSize=50,
            overlapPages=5,
            pageThreshold=100,
        )

        extracted = _get_strategy_config(full_settings)

        for expected_key in ('chunkSize', 'overlapPages', 'pageThreshold'):
            self.assertIn(expected_key, extracted)
        self.assertEqual(extracted['chunkSize'], 50)

    def test_get_token_based_config(self):
        """Token-based extraction keeps the token cap, overlap and threshold."""
        full_settings = dict(
            strategy='token-based',
            maxTokensPerChunk=100000,
            overlapTokens=5000,
            tokenThreshold=150000,
        )

        extracted = _get_strategy_config(full_settings)

        for expected_key in (
            'maxTokensPerChunk', 'overlapTokens', 'tokenThreshold'
        ):
            self.assertIn(expected_key, extracted)
        self.assertEqual(extracted['maxTokensPerChunk'], 100000)

    def test_get_hybrid_config(self):
        """Hybrid extraction keeps the token target, page cap and overlap."""
        full_settings = dict(
            strategy='hybrid',
            targetTokensPerChunk=80000,
            maxPagesPerChunk=100,
            overlapTokens=5000,
            pageThreshold=100,
            tokenThreshold=150000,
        )

        extracted = _get_strategy_config(full_settings)

        for expected_key in (
            'targetTokensPerChunk', 'maxPagesPerChunk', 'overlapTokens'
        ):
            self.assertIn(expected_key, extracted)
        self.assertEqual(extracted['targetTokensPerChunk'], 80000)
606
+
607
+
608
# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()