@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,353 @@
1
+ """
2
+ Unit tests for chunking strategies.
3
+
4
+ Tests cover:
5
+ - Fixed-pages strategy with various page counts and chunk sizes
6
+ - Token-based strategy with variable token density
7
+ - Hybrid strategy with mixed scenarios
8
+ - Edge cases (document size equals chunk size, 1 page, last chunk smaller)
9
+ - Configuration validation
10
+ """
11
+
12
+ import unittest
13
+ from chunking_strategies import (
14
+ calculate_chunks_fixed_pages,
15
+ calculate_chunks_token_based,
16
+ calculate_chunks_hybrid,
17
+ validate_configuration,
18
+ validate_fixed_pages_config,
19
+ validate_token_based_config,
20
+ validate_hybrid_config,
21
+ ConfigurationError
22
+ )
23
+
24
+
25
class TestFixedPagesStrategy(unittest.TestCase):
    """Checks for calculate_chunks_fixed_pages.

    Every call passes (page count, chunk size, overlap pages) positionally
    and inspects the returned sequence of chunk dictionaries.
    """

    def test_basic_chunking(self):
        """150 pages / 50 per chunk / no overlap -> three full, contiguous chunks."""
        result = calculate_chunks_fixed_pages(150, 50, 0)

        expected_chunks = (
            {'chunk_index': 0, 'start_page': 0, 'end_page': 49, 'page_count': 50},
            {'chunk_index': 1, 'start_page': 50, 'end_page': 99, 'page_count': 50},
            {'chunk_index': 2, 'start_page': 100, 'end_page': 149, 'page_count': 50},
        )

        # Length is checked first so a wrong chunk count is reported before
        # any per-chunk mismatch.
        self.assertEqual(len(result), 3)
        for position, expected in enumerate(expected_chunks):
            self.assertEqual(result[position], expected)

    def test_chunking_with_overlap(self):
        """A 5-page overlap pulls the second chunk's start back to page 45."""
        result = calculate_chunks_fixed_pages(150, 50, 5)

        self.assertEqual(len(result), 3)

        # The first chunk has nothing before it, so it is unaffected by overlap.
        first_chunk = result[0]
        self.assertEqual(first_chunk['start_page'], 0)
        self.assertEqual(first_chunk['end_page'], 49)

        # The second chunk begins 5 pages before page 50.
        second_chunk = result[1]
        self.assertEqual(second_chunk['start_page'], 45)

    def test_exact_division(self):
        """100 pages split into 50-page chunks gives exactly two full chunks."""
        result = calculate_chunks_fixed_pages(100, 50, 0)

        self.assertEqual(len(result), 2)
        for position in (0, 1):
            self.assertEqual(result[position]['page_count'], 50)

    def test_last_chunk_smaller(self):
        """125 pages split into 50-page chunks leaves a 25-page final chunk."""
        result = calculate_chunks_fixed_pages(125, 50, 0)

        self.assertEqual(len(result), 3)
        # Only the trailing chunk is allowed to be short.
        for position, expected_pages in enumerate((50, 50, 25)):
            self.assertEqual(result[position]['page_count'], expected_pages)

    def test_single_page(self):
        """A one-page document produces a single chunk covering page 0 only."""
        result = calculate_chunks_fixed_pages(1, 50, 0)

        only_chunk = {'chunk_index': 0, 'start_page': 0, 'end_page': 0, 'page_count': 1}
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], only_chunk)

    def test_document_equals_chunk_size(self):
        """A document exactly one chunk long is not split further."""
        result = calculate_chunks_fixed_pages(50, 50, 0)

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['page_count'], 50)
82
+
83
+
84
class TestTokenBasedStrategy(unittest.TestCase):
    """Checks for calculate_chunks_token_based.

    Every call passes (per-page token counts, token limit, overlap tokens)
    positionally and inspects the returned chunk dictionaries.
    """

    def test_uniform_density(self):
        """100 pages of 1500 tokens need two chunks under a 100000-token limit."""
        page_tokens = [1500 for _ in range(100)]
        result = calculate_chunks_token_based(page_tokens, 100000, 0)

        # 150000 tokens in total cannot fit into one 100000-token chunk.
        self.assertEqual(len(result), 2)
        within_limit = all(part['token_count'] <= 100000 for part in result)
        self.assertTrue(within_limit)

    def test_variable_density(self):
        """Chunks stay under the token limit when page density changes abruptly."""
        sparse_pages = [500] * 50
        dense_pages = [5000] * 50
        result = calculate_chunks_token_based(sparse_pages + dense_pages, 100000, 0)

        within_limit = all(part['token_count'] <= 100000 for part in result)
        self.assertTrue(within_limit)

    def test_with_overlap(self):
        """Consecutive chunks end no more than 3 pages short of the next start."""
        page_tokens = [2000 for _ in range(100)]
        result = calculate_chunks_token_based(page_tokens, 80000, 5000)

        self.assertGreater(len(result), 1)
        # NOTE(review): the 3-page slack means this check also passes when
        # adjacent chunks do not share any page; confirm whether a strict
        # overlap assertion was intended.
        for current, following in zip(result, result[1:]):
            self.assertGreaterEqual(current['end_page'], following['start_page'] - 3)

    def test_single_page_exceeds_limit(self):
        """One oversized page is still emitted as a single chunk."""
        result = calculate_chunks_token_based([150000], 100000, 0)

        # A page is the smallest unit here, so the chunk keeps all 150000 tokens.
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['token_count'], 150000)

    def test_exact_token_fit(self):
        """Ten 10000-token pages exactly fill one 100000-token chunk."""
        page_tokens = [10000 for _ in range(10)]
        result = calculate_chunks_token_based(page_tokens, 100000, 0)

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['token_count'], 100000)
132
+
133
+
134
class TestHybridStrategy(unittest.TestCase):
    """Checks for calculate_chunks_hybrid.

    Every call passes (per-page token counts, 80000, 100, overlap tokens)
    positionally; the tests treat 100 as the per-chunk page ceiling.
    """

    def test_token_limit_triggers(self):
        """Dense pages produce at least one chunk finalized for 'token_limit'."""
        page_tokens = [5000 for _ in range(100)]
        result = calculate_chunks_hybrid(page_tokens, 80000, 100, 5000)

        self.assertGreater(len(result), 1)
        self.assertTrue(all(part['page_count'] <= 100 for part in result))

        # Count chunks whose recorded reason is the token limit.
        token_limited = sum(
            1 for part in result if part.get('finalize_reason') == 'token_limit'
        )
        self.assertGreater(token_limited, 0)

    def test_page_limit_triggers(self):
        """Sparse pages produce at least one chunk finalized for 'page_limit'."""
        page_tokens = [500 for _ in range(200)]
        result = calculate_chunks_hybrid(page_tokens, 80000, 100, 5000)

        self.assertGreaterEqual(len(result), 2)

        # Count chunks whose recorded reason is the page limit.
        page_limited = sum(
            1 for part in result if part.get('finalize_reason') == 'page_limit'
        )
        self.assertGreater(page_limited, 0)

    def test_mixed_density(self):
        """Limits hold when every 10th page is dense (8000) and the rest sparse (1000)."""
        page_tokens = [8000 if page % 10 == 0 else 1000 for page in range(200)]

        result = calculate_chunks_hybrid(page_tokens, 80000, 100, 5000)

        self.assertTrue(all(part['page_count'] <= 100 for part in result))
        # The ceiling checked here (100000) is deliberately looser than the
        # 80000 passed in, leaving tolerance for overlap.
        self.assertTrue(all(part['token_count'] <= 100000 for part in result))

    def test_respects_both_limits(self):
        """Each chunk stays within 100 pages and a 110000-token tolerance."""
        page_tokens = [2000 for _ in range(200)]
        result = calculate_chunks_hybrid(page_tokens, 80000, 100, 5000)

        for chunk in result:
            self.assertLessEqual(chunk['page_count'], 100, f"Page limit violated: {chunk['page_count']}")
            # The token bound includes tolerance for overlap.
            self.assertLessEqual(chunk['token_count'], 110000, f"Token limit violated: {chunk['token_count']}")

    def test_overlap_limited_to_10_pages(self):
        """A large overlap target (20000) never yields more than 10 shared pages."""
        page_tokens = [1000 for _ in range(200)]
        result = calculate_chunks_hybrid(page_tokens, 80000, 100, 20000)

        # Shared pages = inclusive span from the next chunk's start to this
        # chunk's end.
        for current, following in zip(result, result[1:]):
            overlap_pages = current['end_page'] - following['start_page'] + 1
            self.assertLessEqual(overlap_pages, 10, f"Overlap exceeds 10 pages: {overlap_pages}")
195
+
196
+
197
class TestEdgeCases(unittest.TestCase):
    """Boundary inputs run through the fixed-pages, token-based and hybrid calculators."""

    def test_single_page_fixed(self):
        """Fixed-pages: a one-page document yields one single-page chunk."""
        result = calculate_chunks_fixed_pages(1, 50, 0)
        self.assertEqual(len(result), 1)
        sole_chunk = result[0]
        self.assertEqual(sole_chunk['page_count'], 1)

    def test_single_page_token_based(self):
        """Token-based: a one-page document keeps its full token count."""
        result = calculate_chunks_token_based([50000], 100000, 0)
        self.assertEqual(len(result), 1)
        sole_chunk = result[0]
        self.assertEqual(sole_chunk['token_count'], 50000)

    def test_single_page_hybrid(self):
        """Hybrid: a one-page document keeps its full token count."""
        result = calculate_chunks_hybrid([50000], 80000, 100, 5000)
        self.assertEqual(len(result), 1)
        sole_chunk = result[0]
        self.assertEqual(sole_chunk['token_count'], 50000)

    def test_empty_pages_token_based(self):
        """Zero-token pages are accepted and add nothing to the chunk total."""
        # Empty and non-empty pages alternate; the non-empty ones sum to 3000.
        page_tokens = [0, 1000, 0, 2000, 0]
        result = calculate_chunks_token_based(page_tokens, 10000, 0)

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['token_count'], 3000)

    def test_very_large_document(self):
        """A 1000-page document splits into more than 10 chunks of at most 100 pages."""
        page_tokens = [1500 for _ in range(1000)]
        result = calculate_chunks_hybrid(page_tokens, 80000, 100, 5000)

        self.assertGreater(len(result), 10)
        pages_within_limit = all(part['page_count'] <= 100 for part in result)
        self.assertTrue(pages_within_limit)
235
+
236
+
237
class TestConfigurationValidation(unittest.TestCase):
    """Checks for validate_configuration and the per-strategy validate_* helpers."""

    def test_valid_fixed_pages_config(self):
        """A positive chunk size, smaller overlap and positive threshold validate."""
        outcome = validate_configuration(
            {'chunkSize': 50, 'overlapPages': 5, 'pageThreshold': 100}
        )
        self.assertTrue(outcome)

    def test_invalid_chunk_size(self):
        """A negative chunk size is reported as invalid."""
        outcome = validate_configuration({'chunkSize': -10, 'overlapPages': 5})
        self.assertFalse(outcome)

    def test_invalid_chunk_size_zero(self):
        """A zero chunk size is reported as invalid."""
        outcome = validate_configuration({'chunkSize': 0, 'overlapPages': 5})
        self.assertFalse(outcome)

    def test_invalid_overlap_negative(self):
        """A negative overlap is reported as invalid."""
        outcome = validate_configuration({'chunkSize': 50, 'overlapPages': -5})
        self.assertFalse(outcome)

    def test_overlap_exceeds_chunk_size(self):
        """An overlap larger than the chunk size is reported as invalid."""
        outcome = validate_configuration({'chunkSize': 50, 'overlapPages': 60})
        self.assertFalse(outcome)

    def test_overlap_equals_chunk_size(self):
        """An overlap equal to the chunk size is reported as invalid."""
        outcome = validate_configuration({'chunkSize': 50, 'overlapPages': 50})
        self.assertFalse(outcome)

    def test_invalid_threshold(self):
        """A negative page threshold is reported as invalid."""
        outcome = validate_configuration({'pageThreshold': -100})
        self.assertFalse(outcome)

    def test_strict_mode_raises_exception(self):
        """With 'strict' set, an invalid config raises ConfigurationError."""
        strict_config = {'chunkSize': -10, 'strict': True}
        self.assertRaises(ConfigurationError, validate_configuration, strict_config)

    def test_validate_fixed_pages_config_valid(self):
        """validate_fixed_pages_config accepts (50, 5, 100) without raising."""
        # Passing means the call returns normally; there is nothing to assert.
        validate_fixed_pages_config(50, 5, 100)

    def test_validate_fixed_pages_config_invalid(self):
        """validate_fixed_pages_config rejects a negative first argument."""
        self.assertRaises(ConfigurationError, validate_fixed_pages_config, -50, 5, 100)

    def test_validate_token_based_config_valid(self):
        """validate_token_based_config accepts (100000, 5000, 150000) without raising."""
        # Passing means the call returns normally; there is nothing to assert.
        validate_token_based_config(100000, 5000, 150000)

    def test_validate_token_based_config_invalid(self):
        """validate_token_based_config rejects a negative first argument."""
        self.assertRaises(
            ConfigurationError, validate_token_based_config, -100000, 5000, 150000
        )

    def test_validate_hybrid_config_valid(self):
        """validate_hybrid_config accepts (80000, 100, 5000) without raising."""
        # Passing means the call returns normally; there is nothing to assert.
        validate_hybrid_config(80000, 100, 5000)

    def test_validate_hybrid_config_invalid(self):
        """validate_hybrid_config rejects a negative first argument."""
        self.assertRaises(ConfigurationError, validate_hybrid_config, -80000, 100, 5000)

    def test_multiple_validation_errors(self):
        """A strict config with three bad fields names all three in the error text."""
        broken_config = {
            'chunkSize': -10,
            'overlapPages': -5,
            'pageThreshold': 0,
            'strict': True,
        }
        with self.assertRaises(ConfigurationError) as raised:
            validate_configuration(broken_config)

        # The single exception message must mention every failing field.
        message = str(raised.exception)
        for fragment in ('chunk_size', 'overlap', 'threshold'):
            self.assertIn(fragment, message)
327
+
328
+
329
class TestStrategyComparison(unittest.TestCase):
    """Runs the three chunk calculators against one shared synthetic document."""

    def test_all_strategies_on_same_document(self):
        """Each strategy returns chunks, and each stays inside its asserted limit."""
        # Shared input: 150 pages at a uniform 1500 tokens per page.
        page_tokens = [1500 for _ in range(150)]

        by_pages = calculate_chunks_fixed_pages(150, 50, 5)
        by_tokens = calculate_chunks_token_based(page_tokens, 100000, 5000)
        by_both = calculate_chunks_hybrid(page_tokens, 80000, 100, 5000)

        # No strategy may come back empty.
        for produced in (by_pages, by_tokens, by_both):
            self.assertGreater(len(produced), 0)

        # Token-based output is held to the 100000-token limit it was given.
        tokens_ok = all(part['token_count'] <= 100000 for part in by_tokens)
        self.assertTrue(tokens_ok)

        # Hybrid output is held to the 100-page ceiling.
        pages_ok = all(part['page_count'] <= 100 for part in by_both)
        self.assertTrue(pages_ok)
350
+
351
+
352
# Run the suite when this file is executed directly rather than imported.
if __name__ == '__main__':
    unittest.main()