@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,374 @@
1
+ """
2
+ Unit tests for token estimation module.
3
+
4
+ Tests cover various text densities, edge cases, and accuracy verification.
5
+ """
6
+
7
+ import unittest
8
+ from unittest.mock import Mock, patch, MagicMock
9
+ from io import BytesIO
10
+ from token_estimation import estimate_tokens_fast, analyze_pdf_tokens
11
+
12
+
13
class TestEstimateTokensFast(unittest.TestCase):
    """Checks estimate_tokens_fast against hand-computed word counts.

    Every expectation below is ``int(word_count * 1.3)`` where the words are
    the matches of the estimator's ``\\b\\w+\\b`` pattern.
    """

    def test_empty_text(self):
        """An empty string yields zero tokens."""
        self.assertEqual(estimate_tokens_fast(""), 0)

    def test_none_text(self):
        """None is treated like empty input and yields zero tokens."""
        self.assertEqual(estimate_tokens_fast(None), 0)

    def test_simple_sentence(self):
        """Two words: 2 * 1.3 = 2.6, truncated to 2."""
        estimate = estimate_tokens_fast("Hello world")
        self.assertEqual(estimate, 2)

    def test_medium_density_text(self):
        """A typical page of 1000 words lands in the medium-density band."""
        page_text = " ".join(["word"] * 1000)
        estimate = estimate_tokens_fast(page_text)
        # 1000 words * 1.3 = 1300 tokens
        self.assertEqual(estimate, 1300)
        # The estimate must sit inside the medium-density range
        self.assertGreaterEqual(estimate, 1200)
        self.assertLessEqual(estimate, 1500)

    def test_high_density_text(self):
        """A very dense page of 8000 words exceeds 10,000 tokens."""
        page_text = " ".join(["word"] * 8000)
        estimate = estimate_tokens_fast(page_text)
        # 8000 words * 1.3 = 10400 tokens
        self.assertEqual(estimate, 10400)
        self.assertGreater(estimate, 10000)

    def test_low_density_text(self):
        """A sparse page of 100 words stays below 200 tokens."""
        page_text = " ".join(["word"] * 100)
        estimate = estimate_tokens_fast(page_text)
        # 100 words * 1.3 = 130 tokens
        self.assertEqual(estimate, 130)
        self.assertLess(estimate, 200)

    def test_text_with_punctuation(self):
        """Punctuation is not counted as words."""
        sentence = "Hello, world! How are you? I'm fine, thanks."
        estimate = estimate_tokens_fast(sentence)
        # Hello, world, How, are, you, I, m, fine, thanks -> 9 words
        # 9 * 1.3 = 11.7, truncated to 11
        self.assertEqual(estimate, 11)

    def test_text_with_numbers(self):
        """Digit runs count as words."""
        sentence = "The year 2024 has 365 days and 12 months"
        estimate = estimate_tokens_fast(sentence)
        # The, year, 2024, has, 365, days, and, 12, months -> 9 words
        # 9 * 1.3 = 11.7, truncated to 11
        self.assertEqual(estimate, 11)

    def test_text_with_special_characters(self):
        """Symbols split the text into separate words."""
        contact = "Email: user@example.com, Phone: +1-555-0123"
        estimate = estimate_tokens_fast(contact)
        # Email, user, example, com, Phone, 1, 555, 0123 -> 8 words
        # 8 * 1.3 = 10.4, truncated to 10
        self.assertEqual(estimate, 10)

    def test_multiline_text(self):
        """Line breaks behave like any other word separator."""
        paragraph = """Line one with some words.
        Line two with more words.
        Line three with even more words."""
        estimate = estimate_tokens_fast(paragraph)
        # 16 words * 1.3 = 20.8, truncated to 20
        self.assertEqual(estimate, 20)

    def test_estimation_accuracy_range(self):
        """The estimate stays within 85% of an approximate real token count."""
        corpus = "The quick brown fox jumps over the lazy dog. " * 100
        estimate = estimate_tokens_fast(corpus)

        # 9 words per sentence * 100 sentences = 900 words -> 1170 tokens
        self.assertEqual(estimate, 1170)

        # A real tokenizer would produce roughly 1000-1100 tokens here, so
        # the heuristic overestimates slightly, which is the safe direction
        # for chunking decisions.
        expected_actual = 1050  # Approximate actual token count
        smaller = min(estimate, expected_actual)
        larger = max(estimate, expected_actual)
        accuracy = smaller / larger
        self.assertGreaterEqual(accuracy, 0.85)
112
+
113
+
114
class TestAnalyzePdfTokens(unittest.TestCase):
    """Test cases for the analyze_pdf_tokens function.

    S3 and PyPDF2 are mocked. The fake S3 payload starts with the ``%PDF-``
    magic bytes because analyze_pdf_tokens rejects any object that does not,
    before the (mocked) PdfReader is ever constructed.
    """

    # Smallest payload that passes the magic-byte validation in
    # analyze_pdf_tokens; the content after the header is never parsed
    # because PyPDF2.PdfReader is patched.
    FAKE_PDF_BYTES = b'%PDF-1.4 fake pdf'

    @classmethod
    def _stub_s3(cls, mock_boto_client, payload=None):
        """Make boto3.client() return an S3 mock whose get_object yields payload."""
        mock_s3 = Mock()
        mock_boto_client.return_value = mock_s3
        body = cls.FAKE_PDF_BYTES if payload is None else payload
        mock_s3.get_object.return_value = {'Body': BytesIO(body)}
        return mock_s3

    @staticmethod
    def _stub_reader(mock_pdf_reader, words_per_page):
        """Make PdfReader() return a reader with one mock page per word count.

        A word count of 0 produces a page whose extracted text is "".
        """
        mock_pages = []
        for word_count in words_per_page:
            mock_page = Mock()
            mock_page.extract_text.return_value = " ".join(["word"] * word_count)
            mock_pages.append(mock_page)

        mock_reader = Mock()
        mock_reader.pages = mock_pages
        mock_pdf_reader.return_value = mock_reader
        return mock_reader

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_small_pdf_no_chunking(self, mock_pdf_reader, mock_boto_client):
        """Test with small PDF that doesn't require chunking."""
        self._stub_s3(mock_boto_client)
        # 30 pages of 1000 words -> 1300 estimated tokens per page
        self._stub_reader(mock_pdf_reader, [1000] * 30)

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 30)
        self.assertEqual(result['total_tokens'], 39000)  # 30 * 1300
        self.assertEqual(result['avg_tokens_per_page'], 1300)
        self.assertFalse(result['requires_chunking'])  # Below 100 page threshold
        self.assertEqual(result['strategy'], 'hybrid')
        self.assertEqual(result['estimation_method'], 'word-based')

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_large_pdf_requires_chunking_pages(self, mock_pdf_reader, mock_boto_client):
        """Test with large PDF that requires chunking (page threshold)."""
        self._stub_s3(mock_boto_client)
        self._stub_reader(mock_pdf_reader, [1000] * 150)

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 150)
        self.assertTrue(result['requires_chunking'])  # Above 100 page threshold

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_large_pdf_requires_chunking_tokens(self, mock_pdf_reader, mock_boto_client):
        """Test with PDF that requires chunking (token threshold)."""
        self._stub_s3(mock_boto_client)
        # 80 very dense pages: 4000 words -> 5200 estimated tokens per page
        self._stub_reader(mock_pdf_reader, [4000] * 80)

        config = {
            'chunkingStrategy': 'token-based',
            'tokenThreshold': 150000
        }
        result = analyze_pdf_tokens('test-bucket', 'test.pdf', config)

        self.assertEqual(result['total_pages'], 80)
        self.assertEqual(result['total_tokens'], 416000)  # 80 * 5200
        self.assertTrue(result['requires_chunking'])  # Above 150000 token threshold
        self.assertEqual(result['strategy'], 'token-based')

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_fixed_pages_strategy(self, mock_pdf_reader, mock_boto_client):
        """Test with fixed-pages strategy."""
        self._stub_s3(mock_boto_client)
        self._stub_reader(mock_pdf_reader, [1000] * 50)

        config = {
            'chunkingStrategy': 'fixed-pages',
            'pageThreshold': 100
        }
        result = analyze_pdf_tokens('test-bucket', 'test.pdf', config)

        self.assertFalse(result['requires_chunking'])  # Below page threshold
        self.assertEqual(result['strategy'], 'fixed-pages')

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_empty_pages(self, mock_pdf_reader, mock_boto_client):
        """Test with PDF containing empty pages (0 tokens)."""
        self._stub_s3(mock_boto_client)
        # Even-indexed pages are empty, odd-indexed pages hold 1000 words
        self._stub_reader(
            mock_pdf_reader,
            [0 if index % 2 == 0 else 1000 for index in range(10)]
        )

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 10)
        # 5 empty pages (0 tokens) + 5 content pages (1300 tokens each) = 6500 tokens
        self.assertEqual(result['total_tokens'], 6500)
        self.assertEqual(len(result['tokens_per_page']), 10)
        # Check that empty pages have 0 tokens
        self.assertEqual(result['tokens_per_page'][0], 0)
        self.assertEqual(result['tokens_per_page'][2], 0)

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_variable_density_pages(self, mock_pdf_reader, mock_boto_client):
        """Test with PDF containing pages of varying density."""
        self._stub_s3(mock_boto_client)
        densities = [100, 500, 1000, 2000, 5000]  # words per page
        self._stub_reader(mock_pdf_reader, densities)

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 5)
        # Expected tokens: 130, 650, 1300, 2600, 6500 = 11180 total
        self.assertEqual(result['total_tokens'], 11180)
        self.assertEqual(result['tokens_per_page'][0], 130)
        self.assertEqual(result['tokens_per_page'][4], 6500)

    @patch('token_estimation.boto3.client')
    def test_s3_access_error(self, mock_boto_client):
        """Test error handling for S3 access denied."""
        from botocore.exceptions import ClientError

        # Mock S3 client to raise access denied error
        mock_s3 = Mock()
        mock_boto_client.return_value = mock_s3
        mock_s3.get_object.side_effect = ClientError(
            {'Error': {'Code': 'AccessDenied', 'Message': 'Access Denied'}},
            'GetObject'
        )

        # Verify error is raised with context
        with self.assertRaises(ClientError) as context:
            analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertIn('Failed to access S3 object', str(context.exception))

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_pdf_processing_error(self, mock_pdf_reader, mock_boto_client):
        """Test error handling for PDF processing failures."""
        self._stub_s3(mock_boto_client)

        # Mock PDF reader to raise error
        mock_pdf_reader.side_effect = Exception("Invalid PDF format")

        # Verify error is raised with context
        with self.assertRaises(Exception) as context:
            analyze_pdf_tokens('test-bucket', 'test.pdf')

        message = str(context.exception)
        self.assertIn('Failed to analyze PDF', message)
        # Proves the failure came from the reader, not from payload validation
        self.assertIn('Invalid PDF format', message)

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_non_pdf_payload_rejected(self, mock_pdf_reader, mock_boto_client):
        """Test that an object without the %PDF- header is rejected."""
        self._stub_s3(mock_boto_client, payload=b'fake pdf')

        with self.assertRaises(Exception) as context:
            analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertIn('is not a valid PDF', str(context.exception))
        # Validation happens before the parser is constructed
        mock_pdf_reader.assert_not_called()

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_default_config_values(self, mock_pdf_reader, mock_boto_client):
        """Test that default configuration values are applied."""
        self._stub_s3(mock_boto_client)
        self._stub_reader(mock_pdf_reader, [1000] * 50)

        # Analyze PDF without config (should use defaults)
        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        # Verify default strategy is used
        self.assertEqual(result['strategy'], 'hybrid')
        # With 50 pages and 65000 tokens, should not require chunking
        # (below 100 page threshold and 150000 token threshold)
        self.assertFalse(result['requires_chunking'])
371
+
372
+
373
+ if __name__ == '__main__':
374
+ unittest.main()
@@ -0,0 +1,189 @@
1
+ """
2
+ Token estimation module for PDF chunking.
3
+
4
+ This module provides fast token estimation using word-based heuristics
5
+ to determine if PDFs require chunking before processing.
6
+ """
7
+
8
+ import re
9
+ import io
10
+ import logging
11
+ from typing import Dict, List, Optional
12
+ import boto3
13
+ from botocore.exceptions import ClientError
14
+
15
+ # Import strategy selection module
16
+ from strategy_selection import select_strategy_and_check_thresholds
17
+
18
+ # Configure module logger
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def estimate_tokens_fast(text: Optional[str], tokens_per_word: float = 1.3) -> int:
    """
    Fast token estimation using a word-count heuristic.

    Counts the matches of ``\\b\\w+\\b`` in the text and multiplies by a
    tokens-per-word ratio, truncating the product to an integer. ``\\w``
    covers letters, digits and underscores, so punctuation splits words
    ("I'm" counts as two) and is never counted on its own.

    The default ratio of 1.3 is a deliberately conservative figure intended
    for English text; it leaves headroom for multi-token words, punctuation
    and whitespace tokens. Pass a different ratio for other languages or
    tokenizers.

    Args:
        text: The text to estimate tokens for. ``None`` and ``""`` both
            produce 0.
        tokens_per_word: Estimated tokens per counted word (default: 1.3).

    Returns:
        Estimated token count as an integer (truncated, not rounded).

    Raises:
        ValueError: If tokens_per_word is negative.

    Examples:
        >>> estimate_tokens_fast("Hello world")
        2
        >>> estimate_tokens_fast("The quick brown fox jumps over the lazy dog")
        11
        >>> estimate_tokens_fast("")
        0
        >>> estimate_tokens_fast("Hello world", tokens_per_word=2.0)
        4
    """
    if tokens_per_word < 0:
        raise ValueError("tokens_per_word must be non-negative")

    if not text:
        return 0

    word_count = len(re.findall(r'\b\w+\b', text))

    # int() truncates toward zero, e.g. 2 words * 1.3 = 2.6 -> 2
    return int(word_count * tokens_per_word)
65
+
66
+
67
def analyze_pdf_tokens(
    bucket: str,
    key: str,
    config: Optional[Dict] = None
) -> Dict:
    """
    Analyze the per-page token distribution of a PDF stored in S3.

    Downloads the object, verifies it starts with the ``%PDF-`` magic bytes,
    extracts text from each page with PyPDF2, estimates tokens per page with
    estimate_tokens_fast, and asks select_strategy_and_check_thresholds
    whether the document needs chunking.

    Note: the whole object is read into memory before parsing; it is not
    processed as a stream.

    Args:
        bucket: S3 bucket name containing the PDF
        key: S3 object key for the PDF
        config: Configuration dictionary passed unchanged to
            select_strategy_and_check_thresholds (``None`` becomes ``{}``).
            Optional keys, with defaults resolved by that function:
            - chunkingStrategy: 'fixed-pages', 'token-based', or 'hybrid' (default: 'hybrid')
            - pageThreshold: Maximum pages before chunking (default: 100)
            - tokenThreshold: Maximum tokens before chunking (default: 150000)

    Returns:
        Dictionary containing:
        - total_tokens: Total estimated tokens in the document
        - total_pages: Total number of pages
        - tokens_per_page: List of token counts for each page
        - avg_tokens_per_page: Average tokens per page (0 when there are no pages)
        - requires_chunking: Boolean indicating if chunking is needed
        - strategy: The strategy used for the decision
        - estimation_method: Always 'word-based'
        - selection_reason: Reason reported by the strategy selection
        - page_threshold_exceeded: Page-threshold flag from the strategy selection
        - token_threshold_exceeded: Token-threshold flag from the strategy selection

    Raises:
        ImportError: If PyPDF2 is not installed
        ClientError: If S3 access fails; the message is prefixed with the
            S3 location and the original error is chained as the cause
        Exception: If the object is not a PDF or PDF processing fails

    Examples:
        >>> result = analyze_pdf_tokens('my-bucket', 'docs/file.pdf')
        >>> print(result['total_pages'])
        150
        >>> print(result['requires_chunking'])
        True
    """
    # Import PyPDF2 here to avoid import errors if not installed
    try:
        import PyPDF2
    except ImportError as exc:
        raise ImportError(
            "PyPDF2 is required for PDF processing. "
            "Install it with: pip install PyPDF2"
        ) from exc

    # Set default configuration
    if config is None:
        config = {}

    # Initialize S3 client
    s3 = boto3.client('s3')

    try:
        # Download the full object; PdfReader is handed an in-memory,
        # seekable buffer below.
        pdf_obj = s3.get_object(Bucket=bucket, Key=key)
        body = pdf_obj['Body']
        try:
            pdf_bytes = body.read()
        finally:
            # Release the underlying HTTP connection as soon as the bytes
            # are buffered, even if read() fails.
            close = getattr(body, 'close', None)
            if callable(close):
                close()

        # Validate file is actually a PDF by checking magic bytes
        if len(pdf_bytes) < 5 or pdf_bytes[:5] != b'%PDF-':
            raise Exception(
                f"File s3://{bucket}/{key} is not a valid PDF. "
                "File must start with PDF magic bytes (%PDF-)."
            )

        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))

        # Estimate tokens page by page; estimate_tokens_fast returns 0 for
        # pages with no extractable text.
        tokens_per_page = [
            estimate_tokens_fast(page.extract_text())
            for page in pdf_reader.pages
        ]
        total_tokens = sum(tokens_per_page)
        total_pages = len(pdf_reader.pages)

        # Calculate average tokens per page
        avg_tokens_per_page = total_tokens / total_pages if total_pages > 0 else 0

        # Use strategy selection module to determine if chunking is required
        selection_result = select_strategy_and_check_thresholds(
            total_pages=total_pages,
            total_tokens=total_tokens,
            config=config
        )

        return {
            'total_tokens': total_tokens,
            'total_pages': total_pages,
            'tokens_per_page': tokens_per_page,
            'avg_tokens_per_page': avg_tokens_per_page,
            'requires_chunking': selection_result.requires_chunking,
            'strategy': selection_result.strategy,
            'estimation_method': 'word-based',
            'selection_reason': selection_result.reason,
            'page_threshold_exceeded': selection_result.page_threshold_exceeded,
            'token_threshold_exceeded': selection_result.token_threshold_exceeded
        }

    except ClientError as e:
        # Re-raise S3 errors with context. Keep the rest of the response
        # (e.g. ResponseMetadata), the real operation name, and the original
        # exception as the cause so the traceback is not lost.
        error_info = e.response.get('Error', {})
        raise ClientError(
            {
                **e.response,
                'Error': {
                    **error_info,
                    'Code': error_info.get('Code', 'Unknown'),
                    'Message': f"Failed to access S3 object s3://{bucket}/{key}: {str(e)}"
                }
            },
            getattr(e, 'operation_name', 'GetObject')
        ) from e
    except Exception as e:
        # Wrap other errors with context
        raise Exception(
            f"Failed to analyze PDF s3://{bucket}/{key}: {str(e)}"
        ) from e
@@ -6,10 +6,11 @@ const aws_s3_1 = require("aws-cdk-lib/aws-s3");
6
6
  const aws_s3_assets_1 = require("aws-cdk-lib/aws-s3-assets");
7
7
  const cdk_nag_1 = require("cdk-nag");
8
8
  const framework_1 = require("../../framework");
9
+ const test_utils_1 = require("../../utilities/test-utils");
9
10
  const adapter_1 = require("../adapter");
10
11
  const agentic_document_processing_1 = require("../agentic-document-processing");
11
- // Create app and stack
12
- const app = new aws_cdk_lib_1.App();
12
+ // Create app and stack with bundling disabled for faster tests
13
+ const app = (0, test_utils_1.createTestApp)();
13
14
  const stack = new aws_cdk_lib_1.Stack(app, 'TestStack', {
14
15
  env: {
15
16
  account: '123456789012',
@@ -109,4 +110,4 @@ test('No unsuppressed errors', () => {
109
110
  }
110
111
  expect(errors).toHaveLength(0);
111
112
  });
112
- //# sourceMappingURL=data:application/json;base64,
113
+ //# sourceMappingURL=data:application/json;base64,