@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +2537 -204
- package/lib/document-processing/adapter/adapter.d.ts +4 -2
- package/lib/document-processing/adapter/adapter.js +1 -1
- package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
- package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
- package/lib/document-processing/agentic-document-processing.d.ts +4 -0
- package/lib/document-processing/agentic-document-processing.js +20 -10
- package/lib/document-processing/base-document-processing.d.ts +54 -2
- package/lib/document-processing/base-document-processing.js +136 -82
- package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
- package/lib/document-processing/bedrock-document-processing.js +717 -77
- package/lib/document-processing/chunking-config.d.ts +614 -0
- package/lib/document-processing/chunking-config.js +5 -0
- package/lib/document-processing/default-document-processing-config.js +1 -1
- package/lib/document-processing/index.d.ts +1 -0
- package/lib/document-processing/index.js +2 -1
- package/lib/document-processing/resources/aggregation/handler.py +567 -0
- package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
- package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
- package/lib/document-processing/resources/cleanup/handler.py +276 -0
- package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
- package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
- package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
- package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
- package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
- package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
- package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
- package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
- package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
- package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
- package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
- package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
- package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
- package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
- package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
- package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
- package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
- package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
- package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
- package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
- package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
- package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
- package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
- package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
- package/lib/document-processing/tests/base-document-processing.test.js +114 -8
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
- package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
- package/lib/document-processing/tests/chunking-config.test.js +238 -0
- package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
- package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
- package/lib/framework/agents/base-agent.js +1 -1
- package/lib/framework/agents/batch-agent.js +1 -1
- package/lib/framework/agents/default-agent-config.js +1 -1
- package/lib/framework/bedrock/bedrock.js +1 -1
- package/lib/framework/custom-resource/default-runtimes.js +1 -1
- package/lib/framework/foundation/access-log.js +1 -1
- package/lib/framework/foundation/eventbridge-broker.js +1 -1
- package/lib/framework/foundation/network.js +1 -1
- package/lib/framework/tests/access-log.test.js +5 -2
- package/lib/framework/tests/batch-agent.test.js +5 -2
- package/lib/framework/tests/bedrock.test.js +5 -2
- package/lib/framework/tests/eventbridge-broker.test.js +5 -2
- package/lib/framework/tests/framework-nag.test.js +16 -8
- package/lib/framework/tests/network.test.js +9 -4
- package/lib/tsconfig.tsbuildinfo +1 -1
- package/lib/utilities/data-loader.js +1 -1
- package/lib/utilities/lambda-iam-utils.js +1 -1
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
- package/lib/utilities/observability/default-observability-config.js +1 -1
- package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
- package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
- package/lib/utilities/observability/powertools-config.d.ts +10 -1
- package/lib/utilities/observability/powertools-config.js +19 -3
- package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
- package/lib/utilities/test-utils.d.ts +43 -0
- package/lib/utilities/test-utils.js +56 -0
- package/lib/utilities/tests/data-loader-nag.test.js +3 -2
- package/lib/utilities/tests/data-loader.test.js +3 -2
- package/lib/webapp/frontend-construct.js +1 -1
- package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
- package/lib/webapp/tests/frontend-construct.test.js +3 -2
- package/package.json +6 -5
- package/lib/document-processing/resources/default-error-handler/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unit tests for token estimation module.
|
|
3
|
+
|
|
4
|
+
Tests cover various text densities, edge cases, and accuracy verification.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import unittest
|
|
8
|
+
from unittest.mock import Mock, patch, MagicMock
|
|
9
|
+
from io import BytesIO
|
|
10
|
+
from token_estimation import estimate_tokens_fast, analyze_pdf_tokens
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TestEstimateTokensFast(unittest.TestCase):
    """Behavioural checks for estimate_tokens_fast.

    Expected values follow the estimator's rule: count regex word matches,
    multiply by 1.3, and truncate to an integer.
    """

    @staticmethod
    def _filler(word_count):
        """Return "word" repeated word_count times, separated by single spaces."""
        return " ".join(["word"] * word_count)

    def test_empty_text(self):
        """An empty string produces zero tokens."""
        self.assertEqual(estimate_tokens_fast(""), 0)

    def test_none_text(self):
        """None is treated as empty input and produces zero tokens."""
        self.assertEqual(estimate_tokens_fast(None), 0)

    def test_simple_sentence(self):
        """Two words: 2 * 1.3 = 2.6, truncated to 2."""
        estimate = estimate_tokens_fast("Hello world")
        self.assertEqual(estimate, 2)

    def test_medium_density_text(self):
        """A typical 1000-word page falls in the medium-density band."""
        estimate = estimate_tokens_fast(self._filler(1000))
        # 1000 words * 1.3 = 1300 tokens
        self.assertEqual(estimate, 1300)
        # The exact value must also sit inside the medium-density range.
        self.assertGreaterEqual(estimate, 1200)
        self.assertLessEqual(estimate, 1500)

    def test_high_density_text(self):
        """A very dense 8000-word page exceeds 10,000 tokens."""
        estimate = estimate_tokens_fast(self._filler(8000))
        # 8000 words * 1.3 = 10400 tokens
        self.assertEqual(estimate, 10400)
        self.assertGreater(estimate, 10000)

    def test_low_density_text(self):
        """A sparse 100-word page stays well under 200 tokens."""
        estimate = estimate_tokens_fast(self._filler(100))
        # 100 words * 1.3 = 130 tokens
        self.assertEqual(estimate, 130)
        self.assertLess(estimate, 200)

    def test_text_with_punctuation(self):
        """Punctuation separates words but is never counted itself."""
        sample = "Hello, world! How are you? I'm fine, thanks."
        # Hello, world, How, are, you, I, m, fine, thanks -> 9 words
        # 9 * 1.3 = 11.7 -> 11 tokens
        self.assertEqual(estimate_tokens_fast(sample), 11)

    def test_text_with_numbers(self):
        """Digit runs count as words just like alphabetic runs."""
        sample = "The year 2024 has 365 days and 12 months"
        # The, year, 2024, has, 365, days, and, 12, months -> 9 words
        # 9 * 1.3 = 11.7 -> 11 tokens
        self.assertEqual(estimate_tokens_fast(sample), 11)

    def test_text_with_special_characters(self):
        """Symbols such as @, ., + and - split text into separate words."""
        sample = "Email: user@example.com, Phone: +1-555-0123"
        # Email, user, example, com, Phone, 1, 555, 0123 -> 8 words
        # 8 * 1.3 = 10.4 -> 10 tokens
        self.assertEqual(estimate_tokens_fast(sample), 10)

    def test_multiline_text(self):
        """Line breaks act as ordinary word separators."""
        sample = """Line one with some words.
        Line two with more words.
        Line three with even more words."""
        # 5 + 5 + 6 = 16 words; 16 * 1.3 = 20.8 -> 20 tokens
        self.assertEqual(estimate_tokens_fast(sample), 20)

    def test_estimation_accuracy_range(self):
        """The estimate stays within 85% of an assumed real token count."""
        sentence = "The quick brown fox jumps over the lazy dog. "
        estimate = estimate_tokens_fast(sentence * 100)

        # 9 words per sentence * 100 sentences = 900 words -> 1170 tokens
        self.assertEqual(estimate, 1170)

        # A real tokenizer is assumed to give roughly 1000-1100 tokens here, so
        # the heuristic overestimates slightly, which is the safe direction
        # for chunking decisions.
        assumed_actual = 1050
        smaller, larger = sorted((estimate, assumed_actual))
        self.assertGreaterEqual(smaller / larger, 0.85)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class TestAnalyzePdfTokens(unittest.TestCase):
    """Test cases for the analyze_pdf_tokens function.

    The boto3 S3 client and PyPDF2.PdfReader are patched in every test that
    reads a document, so no network access or real PDF parsing takes place.
    """

    # analyze_pdf_tokens checks for the "%PDF-" magic bytes before it builds a
    # PdfReader. The fake S3 payload therefore has to start with that header;
    # a payload such as b'fake pdf' is rejected up front and the mocked reader
    # would never be reached.
    FAKE_PDF_BYTES = b'%PDF-1.4 fake pdf'

    def _stub_s3_object(self, mock_boto_client, payload=None):
        """Make boto3.client(...) return an S3 mock whose get_object serves payload.

        When payload is None the valid-header FAKE_PDF_BYTES is served.
        """
        mock_s3 = Mock()
        mock_boto_client.return_value = mock_s3
        body = self.FAKE_PDF_BYTES if payload is None else payload
        mock_s3.get_object.return_value = {'Body': BytesIO(body)}
        return mock_s3

    @staticmethod
    def _stub_pdf_pages(mock_pdf_reader, words_per_page):
        """Make PdfReader(...) return a reader with one mocked page per entry.

        Each entry of words_per_page is the number of "word" tokens returned by
        that page's extract_text(); an entry of 0 yields an empty string.
        """
        mock_pages = []
        for word_count in words_per_page:
            mock_page = Mock()
            mock_page.extract_text.return_value = " ".join(["word"] * word_count)
            mock_pages.append(mock_page)

        mock_reader = Mock()
        mock_reader.pages = mock_pages
        mock_pdf_reader.return_value = mock_reader
        return mock_reader

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_small_pdf_no_chunking(self, mock_pdf_reader, mock_boto_client):
        """Test with small PDF that doesn't require chunking."""
        self._stub_s3_object(mock_boto_client)
        # 30 pages of 1000 words each -> 1300 estimated tokens per page
        self._stub_pdf_pages(mock_pdf_reader, [1000] * 30)

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 30)
        self.assertEqual(result['total_tokens'], 39000)  # 30 * 1300
        self.assertEqual(result['avg_tokens_per_page'], 1300)
        self.assertFalse(result['requires_chunking'])  # Below 100 page threshold
        self.assertEqual(result['strategy'], 'hybrid')
        self.assertEqual(result['estimation_method'], 'word-based')

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_large_pdf_requires_chunking_pages(self, mock_pdf_reader, mock_boto_client):
        """Test with large PDF that requires chunking (page threshold)."""
        self._stub_s3_object(mock_boto_client)
        self._stub_pdf_pages(mock_pdf_reader, [1000] * 150)

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 150)
        self.assertTrue(result['requires_chunking'])  # Above 100 page threshold

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_large_pdf_requires_chunking_tokens(self, mock_pdf_reader, mock_boto_client):
        """Test with PDF that requires chunking (token threshold)."""
        self._stub_s3_object(mock_boto_client)
        # 80 pages of 4000 words each -> 5200 estimated tokens per page
        self._stub_pdf_pages(mock_pdf_reader, [4000] * 80)

        config = {
            'chunkingStrategy': 'token-based',
            'tokenThreshold': 150000
        }
        result = analyze_pdf_tokens('test-bucket', 'test.pdf', config)

        self.assertEqual(result['total_pages'], 80)
        self.assertEqual(result['total_tokens'], 416000)  # 80 * 5200
        self.assertTrue(result['requires_chunking'])  # Above 150000 token threshold
        self.assertEqual(result['strategy'], 'token-based')

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_fixed_pages_strategy(self, mock_pdf_reader, mock_boto_client):
        """Test with fixed-pages strategy."""
        self._stub_s3_object(mock_boto_client)
        self._stub_pdf_pages(mock_pdf_reader, [1000] * 50)

        config = {
            'chunkingStrategy': 'fixed-pages',
            'pageThreshold': 100
        }
        result = analyze_pdf_tokens('test-bucket', 'test.pdf', config)

        self.assertFalse(result['requires_chunking'])  # Below page threshold
        self.assertEqual(result['strategy'], 'fixed-pages')

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_empty_pages(self, mock_pdf_reader, mock_boto_client):
        """Test with PDF containing empty pages (0 tokens)."""
        self._stub_s3_object(mock_boto_client)
        # Even-indexed pages are empty, odd-indexed pages carry 1000 words.
        self._stub_pdf_pages(
            mock_pdf_reader,
            [0 if index % 2 == 0 else 1000 for index in range(10)]
        )

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 10)
        # 5 empty pages (0 tokens) + 5 content pages (1300 tokens each) = 6500 tokens
        self.assertEqual(result['total_tokens'], 6500)
        self.assertEqual(len(result['tokens_per_page']), 10)
        # Check that empty pages have 0 tokens
        self.assertEqual(result['tokens_per_page'][0], 0)
        self.assertEqual(result['tokens_per_page'][2], 0)

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_variable_density_pages(self, mock_pdf_reader, mock_boto_client):
        """Test with PDF containing pages of varying density."""
        self._stub_s3_object(mock_boto_client)
        densities = [100, 500, 1000, 2000, 5000]  # words per page
        self._stub_pdf_pages(mock_pdf_reader, densities)

        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertEqual(result['total_pages'], 5)
        # Expected tokens: 130, 650, 1300, 2600, 6500 = 11180 total
        self.assertEqual(result['total_tokens'], 11180)
        self.assertEqual(result['tokens_per_page'][0], 130)
        self.assertEqual(result['tokens_per_page'][4], 6500)

    @patch('token_estimation.boto3.client')
    def test_s3_access_error(self, mock_boto_client):
        """Test error handling for S3 access denied."""
        from botocore.exceptions import ClientError

        # Mock S3 client to raise access denied error
        mock_s3 = Mock()
        mock_boto_client.return_value = mock_s3
        mock_s3.get_object.side_effect = ClientError(
            {'Error': {'Code': 'AccessDenied', 'Message': 'Access Denied'}},
            'GetObject'
        )

        # Verify error is raised with context
        with self.assertRaises(ClientError) as context:
            analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertIn('Failed to access S3 object', str(context.exception))

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_pdf_processing_error(self, mock_pdf_reader, mock_boto_client):
        """Test error handling for PDF processing failures."""
        # The payload has a valid header, so the failure must come from the reader.
        self._stub_s3_object(mock_boto_client)
        mock_pdf_reader.side_effect = Exception("Invalid PDF format")

        # Verify error is raised with context
        with self.assertRaises(Exception) as context:
            analyze_pdf_tokens('test-bucket', 'test.pdf')

        message = str(context.exception)
        self.assertIn('Failed to analyze PDF', message)
        # Proves the mocked reader error was reached and wrapped, rather than
        # the magic-byte validation short-circuiting the call.
        self.assertIn('Invalid PDF format', message)

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_non_pdf_payload_rejected(self, mock_pdf_reader, mock_boto_client):
        """Test that a payload without the %PDF- header is rejected before parsing."""
        self._stub_s3_object(mock_boto_client, payload=b'fake pdf')

        with self.assertRaises(Exception) as context:
            analyze_pdf_tokens('test-bucket', 'test.pdf')

        self.assertIn('is not a valid PDF', str(context.exception))
        mock_pdf_reader.assert_not_called()

    @patch('token_estimation.boto3.client')
    @patch('PyPDF2.PdfReader')
    def test_default_config_values(self, mock_pdf_reader, mock_boto_client):
        """Test that default configuration values are applied."""
        self._stub_s3_object(mock_boto_client)
        self._stub_pdf_pages(mock_pdf_reader, [1000] * 50)

        # Analyze PDF without config (should use defaults)
        result = analyze_pdf_tokens('test-bucket', 'test.pdf')

        # Verify default strategy is used
        self.assertEqual(result['strategy'], 'hybrid')
        # With 50 pages and 65000 tokens, should not require chunking
        # (below 100 page threshold and 150000 token threshold)
        self.assertFalse(result['requires_chunking'])
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
if __name__ == '__main__':
|
|
374
|
+
unittest.main()
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Token estimation module for PDF chunking.
|
|
3
|
+
|
|
4
|
+
This module provides fast token estimation using word-based heuristics
|
|
5
|
+
to determine if PDFs require chunking before processing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
import io
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Dict, List, Optional
|
|
12
|
+
import boto3
|
|
13
|
+
from botocore.exceptions import ClientError
|
|
14
|
+
|
|
15
|
+
# Import strategy selection module
|
|
16
|
+
from strategy_selection import select_strategy_and_check_thresholds
|
|
17
|
+
|
|
18
|
+
# Configure module logger
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def estimate_tokens_fast(text: Optional[str]) -> int:
    """
    Fast token estimation using a word-count heuristic.

    Words are counted as regex ``\\w+`` runs delimited by word boundaries, and
    the count is scaled by 1.3 tokens per word, truncated to an integer. The
    multiplier is intended as a conservative allowance for multi-token words,
    punctuation, and special characters; it targets roughly 85-90% accuracy
    on English text without running a real tokenizer.

    Args:
        text: The text to estimate tokens for. ``None`` and the empty string
            are both accepted and produce 0.

    Returns:
        Estimated token count as an integer (floor of ``word_count * 1.3``).

    Examples:
        >>> estimate_tokens_fast("Hello world")
        2
        >>> estimate_tokens_fast("The quick brown fox jumps over the lazy dog")
        11
        >>> estimate_tokens_fast("")
        0
    """
    if not text:
        return 0

    # \b\w+\b matches maximal runs of word characters, so punctuation and
    # symbols only act as separators and are never counted themselves.
    word_count = len(re.findall(r'\b\w+\b', text))

    # 1.3 tokens per word, computed as 13/10 in integer arithmetic so the
    # floor is exact and does not depend on floating-point rounding.
    return (word_count * 13) // 10
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def analyze_pdf_tokens(
    bucket: str,
    key: str,
    config: Optional[Dict] = None
) -> Dict:
    """
    Analyze the per-page token distribution of a PDF stored in S3.

    The object is fetched with a single GetObject call and read fully into
    memory (it is not streamed page by page). After a magic-byte check, text
    is extracted from every page, tokens are estimated per page with
    estimate_tokens_fast, and select_strategy_and_check_thresholds decides
    whether chunking is required.

    Args:
        bucket: S3 bucket name containing the PDF
        key: S3 object key for the PDF
        config: Configuration dictionary with optional keys:
            - chunkingStrategy: 'fixed-pages', 'token-based', or 'hybrid' (default: 'hybrid')
            - pageThreshold: Maximum pages before chunking (default: 100)
            - tokenThreshold: Maximum tokens before chunking (default: 150000)
            The dictionary is passed through unchanged to
            select_strategy_and_check_thresholds, which applies the defaults.

    Returns:
        Dictionary containing:
            - total_tokens: Total estimated tokens in the document
            - total_pages: Total number of pages
            - tokens_per_page: List of token counts for each page
            - avg_tokens_per_page: Average tokens per page (0 for a PDF with no pages)
            - requires_chunking: Boolean indicating if chunking is needed
            - strategy: The strategy used for the decision
            - estimation_method: Always 'word-based'
            - selection_reason: Reason reported by the strategy selection
            - page_threshold_exceeded: Page-threshold flag from the strategy selection
            - token_threshold_exceeded: Token-threshold flag from the strategy selection

    Raises:
        ImportError: If PyPDF2 is not installed
        ClientError: If S3 access fails; the message is prefixed with
            "Failed to access S3 object s3://<bucket>/<key>"
        Exception: If the object is not a PDF or PDF processing fails; the
            message is prefixed with "Failed to analyze PDF s3://<bucket>/<key>"

    Examples:
        >>> result = analyze_pdf_tokens('my-bucket', 'docs/file.pdf')
        >>> print(result['total_pages'])
        150
        >>> print(result['requires_chunking'])
        True
    """
    # Import PyPDF2 lazily so merely importing this module does not require it.
    try:
        import PyPDF2
    except ImportError as import_error:
        raise ImportError(
            "PyPDF2 is required for PDF processing. "
            "Install it with: pip install PyPDF2"
        ) from import_error

    # Set default configuration
    if config is None:
        config = {}

    # Initialize S3 client
    s3 = boto3.client('s3')

    try:
        # Download the whole object; PdfReader needs a seekable buffer, so the
        # full PDF is held in memory for the duration of the analysis.
        pdf_obj = s3.get_object(Bucket=bucket, Key=key)
        pdf_bytes = pdf_obj['Body'].read()

        # Validate file is actually a PDF by checking magic bytes
        if len(pdf_bytes) < 5 or pdf_bytes[:5] != b'%PDF-':
            raise Exception(
                f"File s3://{bucket}/{key} is not a valid PDF. "
                "File must start with PDF magic bytes (%PDF-)."
            )

        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))

        # Estimate tokens page by page, preserving page order.
        tokens_per_page = [
            estimate_tokens_fast(page.extract_text())
            for page in pdf_reader.pages
        ]
        total_tokens = sum(tokens_per_page)
        total_pages = len(tokens_per_page)

        # Guard the division so a PDF with no pages reports an average of 0.
        avg_tokens_per_page = total_tokens / total_pages if total_pages > 0 else 0

        # Use strategy selection module to determine if chunking is required
        selection_result = select_strategy_and_check_thresholds(
            total_pages=total_pages,
            total_tokens=total_tokens,
            config=config
        )

        return {
            'total_tokens': total_tokens,
            'total_pages': total_pages,
            'tokens_per_page': tokens_per_page,
            'avg_tokens_per_page': avg_tokens_per_page,
            'requires_chunking': selection_result.requires_chunking,
            'strategy': selection_result.strategy,
            'estimation_method': 'word-based',
            'selection_reason': selection_result.reason,
            'page_threshold_exceeded': selection_result.page_threshold_exceeded,
            'token_threshold_exceeded': selection_result.token_threshold_exceeded
        }

    except ClientError as e:
        # Re-raise S3 errors with bucket/key context. The original response is
        # copied so metadata such as the request id and HTTP status survives,
        # and a missing 'Error'/'Code' entry cannot raise KeyError from here.
        error_details = e.response.get('Error', {})
        enriched_response = dict(e.response)
        enriched_response['Error'] = {
            **error_details,
            'Code': error_details.get('Code', 'Unknown'),
            'Message': f"Failed to access S3 object s3://{bucket}/{key}: {str(e)}"
        }
        raise ClientError(enriched_response, 'GetObject') from e
    except Exception as e:
        # Wrap other errors (including the magic-byte rejection above) with context
        raise Exception(
            f"Failed to analyze PDF s3://{bucket}/{key}: {str(e)}"
        ) from e
|
|
@@ -6,10 +6,11 @@ const aws_s3_1 = require("aws-cdk-lib/aws-s3");
|
|
|
6
6
|
const aws_s3_assets_1 = require("aws-cdk-lib/aws-s3-assets");
|
|
7
7
|
const cdk_nag_1 = require("cdk-nag");
|
|
8
8
|
const framework_1 = require("../../framework");
|
|
9
|
+
const test_utils_1 = require("../../utilities/test-utils");
|
|
9
10
|
const adapter_1 = require("../adapter");
|
|
10
11
|
const agentic_document_processing_1 = require("../agentic-document-processing");
|
|
11
|
-
// Create app and stack
|
|
12
|
-
const app =
|
|
12
|
+
// Create app and stack with bundling disabled for faster tests
|
|
13
|
+
const app = (0, test_utils_1.createTestApp)();
|
|
13
14
|
const stack = new aws_cdk_lib_1.Stack(app, 'TestStack', {
|
|
14
15
|
env: {
|
|
15
16
|
account: '123456789012',
|
|
@@ -109,4 +110,4 @@ test('No unsuppressed errors', () => {
|
|
|
109
110
|
}
|
|
110
111
|
expect(errors).toHaveLength(0);
|
|
111
112
|
});
|
|
112
|
-
//# sourceMappingURL=data:application/json;base64,
|
|
113
|
+
//# sourceMappingURL=data:application/json;base64,
|