@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/.jsii +2537 -204
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.js +1 -1
  66. package/lib/framework/tests/access-log.test.js +5 -2
  67. package/lib/framework/tests/batch-agent.test.js +5 -2
  68. package/lib/framework/tests/bedrock.test.js +5 -2
  69. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  70. package/lib/framework/tests/framework-nag.test.js +16 -8
  71. package/lib/framework/tests/network.test.js +9 -4
  72. package/lib/tsconfig.tsbuildinfo +1 -1
  73. package/lib/utilities/data-loader.js +1 -1
  74. package/lib/utilities/lambda-iam-utils.js +1 -1
  75. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  76. package/lib/utilities/observability/default-observability-config.js +1 -1
  77. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  78. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  79. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  80. package/lib/utilities/observability/powertools-config.js +19 -3
  81. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  82. package/lib/utilities/test-utils.d.ts +43 -0
  83. package/lib/utilities/test-utils.js +56 -0
  84. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  85. package/lib/utilities/tests/data-loader.test.js +3 -2
  86. package/lib/webapp/frontend-construct.js +1 -1
  87. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  88. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  89. package/package.json +6 -5
  90. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  91. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py
@@ -0,0 +1,460 @@
+"""
+Chunking strategies for splitting large PDFs into manageable chunks.
+
+This module implements three chunking strategies:
+1. Fixed-pages: Simple page-based chunking (legacy)
+2. Token-based: Token-aware chunking that respects model limits
+3. Hybrid: Best of both worlds - targets token count but respects page limits (RECOMMENDED)
+"""
+
+import math
+from typing import List, Dict, Any, Optional
+
+
+class ConfigurationError(Exception):
+    """Raised when chunking configuration is invalid."""
+    pass
+
+
+def calculate_chunks_fixed_pages(
+    total_pages: int,
+    chunk_size: int,
+    overlap_pages: int = 0
+) -> List[Dict[str, Any]]:
+    """
+    Create chunks based on fixed page count (legacy approach).
+
+    This is the simplest chunking strategy that splits documents into
+    fixed-size page chunks. It's fast and predictable but doesn't account
+    for token density, which can lead to chunks exceeding model token limits.
+
+    Args:
+        total_pages: Total number of pages in the document
+        chunk_size: Number of pages per chunk
+        overlap_pages: Number of overlapping pages between consecutive chunks
+
+    Returns:
+        List of chunk metadata dictionaries with:
+        - chunk_index: Index of the chunk (0-based)
+        - start_page: Starting page number (0-based)
+        - end_page: Ending page number (0-based, inclusive)
+        - page_count: Number of pages in the chunk
+
+    Example:
+        >>> chunks = calculate_chunks_fixed_pages(150, 50, 5)
+        >>> len(chunks)
+        4
+        >>> chunks[0]
+        {'chunk_index': 0, 'start_page': 0, 'end_page': 49, 'page_count': 50}
+    """
+    chunks = []
+    current_page = 0
+    chunk_index = 0
+
+    while current_page < total_pages:
+        # Calculate overlap for chunks after the first
+        start_page = max(0, current_page - overlap_pages) if chunk_index > 0 else 0
+        end_page = min(current_page + chunk_size, total_pages) - 1
+
+        chunks.append({
+            'chunk_index': chunk_index,
+            'start_page': start_page,
+            'end_page': end_page,
+            'page_count': end_page - start_page + 1
+        })
+
+        current_page = end_page + 1
+        chunk_index += 1
+
+    return chunks
+
+
+def calculate_chunks_token_based(
+    tokens_per_page: List[int],
+    max_tokens_per_chunk: int,
+    overlap_tokens: int = 0
+) -> List[Dict[str, Any]]:
+    """
+    Create chunks based on token count instead of fixed pages.
+
+    This strategy ensures no chunk exceeds the token limit by analyzing
+    token density per page. It's ideal for documents with variable content
+    density (e.g., mix of text-heavy and image-heavy pages).
+
+    Args:
+        tokens_per_page: List of token counts for each page
+        max_tokens_per_chunk: Maximum tokens allowed per chunk
+        overlap_tokens: Target number of overlapping tokens between chunks
+
+    Returns:
+        List of chunk metadata dictionaries with:
+        - chunk_index: Index of the chunk (0-based)
+        - start_page: Starting page number (0-based)
+        - end_page: Ending page number (0-based, inclusive)
+        - page_count: Number of pages in the chunk
+        - token_count: Estimated tokens in the chunk
+
+    Example:
+        >>> tokens = [1500] * 100  # 100 pages, 1500 tokens each
+        >>> chunks = calculate_chunks_token_based(tokens, 100000, 5000)
+        >>> all(c['token_count'] <= 100000 for c in chunks)
+        True
+    """
+    chunks = []
+    current_chunk_start = 0
+    current_chunk_tokens = 0
+    chunk_index = 0
+
+    for page_num, page_tokens in enumerate(tokens_per_page):
+        # Check if adding this page would exceed the limit
+        if current_chunk_tokens + page_tokens > max_tokens_per_chunk and current_chunk_tokens > 0:
+            # Finalize current chunk
+            chunks.append({
+                'chunk_index': chunk_index,
+                'start_page': current_chunk_start,
+                'end_page': page_num - 1,
+                'page_count': page_num - current_chunk_start,
+                'token_count': current_chunk_tokens
+            })
+
+            # Calculate overlap: go back to find pages that sum to overlap_tokens
+            overlap_start_page = page_num - 1
+            overlap_accumulated = 0
+
+            while (overlap_start_page >= current_chunk_start and
+                   overlap_accumulated < overlap_tokens):
+                overlap_accumulated += tokens_per_page[overlap_start_page]
+                overlap_start_page -= 1
+
+            # Start new chunk with overlap
+            current_chunk_start = max(0, overlap_start_page + 1)
+            current_chunk_tokens = overlap_accumulated + page_tokens
+            chunk_index += 1
+        else:
+            # Add page to current chunk
+            current_chunk_tokens += page_tokens
+
+    # Add final chunk
+    if current_chunk_tokens > 0:
+        chunks.append({
+            'chunk_index': chunk_index,
+            'start_page': current_chunk_start,
+            'end_page': len(tokens_per_page) - 1,
+            'page_count': len(tokens_per_page) - current_chunk_start,
+            'token_count': current_chunk_tokens
+        })
+
+    return chunks
+
+
+
+def calculate_chunks_hybrid(
+    tokens_per_page: List[int],
+    target_tokens: int = 80000,
+    max_pages: int = 99,
+    overlap_tokens: int = 5000
+) -> List[Dict[str, Any]]:
+    """
+    Hybrid chunking: target token count but respect page limits (RECOMMENDED).
+
+    This is the recommended strategy for production use. It combines the benefits
+    of both fixed-pages and token-based strategies:
+    - Targets an optimal token count per chunk (soft limit)
+    - Enforces a maximum page count per chunk (hard limit)
+    - Maintains token-based overlap for context preservation
+
+    The strategy creates balanced chunks that respect both token and page constraints,
+    making it suitable for documents with variable content density.
+
+    IMPORTANT: The default max_pages is 99 (not 100) because Bedrock has a hard limit
+    of 100 pages per PDF. Using 99 provides a safety margin.
+
+    Args:
+        tokens_per_page: List of token counts for each page
+        target_tokens: Target tokens per chunk (soft limit, default: 80000)
+        max_pages: Maximum pages per chunk (hard limit, default: 99)
+        overlap_tokens: Target overlapping tokens between chunks (default: 5000)
+
+    Returns:
+        List of chunk metadata dictionaries with:
+        - chunk_index: Index of the chunk (0-based)
+        - start_page: Starting page number (0-based)
+        - end_page: Ending page number (0-based, inclusive)
+        - page_count: Number of pages in the chunk
+        - token_count: Estimated tokens in the chunk
+        - finalize_reason: Why the chunk was finalized ('token_limit', 'page_limit', or 'final_chunk')
+
+    Example:
+        >>> tokens = [2000] * 200  # 200 pages, 2000 tokens each
+        >>> chunks = calculate_chunks_hybrid(tokens, 80000, 99, 5000)
+        >>> all(c['page_count'] <= 99 for c in chunks)
+        True
+        >>> all(c['token_count'] <= 100000 for c in chunks)  # Some tolerance for overlap
+        True
+    """
+    chunks = []
+    current_chunk_start = 0
+    current_chunk_tokens = 0
+    current_chunk_pages = 0
+    chunk_index = 0
+
+    for page_num, page_tokens in enumerate(tokens_per_page):
+        # Check if we should finalize this chunk
+        # Note: We use >= for max_pages to ensure we stay UNDER the limit
+        # This is critical because Bedrock has a hard 100-page limit for PDF processing
+        should_finalize = (
+            # Reached target tokens (soft limit)
+            (current_chunk_tokens + page_tokens > target_tokens and current_chunk_tokens > 0) or
+            # Would exceed max pages (hard limit) - finalize BEFORE adding this page
+            (current_chunk_pages >= max_pages and current_chunk_pages > 0)
+        )
+
+        if should_finalize:
+            # Determine finalize reason
+            finalize_reason = 'page_limit' if current_chunk_pages >= max_pages else 'token_limit'
+
+            # Finalize current chunk
+            chunks.append({
+                'chunk_index': chunk_index,
+                'start_page': current_chunk_start,
+                'end_page': page_num - 1,
+                'page_count': current_chunk_pages,
+                'token_count': current_chunk_tokens,
+                'finalize_reason': finalize_reason
+            })
+
+            # Calculate overlap: go back to find pages that sum to overlap_tokens
+            overlap_start_page = page_num - 1
+            overlap_accumulated = 0
+            overlap_pages = 0
+
+            while (overlap_start_page >= current_chunk_start and
+                   overlap_accumulated < overlap_tokens and
+                   overlap_pages < 10):  # Max 10 pages overlap
+                overlap_accumulated += tokens_per_page[overlap_start_page]
+                overlap_pages += 1
+                overlap_start_page -= 1
+
+            # Start new chunk with overlap
+            current_chunk_start = max(0, overlap_start_page + 1)
+            current_chunk_tokens = overlap_accumulated + page_tokens
+            current_chunk_pages = overlap_pages + 1
+            chunk_index += 1
+        else:
+            # Add page to current chunk
+            current_chunk_tokens += page_tokens
+            current_chunk_pages += 1
+
+    # Add final chunk - but check if it exceeds max_pages
+    if current_chunk_tokens > 0:
+        # If final chunk exceeds max_pages, we need to split it
+        while current_chunk_pages > max_pages:
+            # Calculate where to split (at max_pages from current_chunk_start)
+            split_end_page = current_chunk_start + max_pages - 1
+            split_tokens = sum(tokens_per_page[current_chunk_start:split_end_page + 1])
+
+            chunks.append({
+                'chunk_index': chunk_index,
+                'start_page': current_chunk_start,
+                'end_page': split_end_page,
+                'page_count': max_pages,
+                'token_count': split_tokens,
+                'finalize_reason': 'page_limit'
+            })
+
+            # Calculate overlap for next chunk
+            overlap_start_page = split_end_page
+            overlap_accumulated = 0
+            overlap_pages = 0
+
+            while (overlap_start_page >= current_chunk_start and
+                   overlap_accumulated < overlap_tokens and
+                   overlap_pages < 10):
+                overlap_accumulated += tokens_per_page[overlap_start_page]
+                overlap_pages += 1
+                overlap_start_page -= 1
+
+            # Update for next iteration
+            current_chunk_start = max(0, overlap_start_page + 1)
+            current_chunk_pages = len(tokens_per_page) - current_chunk_start
+            current_chunk_tokens = sum(tokens_per_page[current_chunk_start:])
+            chunk_index += 1
+
+        # Add the final chunk (now guaranteed to be <= max_pages)
+        chunks.append({
+            'chunk_index': chunk_index,
+            'start_page': current_chunk_start,
+            'end_page': len(tokens_per_page) - 1,
+            'page_count': current_chunk_pages,
+            'token_count': current_chunk_tokens,
+            'finalize_reason': 'final_chunk'
+        })
+
+    return chunks
+
+
+def validate_configuration(config: Dict[str, Any]) -> bool:
+    """
+    Validate chunking configuration parameters.
+
+    Validates that configuration parameters meet the following constraints:
+    - chunk_size > 0 (positive chunk size)
+    - overlap >= 0 (non-negative overlap)
+    - overlap < chunk_size (overlap must be less than chunk size)
+    - threshold > 0 (positive threshold)
+    - max_tokens_per_chunk > 0 (positive token limit)
+    - target_tokens > 0 (positive target tokens)
+    - max_pages > 0 (positive page limit)
+
+    Args:
+        config: Configuration dictionary with chunking parameters
+
+    Returns:
+        True if configuration is valid, False otherwise
+
+    Raises:
+        ConfigurationError: If configuration is invalid (when strict=True in config)
+
+    Example:
+        >>> validate_configuration({'chunkSize': 50, 'overlapPages': 5, 'pageThreshold': 100})
+        True
+        >>> validate_configuration({'chunkSize': -10, 'overlapPages': 5})
+        False
+        >>> validate_configuration({'chunkSize': 50, 'overlapPages': 60})
+        False
+    """
+    errors = []
+
+    # Validate chunk_size
+    chunk_size = config.get('chunkSize', config.get('chunk_size'))
+    if chunk_size is not None:
+        if not isinstance(chunk_size, (int, float)) or chunk_size <= 0:
+            errors.append(f"chunk_size must be positive, got {chunk_size}")
+
+    # Validate overlap
+    overlap = config.get('overlapPages', config.get('overlap_pages', config.get('overlap', 0)))
+    if overlap is not None:
+        if not isinstance(overlap, (int, float)) or overlap < 0:
+            errors.append(f"overlap must be non-negative, got {overlap}")
+
+    # Check overlap < chunk_size
+    if chunk_size is not None and overlap >= chunk_size:
+        errors.append(f"overlap ({overlap}) must be less than chunk_size ({chunk_size})")
+
+    # Validate threshold
+    threshold = config.get('pageThreshold', config.get('page_threshold', config.get('threshold')))
+    if threshold is not None:
+        if not isinstance(threshold, (int, float)) or threshold <= 0:
+            errors.append(f"threshold must be positive, got {threshold}")
+
+    # Validate token threshold
+    token_threshold = config.get('tokenThreshold', config.get('token_threshold'))
+    if token_threshold is not None:
+        if not isinstance(token_threshold, (int, float)) or token_threshold <= 0:
+            errors.append(f"token_threshold must be positive, got {token_threshold}")
+
+    # Validate max_tokens_per_chunk
+    max_tokens = config.get('maxTokensPerChunk', config.get('max_tokens_per_chunk'))
+    if max_tokens is not None:
+        if not isinstance(max_tokens, (int, float)) or max_tokens <= 0:
+            errors.append(f"max_tokens_per_chunk must be positive, got {max_tokens}")
+
+    # Validate target_tokens
+    target_tokens = config.get('targetTokensPerChunk', config.get('target_tokens'))
+    if target_tokens is not None:
+        if not isinstance(target_tokens, (int, float)) or target_tokens <= 0:
+            errors.append(f"target_tokens must be positive, got {target_tokens}")
+
+    # Validate max_pages
+    max_pages = config.get('maxPagesPerChunk', config.get('max_pages'))
+    if max_pages is not None:
+        if not isinstance(max_pages, (int, float)) or max_pages <= 0:
+            errors.append(f"max_pages must be positive, got {max_pages}")
+
+    # Validate overlap_tokens
+    overlap_tokens = config.get('overlapTokens', config.get('overlap_tokens'))
+    if overlap_tokens is not None:
+        if not isinstance(overlap_tokens, (int, float)) or overlap_tokens < 0:
+            errors.append(f"overlap_tokens must be non-negative, got {overlap_tokens}")
+
+    # If strict mode, raise exception
+    if errors and config.get('strict', False):
+        raise ConfigurationError("; ".join(errors))
+
+    return len(errors) == 0
+
+
+
+def validate_fixed_pages_config(
+    chunk_size: int,
+    overlap_pages: int,
+    threshold: int
+) -> None:
+    """
+    Validate fixed-pages strategy configuration.
+
+    Args:
+        chunk_size: Number of pages per chunk
+        overlap_pages: Number of overlapping pages
+        threshold: Page count threshold for chunking
+
+    Raises:
+        ConfigurationError: If configuration is invalid
+    """
+    config = {
+        'chunkSize': chunk_size,
+        'overlapPages': overlap_pages,
+        'pageThreshold': threshold,
+        'strict': True
+    }
+    validate_configuration(config)
+
+
+def validate_token_based_config(
+    max_tokens_per_chunk: int,
+    overlap_tokens: int,
+    threshold: int
+) -> None:
+    """
+    Validate token-based strategy configuration.
+
+    Args:
+        max_tokens_per_chunk: Maximum tokens per chunk
+        overlap_tokens: Overlapping tokens between chunks
+        threshold: Token count threshold for chunking
+
+    Raises:
+        ConfigurationError: If configuration is invalid
+    """
+    config = {
+        'maxTokensPerChunk': max_tokens_per_chunk,
+        'overlapTokens': overlap_tokens,
+        'tokenThreshold': threshold,
+        'strict': True
+    }
+    validate_configuration(config)
+
+
+def validate_hybrid_config(
+    target_tokens: int,
+    max_pages: int,
+    overlap_tokens: int
+) -> None:
+    """
+    Validate hybrid strategy configuration.
+
+    Args:
+        target_tokens: Target tokens per chunk
+        max_pages: Maximum pages per chunk
+        overlap_tokens: Overlapping tokens between chunks
+
+    Raises:
+        ConfigurationError: If configuration is invalid
+    """
+    config = {
+        'targetTokensPerChunk': target_tokens,
+        'maxPagesPerChunk': max_pages,
+        'overlapTokens': overlap_tokens,
+        'strict': True
+    }
+    validate_configuration(config)
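
The new chunking module above is plain Python with no AWS dependencies, so its behavior can be checked locally before deploying the updated construct. The following minimal sketch is not part of the package: it assumes the file has been saved next to the script as chunking_strategies.py, and the per-page token counts are invented purely for illustration (in the real pipeline they come from token_estimation.py).

# sketch.py - illustrative only; not shipped with the package
from chunking_strategies import (
    ConfigurationError,
    calculate_chunks_fixed_pages,
    calculate_chunks_hybrid,
    calculate_chunks_token_based,
    validate_hybrid_config,
)

# Hypothetical 250-page document: dense text up front, lighter pages at the end
tokens_per_page = [2500] * 120 + [800] * 130

# Legacy strategy: counts pages only and ignores token density
fixed = calculate_chunks_fixed_pages(total_pages=250, chunk_size=50, overlap_pages=5)

# Token-based strategy: every chunk stays under the per-chunk token limit
token_based = calculate_chunks_token_based(tokens_per_page, max_tokens_per_chunk=100000, overlap_tokens=5000)

# Hybrid strategy (recommended): soft token target plus the 99-page hard cap for Bedrock
try:
    validate_hybrid_config(target_tokens=80000, max_pages=99, overlap_tokens=5000)
except ConfigurationError as exc:
    raise SystemExit(f"invalid chunking config: {exc}")
hybrid = calculate_chunks_hybrid(tokens_per_page, target_tokens=80000, max_pages=99, overlap_tokens=5000)

print(f"fixed-pages: {len(fixed)} chunks, token-based: {len(token_based)} chunks, hybrid: {len(hybrid)} chunks")
for chunk in hybrid:
    print(chunk['chunk_index'], chunk['start_page'], chunk['end_page'], chunk['finalize_reason'])

Whether a given chunk is finalized on the token target or on the page cap depends entirely on the per-page token estimates, so results on real documents will differ from this toy profile.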