@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/.jsii +2537 -204
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.js +1 -1
  66. package/lib/framework/tests/access-log.test.js +5 -2
  67. package/lib/framework/tests/batch-agent.test.js +5 -2
  68. package/lib/framework/tests/bedrock.test.js +5 -2
  69. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  70. package/lib/framework/tests/framework-nag.test.js +16 -8
  71. package/lib/framework/tests/network.test.js +9 -4
  72. package/lib/tsconfig.tsbuildinfo +1 -1
  73. package/lib/utilities/data-loader.js +1 -1
  74. package/lib/utilities/lambda-iam-utils.js +1 -1
  75. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  76. package/lib/utilities/observability/default-observability-config.js +1 -1
  77. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  78. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  79. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  80. package/lib/utilities/observability/powertools-config.js +19 -3
  81. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  82. package/lib/utilities/test-utils.d.ts +43 -0
  83. package/lib/utilities/test-utils.js +56 -0
  84. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  85. package/lib/utilities/tests/data-loader.test.js +3 -2
  86. package/lib/webapp/frontend-construct.js +1 -1
  87. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  88. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  89. package/package.json +6 -5
  90. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  91. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
@@ -0,0 +1,435 @@
1
+ """
2
+ CloudWatch Metrics Module for PDF Chunking.
3
+
4
+ This module provides functions to emit CloudWatch metrics for PDF chunking operations
5
+ using AWS Lambda Powertools for efficient EMF (Embedded Metric Format) logging.
6
+
7
+ Metrics are only emitted when observability is enabled via the ENABLE_METRICS
8
+ environment variable (set to 'true'). This is controlled by the enableObservability
9
+ prop in the CDK construct.
10
+
11
+ Requirements: 7.4
12
+ """
13
+
14
+ import os
15
+ import time
16
+ import logging
17
+ from typing import Optional, List
18
+ from functools import wraps
19
+
20
+ from aws_lambda_powertools import Metrics
21
+ from aws_lambda_powertools.metrics import MetricUnit
22
+
23
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Metrics switch, evaluated once at import time. Only the value 'true'
# (compared case-insensitively) in ENABLE_METRICS turns emission on; the
# variable defaults to 'false' when unset. The CDK construct sets it when
# enableObservability is true.
METRICS_ENABLED = os.getenv('ENABLE_METRICS', 'false').lower() == 'true'

# Shared Powertools Metrics instance used by every emit_* helper below.
# Namespace and service are configured via environment variables:
# - POWERTOOLS_METRICS_NAMESPACE
# - POWERTOOLS_SERVICE_NAME
metrics = Metrics()
35
+
36
+
37
def _is_metrics_enabled() -> bool:
    """
    Report whether metric emission is switched on.

    Returns the module-level METRICS_ENABLED flag, which is derived from the
    ENABLE_METRICS environment variable when the module is imported (so a
    later change to the environment is not picked up here).

    Returns:
        True if metrics should be emitted, False otherwise
    """
    return METRICS_ENABLED
48
+
49
+
50
def emit_chunking_operation(
    strategy: str,
    requires_chunking: bool,
    document_id: Optional[str] = None
) -> None:
    """
    Record one chunking operation.

    Adds the "Strategy" and "RequiresChunking" dimensions plus a
    "ChunkingOperations" count of 1 to the module-level Powertools
    ``metrics`` instance. Does nothing unless observability is enabled
    (ENABLE_METRICS=true). Any exception raised while recording is logged
    as a warning and never propagated to the caller.

    NOTE(review): dimensions are added to the shared ``metrics`` instance,
    so they presumably attach to every metric flushed together with this
    one — confirm against Powertools semantics.

    Args:
        strategy: Chunking strategy used (fixed-pages, token-based, hybrid)
        requires_chunking: Whether chunking was required
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    if not _is_metrics_enabled():
        return

    try:
        metrics.add_dimension(name="Strategy", value=strategy)
        # The dimension value is the lower-cased string form of the flag.
        chunking_flag = str(requires_chunking).lower()
        metrics.add_dimension(name="RequiresChunking", value=chunking_flag)
        metrics.add_metric(name="ChunkingOperations", unit=MetricUnit.Count, value=1)

        message = (
            f"Emitted ChunkingOperations metric: strategy={strategy}, "
            f"requires_chunking={requires_chunking}"
        )
        log_fields = {
            'documentId': document_id,
            'strategy': strategy,
            'requiresChunking': requires_chunking
        }
        logger.debug(message, extra=log_fields)

    except Exception as exc:
        # Best-effort: a metrics failure must not break the caller.
        reason = str(exc)
        logger.warning(
            f"Failed to emit ChunkingOperations metric: {reason}",
            extra={'documentId': document_id, 'error': reason}
        )
91
+
92
+
93
def emit_chunk_count(
    chunk_count: int,
    strategy: str,
    document_id: Optional[str] = None
) -> None:
    """
    Record how many chunks were created for a document.

    Adds a "Strategy" dimension and a "ChunkCount" count metric to the
    module-level Powertools ``metrics`` instance. Does nothing unless
    observability is enabled (ENABLE_METRICS=true). Any exception raised
    while recording is logged as a warning and never propagated.

    Args:
        chunk_count: Number of chunks created
        strategy: Chunking strategy used
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    if not _is_metrics_enabled():
        return

    try:
        metrics.add_dimension(name="Strategy", value=strategy)
        metrics.add_metric(name="ChunkCount", unit=MetricUnit.Count, value=chunk_count)

        message = f"Emitted ChunkCount metric: count={chunk_count}, strategy={strategy}"
        log_fields = {
            'documentId': document_id,
            'chunkCount': chunk_count,
            'strategy': strategy
        }
        logger.debug(message, extra=log_fields)

    except Exception as exc:
        # Best-effort: a metrics failure must not break the caller.
        reason = str(exc)
        logger.warning(
            f"Failed to emit ChunkCount metric: {reason}",
            extra={'documentId': document_id, 'error': reason}
        )
132
+
133
+
134
def emit_tokens_per_chunk(
    tokens_per_chunk: List[int],
    strategy: str,
    document_id: Optional[str] = None
) -> None:
    """
    Emit TokensPerChunk metrics.

    Computes the average, 99th percentile and maximum of the per-chunk token
    counts and adds them to the module-level Powertools ``metrics`` instance
    as "TokensPerChunkAvg", "TokensPerChunkP99" and "TokensPerChunkMax",
    together with a "Strategy" dimension.

    The 99th percentile uses the nearest-rank method: the value at 1-based
    rank ceil(0.99 * n) of the sorted counts. For fewer than 100 chunks this
    is the maximum; for exactly 100 chunks it is the 99th smallest value.

    Does nothing when observability is disabled (ENABLE_METRICS is not
    'true') or when ``tokens_per_chunk`` is empty. Any exception raised while
    computing or recording is logged as a warning and never propagated.

    Args:
        tokens_per_chunk: List of token counts for each chunk
        strategy: Chunking strategy used
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    if not _is_metrics_enabled():
        return

    if not tokens_per_chunk:
        return

    try:
        avg_tokens = sum(tokens_per_chunk) / len(tokens_per_chunk)
        sorted_tokens = sorted(tokens_per_chunk)
        sample_count = len(sorted_tokens)

        # Nearest-rank p99: rank = ceil(99 * n / 100), done in integer
        # arithmetic to avoid float rounding. The rank is always within
        # 1..n, so no clamping is required.
        p99_rank = (99 * sample_count + 99) // 100
        p99_tokens = sorted_tokens[p99_rank - 1]

        # The list is already sorted, so the last element is the maximum.
        max_tokens = sorted_tokens[-1]

        metrics.add_dimension(name="Strategy", value=strategy)
        metrics.add_metric(name="TokensPerChunkAvg", unit=MetricUnit.Count, value=avg_tokens)
        metrics.add_metric(name="TokensPerChunkP99", unit=MetricUnit.Count, value=p99_tokens)
        metrics.add_metric(name="TokensPerChunkMax", unit=MetricUnit.Count, value=max_tokens)

        logger.debug(
            f"Emitted TokensPerChunk metrics: avg={avg_tokens:.0f}, p99={p99_tokens}, max={max_tokens}",
            extra={
                'documentId': document_id,
                'avgTokens': avg_tokens,
                'p99Tokens': p99_tokens,
                'maxTokens': max_tokens,
                'strategy': strategy
            }
        )

    except Exception as e:
        # Best-effort: a metrics failure must not break the caller.
        logger.warning(
            f"Failed to emit TokensPerChunk metric: {str(e)}",
            extra={'documentId': document_id, 'error': str(e)}
        )
186
+
187
+
188
def emit_chunk_processing_time(
    processing_time_ms: float,
    processing_mode: str,
    document_id: Optional[str] = None
) -> None:
    """
    Record how long the chunking operation took.

    Adds a "ProcessingMode" dimension and a "ChunkProcessingTime" metric
    (unit: milliseconds) to the module-level Powertools ``metrics``
    instance. Does nothing unless observability is enabled
    (ENABLE_METRICS=true). Any exception raised while recording is logged
    as a warning and never propagated.

    NOTE(review): dimensions are added to the shared ``metrics`` instance,
    so they presumably attach to every metric flushed together with this
    one — confirm against Powertools semantics.

    Args:
        processing_time_ms: Processing time in milliseconds
        processing_mode: Processing mode (sequential, parallel)
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    if not _is_metrics_enabled():
        return

    try:
        metrics.add_dimension(name="ProcessingMode", value=processing_mode)
        metrics.add_metric(
            name="ChunkProcessingTime",
            unit=MetricUnit.Milliseconds,
            value=processing_time_ms
        )

        message = (
            f"Emitted ChunkProcessingTime metric: time={processing_time_ms:.2f}ms, "
            f"mode={processing_mode}"
        )
        log_fields = {
            'documentId': document_id,
            'processingTimeMs': processing_time_ms,
            'processingMode': processing_mode
        }
        logger.debug(message, extra=log_fields)

    except Exception as exc:
        # Best-effort: a metrics failure must not break the caller.
        reason = str(exc)
        logger.warning(
            f"Failed to emit ChunkProcessingTime metric: {reason}",
            extra={'documentId': document_id, 'error': reason}
        )
228
+
229
+
230
def emit_chunk_failure_rate(
    total_chunks: int,
    failed_chunks: int,
    document_id: Optional[str] = None
) -> None:
    """
    Record the share of chunks that failed.

    Computes ``failed_chunks / total_chunks * 100`` and adds it as
    "ChunkFailureRate" (unit: percent), alongside the raw "FailedChunks" and
    "TotalChunks" counts, to the module-level Powertools ``metrics``
    instance. Does nothing when observability is disabled (ENABLE_METRICS
    is not 'true') or when ``total_chunks`` is 0. Any exception raised while
    recording is logged as a warning and never propagated.

    Args:
        total_chunks: Total number of chunks
        failed_chunks: Number of failed chunks
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    # A zero total would make the rate undefined, so nothing is emitted.
    if not _is_metrics_enabled() or total_chunks == 0:
        return

    try:
        failure_rate = failed_chunks / total_chunks * 100

        for metric_name, unit, value in (
            ("ChunkFailureRate", MetricUnit.Percent, failure_rate),
            ("FailedChunks", MetricUnit.Count, failed_chunks),
            ("TotalChunks", MetricUnit.Count, total_chunks),
        ):
            metrics.add_metric(name=metric_name, unit=unit, value=value)

        message = f"Emitted ChunkFailureRate metric: rate={failure_rate:.2f}%"
        log_fields = {
            'documentId': document_id,
            'failureRate': failure_rate,
            'totalChunks': total_chunks,
            'failedChunks': failed_chunks
        }
        logger.debug(message, extra=log_fields)

    except Exception as exc:
        # Best-effort: a metrics failure must not break the caller.
        reason = str(exc)
        logger.warning(
            f"Failed to emit ChunkFailureRate metric: {reason}",
            extra={'documentId': document_id, 'error': reason}
        )
276
+
277
+
278
def emit_aggregation_time(
    aggregation_time_ms: float,
    document_id: Optional[str] = None
) -> None:
    """
    Record the time taken to aggregate chunk results.

    Adds an "AggregationTime" metric (unit: milliseconds) to the
    module-level Powertools ``metrics`` instance; no dimension is added
    here. Does nothing unless observability is enabled
    (ENABLE_METRICS=true). Any exception raised while recording is logged
    as a warning and never propagated.

    Args:
        aggregation_time_ms: Aggregation time in milliseconds
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    if not _is_metrics_enabled():
        return

    try:
        metrics.add_metric(
            name="AggregationTime",
            unit=MetricUnit.Milliseconds,
            value=aggregation_time_ms
        )

        message = f"Emitted AggregationTime metric: time={aggregation_time_ms:.2f}ms"
        log_fields = {
            'documentId': document_id,
            'aggregationTimeMs': aggregation_time_ms
        }
        logger.debug(message, extra=log_fields)

    except Exception as exc:
        # Best-effort: a metrics failure must not break the caller.
        reason = str(exc)
        logger.warning(
            f"Failed to emit AggregationTime metric: {reason}",
            extra={'documentId': document_id, 'error': reason}
        )
313
+
314
+
315
def emit_strategy_usage(
    strategy: str,
    document_id: Optional[str] = None
) -> None:
    """
    Record one use of a chunking strategy.

    Adds a "Strategy" dimension and a "StrategyUsage" count of 1 to the
    module-level Powertools ``metrics`` instance. Does nothing unless
    observability is enabled (ENABLE_METRICS=true). Any exception raised
    while recording is logged as a warning and never propagated.

    Args:
        strategy: Chunking strategy used (fixed-pages, token-based, hybrid)
        document_id: Optional document ID, used only as log context

    Requirements: 7.4
    """
    if not _is_metrics_enabled():
        return

    try:
        metrics.add_dimension(name="Strategy", value=strategy)
        metrics.add_metric(name="StrategyUsage", unit=MetricUnit.Count, value=1)

        message = f"Emitted StrategyUsage metric: strategy={strategy}"
        log_fields = {
            'documentId': document_id,
            'strategy': strategy
        }
        logger.debug(message, extra=log_fields)

    except Exception as exc:
        # Best-effort: a metrics failure must not break the caller.
        reason = str(exc)
        logger.warning(
            f"Failed to emit StrategyUsage metric: {reason}",
            extra={'documentId': document_id, 'error': reason}
        )
351
+
352
+
353
def emit_chunking_metrics(
    document_id: str,
    strategy: str,
    requires_chunking: bool,
    chunk_count: int = 0,
    tokens_per_chunk: Optional[List[int]] = None,
    processing_time_ms: float = 0,
    processing_mode: str = 'parallel'
) -> None:
    """
    Emit the full set of chunking metrics for one document in a single call.

    Always calls emit_chunking_operation and emit_strategy_usage. The
    chunk-specific emitters (chunk count, tokens per chunk, processing time)
    are called only when chunking was required and at least one chunk was
    produced; tokens and processing time are additionally skipped when
    their argument is empty / not positive.

    NOTE(review): the nesting of the tokens-per-chunk and processing-time
    checks under the chunking guard was reconstructed from a listing that
    had lost its indentation — confirm against the packaged file.

    Args:
        document_id: Document identifier
        strategy: Chunking strategy used
        requires_chunking: Whether chunking was required
        chunk_count: Number of chunks created (if chunking was required)
        tokens_per_chunk: List of token counts per chunk
        processing_time_ms: Total processing time in milliseconds
        processing_mode: Processing mode (sequential, parallel)

    Requirements: 7.4
    """
    # Operation and strategy-usage metrics are emitted for every document.
    emit_chunking_operation(strategy, requires_chunking, document_id)
    emit_strategy_usage(strategy, document_id)

    # Everything below is chunk-specific and only applies when chunking ran.
    chunking_performed = requires_chunking and chunk_count > 0
    if not chunking_performed:
        return

    emit_chunk_count(chunk_count, strategy, document_id)

    if tokens_per_chunk:
        emit_tokens_per_chunk(tokens_per_chunk, strategy, document_id)

    if processing_time_ms > 0:
        emit_chunk_processing_time(processing_time_ms, processing_mode, document_id)
393
+
394
+
395
def timed_operation(metric_name: str = 'OperationTime'):
    """
    Decorator factory that measures how long the wrapped function runs.

    The elapsed time is added to the module-level Powertools ``metrics``
    instance as ``metric_name`` (unit: milliseconds). It is recorded in a
    ``finally`` clause, so the timing is captured whether the wrapped
    function returns or raises; the wrapped function's return value or
    exception is passed through unchanged.

    The duration is measured with ``time.perf_counter()``, a monotonic
    clock, so system clock adjustments cannot produce negative or skewed
    timings.

    Only emits metrics when observability is enabled (ENABLE_METRICS=true).
    A failure while recording the metric is logged as a warning and never
    propagated.

    Args:
        metric_name: Name of the metric to emit

    Returns:
        A decorator that wraps a function with the timing logic
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            started = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                if _is_metrics_enabled():
                    elapsed_ms = (time.perf_counter() - started) * 1000
                    try:
                        metrics.add_metric(
                            name=metric_name,
                            unit=MetricUnit.Milliseconds,
                            value=elapsed_ms
                        )
                    except Exception as e:
                        # Best-effort: never mask the wrapped function's
                        # own result or exception with a metrics failure.
                        logger.warning(f"Failed to emit timing metric: {str(e)}")
        return wrapper
    return decorator
423
+
424
+
425
+ # Export the metrics instance for use with @metrics.log_metrics decorator
426
def get_metrics() -> Metrics:
    """
    Return the module-level Powertools Metrics instance.

    This is the same object every emit_* helper in this module adds metrics
    to; callers can use it for the @metrics.log_metrics decorator.

    Returns:
        Metrics instance
    """
    return metrics
@@ -0,0 +1,3 @@
1
+ PyPDF2>=3.0.0
2
+ boto3>=1.26.0
3
+ aws-lambda-powertools>=2.0.0