@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
package/lib/document-processing/index.js
@@ -19,4 +19,5 @@ __exportStar(require("./bedrock-document-processing"), exports);
  __exportStar(require("./agentic-document-processing"), exports);
  __exportStar(require("./adapter"), exports);
  __exportStar(require("./default-document-processing-config"), exports);
- //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi91c2UtY2FzZXMvZG9jdW1lbnQtcHJvY2Vzc2luZy9pbmRleC50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiOzs7Ozs7Ozs7Ozs7Ozs7O0FBQUEsNkRBQTJDO0FBQzNDLGdFQUE4QztBQUM5QyxnRUFBOEM7QUFDOUMsNENBQTBCO0FBQzFCLHVFQUFxRCIsInNvdXJjZXNDb250ZW50IjpbImV4cG9ydCAqIGZyb20gJy4vYmFzZS1kb2N1bWVudC1wcm9jZXNzaW5nJztcbmV4cG9ydCAqIGZyb20gJy4vYmVkcm9ja-WRvY3VtZW50LXByb2Nlc3NpbmcnO1xuZXhwb3J0ICogZnJvbSAnLi9hZ2VudGljLWRvY3VtZW50LXByb2Nlc3NpbmcnO1xuZXhwb3J0ICogZnJvbSAnLi9hZGFwdGVyJztcbmV4cG9ydCAqIGZyb20gJy4vZGVmYXVsdC1kb2N1bWVudC1wcm9jZXNzaW5nLWNvbmZpZyc7Il19
+ __exportStar(require("./chunking-config"), exports);
+ //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi91c2UtY2FzZXMvZG9jdW1lbnQtcHJvY2Vzc2luZy9pbmRleC50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiOzs7Ozs7Ozs7Ozs7Ozs7O0FBQUEsNkRBQTJDO0FBQzNDLGdFQUE4QztBQUM5QyxnRUFBOEM7QUFDOUMsNENBQTBCO0FBQzFCLHVFQUFxRDtBQUNyRCxvREFBa0MiLCJzb3VyY2VzQ29udGVudCI6WyJleHBvcnQgKiBmcm9tICcuL2Jhc2UtZG9jdW1lbnQtcHJvY2Vzc2luZyc7XG5leHBvcnQgKiBmcm9tICcuL2JlZHJvY2stZG9jdW1lbnQtcHJvY2Vzc2luZyc7XG5leHBvcnQgKiBmcm9tICcuL2FnZW50aWMtZG9jdW1lbnQtcHJvY2Vzc2luZyc7XG5leHBvcnQgKiBmcm9tICcuL2FkYXB0ZXInO1xuZXhwb3J0ICogZnJvbSAnLi9kZWZhdWx0LWRvY3VtZW50LXByb2Nlc3NpbmctY29uZmlnJztcbmV4cG9ydCAqIGZyb20gJy4vY2h1bmtpbmctY29uZmlnJzsiXX0=
package/lib/document-processing/resources/aggregation/handler.py
@@ -0,0 +1,567 @@
+ """
+ Aggregation Lambda Handler
+
+ This Lambda function aggregates results from multiple chunks into a coherent final result.
+ It implements majority voting for classification and entity deduplication.
+
+ Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 7.2, 7.3, 7.4, 7.5
+ """
+
+ import json
+ import logging
+ import os
+ import sys
+ import time
+ import random
+ from typing import Dict, Any, List, Optional, Tuple
+ from collections import Counter
+ import boto3
+ from botocore.exceptions import ClientError
+
+ from aws_lambda_powertools import Metrics
+ from aws_lambda_powertools.metrics import MetricUnit
+
+ # Try to import structured logging from pdf-chunking module
+ # Fall back to standard logging if not available
+ try:
+     sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'pdf-chunking'))
+     from structured_logging import (
+         get_logger,
+         log_chunking_operation,
+         with_correlation_id,
+         is_observability_enabled
+     )
+     structured_logger = get_logger(__name__)
+     USE_STRUCTURED_LOGGING = True
+ except ImportError:
+     USE_STRUCTURED_LOGGING = False
+     structured_logger = None
+
+ # Configure standard logging as fallback
+ logger = logging.getLogger()
+ logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
+
+ # Check if metrics are enabled via environment variable
+ # This is set by the CDK construct when enableObservability is true
+ METRICS_ENABLED = os.environ.get('ENABLE_METRICS', 'false').lower() == 'true'
+
+ # Initialize Powertools Metrics
+ metrics = Metrics()
+
+ # Initialize DynamoDB client
+ dynamodb = boto3.resource('dynamodb')
+
+
+ class AggregationError(Exception):
+     """Base exception for aggregation errors."""
+
+     def __init__(
+         self,
+         message: str,
+         error_type: str,
+         document_id: Optional[str] = None,
+         recoverable: bool = False,
+         details: Optional[Dict[str, Any]] = None
+     ):
+         super().__init__(message)
+         self.message = message
+         self.error_type = error_type
+         self.document_id = document_id
+         self.recoverable = recoverable
+         self.details = details or {}
+
+
+ class DynamoDBWriteError(AggregationError):
+     """Raised when DynamoDB write operation fails."""
+
+     def __init__(
+         self,
+         message: str,
+         document_id: Optional[str] = None,
+         details: Optional[Dict[str, Any]] = None
+     ):
+         super().__init__(
+             message=message,
+             error_type='DynamoDBWriteError',
+             document_id=document_id,
+             recoverable=True,
+             details=details
+         )
+
+
+ def retry_with_exponential_backoff(
+     max_retries: int = 3,
+     base_delay: float = 1.0,
+     max_delay: float = 30.0,
+     jitter: bool = True,
+     retryable_exceptions: Tuple = (ClientError,)
+ ):
+     """
+     Decorator for retrying operations with exponential backoff.
+
+     Args:
+         max_retries: Maximum number of retry attempts
+         base_delay: Base delay in seconds between retries
+         max_delay: Maximum delay in seconds
+         jitter: Whether to add random jitter to delay
+         retryable_exceptions: Tuple of exception types to retry
+
+     Returns:
+         Decorated function
+     """
+     def decorator(func):
+         def wrapper(*args, **kwargs):
+             last_exception = None
+
+             for attempt in range(max_retries + 1):
+                 try:
+                     return func(*args, **kwargs)
+                 except retryable_exceptions as e:
+                     # Check if it's a retryable DynamoDB error
+                     if isinstance(e, ClientError):
+                         error_code = e.response.get('Error', {}).get('Code', '')
+                         if error_code not in [
+                             'ProvisionedThroughputExceededException',
+                             'ThrottlingException',
+                             'InternalServerError',
+                             'ServiceUnavailable'
+                         ]:
+                             # Non-retryable error
+                             raise
+
+                     last_exception = e
+
+                     if attempt < max_retries:
+                         # Calculate delay with exponential backoff
+                         delay = min(base_delay * (2 ** attempt), max_delay)
+
+                         # Add jitter if enabled
+                         if jitter:
+                             delay = delay * (0.5 + random.random())
+
+                         logger.warning(
+                             f"Retryable error on attempt {attempt + 1}/{max_retries + 1}: "
+                             f"{str(e)}. Retrying in {delay:.2f}s",
+                             extra={
+                                 'attempt': attempt + 1,
+                                 'maxRetries': max_retries + 1,
+                                 'delay': delay,
+                                 'errorType': type(e).__name__
+                             }
+                         )
+
+                         time.sleep(delay)
+                     else:
+                         logger.error(
+                             f"Max retries ({max_retries + 1}) exceeded: {str(e)}",
+                             extra={
+                                 'maxRetries': max_retries + 1,
+                                 'errorType': type(e).__name__
+                             }
+                         )
+                         raise DynamoDBWriteError(
+                             message=f"DynamoDB write failed after {max_retries + 1} attempts: {str(e)}",
+                             document_id=kwargs.get('document_id'),
+                             details={'original_error': str(e)}
+                         )
+
+             # Should not reach here, but raise last exception if we do
+             if last_exception:
+                 raise last_exception
+
+         return wrapper
+     return decorator
+
+
+ @retry_with_exponential_backoff(max_retries=3, base_delay=1.0)
+ def write_to_dynamodb(
+     table_name: str,
+     document_id: str,
+     aggregated_result: Dict[str, Any]
+ ) -> None:
+     """
+     Write aggregated result to DynamoDB with retry logic.
+
+     Args:
+         table_name: DynamoDB table name
+         document_id: Document identifier
+         aggregated_result: Aggregated result to store
+
+     Raises:
+         DynamoDBWriteError: If write fails after all retries
+     """
+     table = dynamodb.Table(table_name)
+
+     table.update_item(
+         Key={'DocumentId': document_id},
+         UpdateExpression='SET AggregatedResult = :result, WorkflowStatus = :status',
+         ExpressionAttributeValues={
+             ':result': json.dumps(aggregated_result),
+             ':status': 'complete'
+         }
+     )
+
+     logger.info(
+         f"Successfully wrote aggregated result to DynamoDB for document {document_id}",
+         extra={'documentId': document_id}
+     )
+
+
+ @metrics.log_metrics
+ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+     """
+     Lambda function handler for aggregating chunk results.
+
+     Args:
+         event: Lambda event payload containing:
+             - documentId: Document identifier
+             - chunkResults: Array of chunk processing results
+             - aggregationStrategy: Strategy to use (default: 'majority-vote')
+         context: Lambda context object
+
+     Returns:
+         AggregatedResult dictionary with:
+             - documentId: Document identifier
+             - classification: Aggregated classification
+             - classificationConfidence: Confidence score (0-1)
+             - entities: Deduplicated entities
+             - chunksSummary: Summary of chunk processing
+             - partialResult: Whether result is partial due to failures
+     """
+     start_time = time.time()
+
+     # Set up structured logging context if available
+     if USE_STRUCTURED_LOGGING and structured_logger:
+         # Get correlation ID from event
+         correlation_id = event.get('correlationId')
+         structured_logger.set_correlation_id(correlation_id)
+
+     try:
+         # Parse event
+         document_id = event.get('documentId')
+         chunk_results = event.get('chunkResults', [])
+         aggregation_strategy = event.get('aggregationStrategy', 'majority-vote')
+
+         if not document_id:
+             raise ValueError('Missing required field: documentId')
+
+         if not chunk_results:
+             raise ValueError('Missing required field: chunkResults')
+
+         # Set document context for structured logging
+         if USE_STRUCTURED_LOGGING and structured_logger:
+             structured_logger.set_document_context(document_id=document_id)
+             structured_logger.info(
+                 f'Aggregating results for document {document_id}',
+                 extra={
+                     'event': 'aggregation_started',
+                     'totalChunks': len(chunk_results),
+                     'aggregationStrategy': aggregation_strategy
+                 }
+             )
+         else:
+             logger.info(f'Aggregating results for document {document_id}', extra={
+                 'documentId': document_id,
+                 'totalChunks': len(chunk_results),
+                 'aggregationStrategy': aggregation_strategy
+             })
+
+         # Calculate chunks summary
+         chunks_summary = calculate_chunks_summary(chunk_results)
+
+         # Determine if result is partial (< 50% success threshold)
+         success_rate = chunks_summary['successfulChunks'] / chunks_summary['totalChunks']
+         partial_result = success_rate < 0.5
+
+         # Handle insufficient successful chunks
+         if partial_result:
+             logger.warning(
+                 f'Insufficient successful chunks for document {document_id}: '
+                 f'{chunks_summary["successfulChunks"]}/{chunks_summary["totalChunks"]} '
+                 f'({success_rate:.1%})',
+                 extra={
+                     'documentId': document_id,
+                     'successfulChunks': chunks_summary['successfulChunks'],
+                     'totalChunks': chunks_summary['totalChunks'],
+                     'successRate': success_rate
+                 }
+             )
+
+         # Aggregate classifications
+         classification, confidence = aggregate_classifications(chunk_results)
+
+         # Deduplicate entities
+         entities = deduplicate_entities(chunk_results)
+
+         # Build aggregated result
+         aggregated_result = {
+             'documentId': document_id,
+             'classification': classification,
+             'classificationConfidence': confidence,
+             'entities': entities,
+             'chunksSummary': chunks_summary,
+             'partialResult': partial_result
+         }
+
+         # Emit aggregation metrics (Requirements: 7.4)
+         aggregation_time_ms = (time.time() - start_time) * 1000
+         _emit_aggregation_metrics(
+             document_id=document_id,
+             aggregation_time_ms=aggregation_time_ms,
+             total_chunks=chunks_summary['totalChunks'],
+             failed_chunks=chunks_summary['failedChunks']
+         )
+
+         logger.info(
+             f'Successfully aggregated results for document {document_id}',
+             extra={
+                 'documentId': document_id,
+                 'classification': classification,
+                 'confidence': confidence,
+                 'entityCount': len(entities),
+                 'partialResult': partial_result,
+                 'aggregationTimeMs': aggregation_time_ms
+             }
+         )
+
+         return aggregated_result
+
+     except Exception as e:
+         logger.error(
+             f'Error aggregating results for document {event.get("documentId", "unknown")}',
+             extra={
+                 'documentId': event.get('documentId', 'unknown'),
+                 'error': str(e),
+                 'errorType': type(e).__name__
+             },
+             exc_info=True
+         )
+         raise
+
+
+ def _emit_aggregation_metrics(
+     document_id: str,
+     aggregation_time_ms: float,
+     total_chunks: int,
+     failed_chunks: int
+ ) -> None:
+     """
+     Emit CloudWatch metrics for aggregation operations.
+
+     Only emits when observability is enabled (ENABLE_METRICS=true).
+
+     Args:
+         document_id: Document identifier
+         aggregation_time_ms: Time taken for aggregation in milliseconds
+         total_chunks: Total number of chunks processed
+         failed_chunks: Number of failed chunks
+
+     Requirements: 7.4
+     """
+     if not METRICS_ENABLED:
+         return
+
+     try:
+         # Emit AggregationTime metric
+         metrics.add_metric(
+             name="AggregationTime",
+             unit=MetricUnit.Milliseconds,
+             value=aggregation_time_ms
+         )
+
+         # Emit ChunkFailureRate metric
+         if total_chunks > 0:
+             failure_rate = (failed_chunks / total_chunks) * 100
+             metrics.add_metric(
+                 name="ChunkFailureRate",
+                 unit=MetricUnit.Percent,
+                 value=failure_rate
+             )
+             metrics.add_metric(
+                 name="FailedChunks",
+                 unit=MetricUnit.Count,
+                 value=failed_chunks
+             )
+             metrics.add_metric(
+                 name="TotalChunks",
+                 unit=MetricUnit.Count,
+                 value=total_chunks
+             )
+
+         logger.debug(
+             f"Emitted aggregation metrics for document {document_id}",
+             extra={
+                 'documentId': document_id,
+                 'aggregationTimeMs': aggregation_time_ms,
+                 'totalChunks': total_chunks,
+                 'failedChunks': failed_chunks
+             }
+         )
+
+     except Exception as e:
+         logger.warning(
+             f"Failed to emit aggregation metrics: {str(e)}",
+             extra={'documentId': document_id, 'error': str(e)}
+         )
+
+
+ def calculate_chunks_summary(chunk_results: List[Dict[str, Any]]) -> Dict[str, int]:
+     """
+     Calculate summary statistics for chunk processing.
+
+     Args:
+         chunk_results: List of chunk processing results
+
+     Returns:
+         Dictionary with totalChunks, successfulChunks, failedChunks
+     """
+     total_chunks = len(chunk_results)
+     failed_chunks = sum(1 for result in chunk_results if result.get('error'))
+     successful_chunks = total_chunks - failed_chunks
+
+     return {
+         'totalChunks': total_chunks,
+         'successfulChunks': successful_chunks,
+         'failedChunks': failed_chunks
+     }
+
+
+ def aggregate_classifications(chunk_results: List[Dict[str, Any]]) -> tuple[Optional[str], float]:
+     """
+     Aggregate classification results using majority voting.
+
+     Strategy:
+     - Count classification results from all chunks
+     - Select the classification that appears most frequently
+     - Calculate confidence as (count of majority / total chunks)
+     - If tie, select first classification alphabetically
+
+     Args:
+         chunk_results: List of chunk processing results
+
+     Returns:
+         Tuple of (classification, confidence)
+         Returns (None, 0.0) if no classifications found
+     """
+     classifications = []
+
+     for result in chunk_results:
+         # Skip failed chunks
+         if result.get('error'):
+             continue
+
+         # Extract classification result
+         classification_result = result.get('classificationResult')
+         if classification_result:
+             classification = classification_result.get('documentClassification')
+             if classification:
+                 classifications.append(classification)
+
+     if not classifications:
+         logger.warning('No classification results found in chunk results')
+         return None, 0.0
+
+     # Count occurrences
+     classification_counts = Counter(classifications)
+
+     # Get most common classification
+     # If tie, Counter.most_common() returns them in order of first occurrence
+     # We'll sort alphabetically to ensure deterministic behavior
+     max_count = max(classification_counts.values())
+     most_common = [
+         cls for cls, count in classification_counts.items()
+         if count == max_count
+     ]
+
+     # Sort alphabetically to handle ties deterministically
+     most_common.sort()
+     majority_classification = most_common[0]
+
+     # Calculate confidence
+     confidence = classification_counts[majority_classification] / len(classifications)
+
+     logger.info(
+         f'Aggregated classification: {majority_classification} '
+         f'(confidence: {confidence:.2%}, votes: {classification_counts[majority_classification]}/{len(classifications)})',
+         extra={
+             'classification': majority_classification,
+             'confidence': confidence,
+             'votes': classification_counts[majority_classification],
+             'totalVotes': len(classifications),
+             'allClassifications': dict(classification_counts)
+         }
+     )
+
+     return majority_classification, confidence
+
+
+ def deduplicate_entities(chunk_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """
+     Deduplicate entities from multiple chunks.
+
+     Strategy:
+     - Combine entities from all chunks
+     - Remove exact duplicates by (type, value) for entities without page numbers
+     - Preserve all entities with page numbers (may appear on multiple pages)
+     - Sort entities by chunk index and page number
+
+     Args:
+         chunk_results: List of chunk processing results
+
+     Returns:
+         List of deduplicated entities
+     """
+     entities = []
+     seen_without_page = set()
+
+     for result in chunk_results:
+         # Skip failed chunks
+         if result.get('error'):
+             continue
+
+         # Extract processing result
+         processing_result = result.get('processingResult')
+         if not processing_result:
+             continue
+
+         chunk_index = result.get('chunkIndex', 0)
+         chunk_entities = processing_result.get('entities', [])
+
+         for entity in chunk_entities:
+             entity_type = entity.get('type')
+             entity_value = entity.get('value')
+
+             if not entity_type or not entity_value:
+                 continue
+
+             # Add chunk index to entity for sorting
+             entity_with_chunk = {**entity, 'chunkIndex': chunk_index}
+
+             # For entities without page numbers, deduplicate by (type, value)
+             if 'page' not in entity:
+                 key = (entity_type, entity_value)
+                 if key not in seen_without_page:
+                     entities.append(entity_with_chunk)
+                     seen_without_page.add(key)
+             else:
+                 # Keep all instances with page numbers
+                 entities.append(entity_with_chunk)
+
+     # Sort entities by chunk index and page number
+     def sort_key(entity):
+         chunk_idx = entity.get('chunkIndex', 0)
+         page = entity.get('page', 0)
+         return (chunk_idx, page)
+
+     entities.sort(key=sort_key)
+
+     logger.info(
+         f'Deduplicated entities: {len(entities)} total',
+         extra={
+             'totalEntities': len(entities),
+             'entitiesWithoutPage': len(seen_without_page),
+             'entitiesWithPage': len(entities) - len(seen_without_page)
+         }
+     )
+
+     return entities
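
The voting and deduplication rules above are easy to check in isolation. Below is a minimal standalone sketch, not the packaged module: it restates the logic of aggregate_classifications and deduplicate_entities against a hand-built chunkResults payload. The field names follow the handler's docstring; the three sample chunks and their values are invented for illustration.

from collections import Counter

# Hypothetical chunkResults payload, shaped like the handler's input.
chunk_results = [
    {"chunkIndex": 0,
     "classificationResult": {"documentClassification": "INVOICE"},
     "processingResult": {"entities": [{"type": "VENDOR", "value": "Acme Corp"}]}},
    {"chunkIndex": 1,
     "classificationResult": {"documentClassification": "INVOICE"},
     "processingResult": {"entities": [{"type": "VENDOR", "value": "Acme Corp"},
                                       {"type": "DATE", "value": "2024-01-01", "page": 3}]}},
    {"chunkIndex": 2, "error": "timeout"},  # failed chunks are skipped everywhere
]

# Majority vote, as in aggregate_classifications: count votes from successful
# chunks, break ties alphabetically, confidence = winning votes / total votes.
votes = Counter(
    r["classificationResult"]["documentClassification"]
    for r in chunk_results
    if not r.get("error") and r.get("classificationResult")
)
top = max(votes.values())
winner = sorted(c for c, n in votes.items() if n == top)[0]
print(winner, votes[winner] / sum(votes.values()))  # INVOICE 1.0

# Dedup, as in deduplicate_entities: entities without a page number are
# deduplicated by (type, value); entities carrying a page number are always kept.
seen, entities = set(), []
for r in chunk_results:
    if r.get("error") or not r.get("processingResult"):
        continue
    for e in r["processingResult"].get("entities", []):
        tagged = {**e, "chunkIndex": r.get("chunkIndex", 0)}
        if "page" in e:
            entities.append(tagged)
        elif (e["type"], e["value"]) not in seen:
            entities.append(tagged)
            seen.add((e["type"], e["value"]))
entities.sort(key=lambda e: (e.get("chunkIndex", 0), e.get("page", 0)))
print(len(entities))  # 2 -- duplicate VENDOR collapsed, paged DATE kept
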
package/lib/document-processing/resources/aggregation/requirements.txt
@@ -0,0 +1,7 @@
+ # Aggregation Lambda Dependencies
+
+ # AWS SDK for Python
+ boto3>=1.26.0
+
+ # AWS Lambda Powertools for metrics (EMF format)
+ aws-lambda-powertools>=2.0.0
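
For a feel of the retry policy applied to write_to_dynamodb (max_retries=3, base_delay=1.0, plus the decorator defaults max_delay=30.0 and jitter enabled), the sketch below reproduces the decorator's delay formula on its own; the fixed seed is not part of the package and is only there to make the printed schedule reproducible.

import random

random.seed(0)  # fixed seed purely so the illustration is reproducible

base_delay, max_delay, max_retries = 1.0, 30.0, 3

# The decorator sleeps after each of the first max_retries failures.
for attempt in range(max_retries):
    # Same formula as retry_with_exponential_backoff: exponential growth
    # capped at max_delay, then a jitter factor in [0.5, 1.5).
    delay = min(base_delay * (2 ** attempt), max_delay)
    delay *= 0.5 + random.random()
    print(f"attempt {attempt + 1} failed -> retry in {delay:.2f}s")

# Expected ranges: ~0.5-1.5s, ~1-3s, ~2-6s; if the fourth attempt also
# fails, the decorator raises DynamoDBWriteError instead of sleeping again.
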