@cdklabs/cdk-appmod-catalog-blueprints 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +2537 -204
- package/lib/document-processing/adapter/adapter.d.ts +4 -2
- package/lib/document-processing/adapter/adapter.js +1 -1
- package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
- package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
- package/lib/document-processing/agentic-document-processing.d.ts +4 -0
- package/lib/document-processing/agentic-document-processing.js +20 -10
- package/lib/document-processing/base-document-processing.d.ts +54 -2
- package/lib/document-processing/base-document-processing.js +136 -82
- package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
- package/lib/document-processing/bedrock-document-processing.js +717 -77
- package/lib/document-processing/chunking-config.d.ts +614 -0
- package/lib/document-processing/chunking-config.js +5 -0
- package/lib/document-processing/default-document-processing-config.js +1 -1
- package/lib/document-processing/index.d.ts +1 -0
- package/lib/document-processing/index.js +2 -1
- package/lib/document-processing/resources/aggregation/handler.py +567 -0
- package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
- package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
- package/lib/document-processing/resources/cleanup/handler.py +276 -0
- package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
- package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
- package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
- package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
- package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
- package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
- package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
- package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
- package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
- package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
- package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
- package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
- package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
- package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
- package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
- package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
- package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
- package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
- package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
- package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
- package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
- package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
- package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
- package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
- package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
- package/lib/document-processing/tests/base-document-processing.test.js +114 -8
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
- package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
- package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
- package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
- package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
- package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
- package/lib/document-processing/tests/chunking-config.test.js +238 -0
- package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
- package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
- package/lib/framework/agents/base-agent.js +1 -1
- package/lib/framework/agents/batch-agent.js +1 -1
- package/lib/framework/agents/default-agent-config.js +1 -1
- package/lib/framework/bedrock/bedrock.js +1 -1
- package/lib/framework/custom-resource/default-runtimes.js +1 -1
- package/lib/framework/foundation/access-log.js +1 -1
- package/lib/framework/foundation/eventbridge-broker.js +1 -1
- package/lib/framework/foundation/network.js +1 -1
- package/lib/framework/tests/access-log.test.js +5 -2
- package/lib/framework/tests/batch-agent.test.js +5 -2
- package/lib/framework/tests/bedrock.test.js +5 -2
- package/lib/framework/tests/eventbridge-broker.test.js +5 -2
- package/lib/framework/tests/framework-nag.test.js +16 -8
- package/lib/framework/tests/network.test.js +9 -4
- package/lib/tsconfig.tsbuildinfo +1 -1
- package/lib/utilities/data-loader.js +1 -1
- package/lib/utilities/lambda-iam-utils.js +1 -1
- package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
- package/lib/utilities/observability/default-observability-config.js +1 -1
- package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
- package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
- package/lib/utilities/observability/powertools-config.d.ts +10 -1
- package/lib/utilities/observability/powertools-config.js +19 -3
- package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
- package/lib/utilities/test-utils.d.ts +43 -0
- package/lib/utilities/test-utils.js +56 -0
- package/lib/utilities/tests/data-loader-nag.test.js +3 -2
- package/lib/utilities/tests/data-loader.test.js +3 -2
- package/lib/webapp/frontend-construct.js +1 -1
- package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
- package/lib/webapp/tests/frontend-construct.test.js +3 -2
- package/package.json +6 -5
- package/lib/document-processing/resources/default-error-handler/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
- package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
package/lib/document-processing/index.js
@@ -19,4 +19,5 @@ __exportStar(require("./bedrock-document-processing"), exports);
 __exportStar(require("./agentic-document-processing"), exports);
 __exportStar(require("./adapter"), exports);
 __exportStar(require("./default-document-processing-config"), exports);
-
+__exportStar(require("./chunking-config"), exports);
+//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi91c2UtY2FzZXMvZG9jdW1lbnQtcHJvY2Vzc2luZy9pbmRleC50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiOzs7Ozs7Ozs7Ozs7Ozs7O0FBQUEsNkRBQTJDO0FBQzNDLGdFQUE4QztBQUM5QyxnRUFBOEM7QUFDOUMsNENBQTBCO0FBQzFCLHVFQUFxRDtBQUNyRCxvREFBa0MiLCJzb3VyY2VzQ29udGVudCI6WyJleHBvcnQgKiBmcm9tICcuL2Jhc2UtZG9jdW1lbnQtcHJvY2Vzc2luZyc7XG5leHBvcnQgKiBmcm9tICcuL2JlZHJvY2stZG9jdW1lbnQtcHJvY2Vzc2luZyc7XG5leHBvcnQgKiBmcm9tICcuL2FnZW50aWMtZG9jdW1lbnQtcHJvY2Vzc2luZyc7XG5leHBvcnQgKiBmcm9tICcuL2FkYXB0ZXInO1xuZXhwb3J0ICogZnJvbSAnLi9kZWZhdWx0LWRvY3VtZW50LXByb2Nlc3NpbmctY29uZmlnJztcbmV4cG9ydCAqIGZyb20gJy4vY2h1bmtpbmctY29uZmlnJzsiXX0=
package/lib/document-processing/resources/aggregation/handler.py (new file)
@@ -0,0 +1,567 @@
"""
Aggregation Lambda Handler

This Lambda function aggregates results from multiple chunks into a coherent final result.
It implements majority voting for classification and entity deduplication.

Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 7.2, 7.3, 7.4, 7.5
"""

import json
import logging
import os
import sys
import time
import random
from typing import Dict, Any, List, Optional, Tuple
from collections import Counter
import boto3
from botocore.exceptions import ClientError

from aws_lambda_powertools import Metrics
from aws_lambda_powertools.metrics import MetricUnit

# Try to import structured logging from pdf-chunking module
# Fall back to standard logging if not available
try:
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'pdf-chunking'))
    from structured_logging import (
        get_logger,
        log_chunking_operation,
        with_correlation_id,
        is_observability_enabled
    )
    structured_logger = get_logger(__name__)
    USE_STRUCTURED_LOGGING = True
except ImportError:
    USE_STRUCTURED_LOGGING = False
    structured_logger = None

# Configure standard logging as fallback
logger = logging.getLogger()
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))

# Check if metrics are enabled via environment variable
# This is set by the CDK construct when enableObservability is true
METRICS_ENABLED = os.environ.get('ENABLE_METRICS', 'false').lower() == 'true'

# Initialize Powertools Metrics
metrics = Metrics()

# Initialize DynamoDB client
dynamodb = boto3.resource('dynamodb')


class AggregationError(Exception):
    """Base exception for aggregation errors."""

    def __init__(
        self,
        message: str,
        error_type: str,
        document_id: Optional[str] = None,
        recoverable: bool = False,
        details: Optional[Dict[str, Any]] = None
    ):
        super().__init__(message)
        self.message = message
        self.error_type = error_type
        self.document_id = document_id
        self.recoverable = recoverable
        self.details = details or {}


class DynamoDBWriteError(AggregationError):
    """Raised when DynamoDB write operation fails."""

    def __init__(
        self,
        message: str,
        document_id: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None
    ):
        super().__init__(
            message=message,
            error_type='DynamoDBWriteError',
            document_id=document_id,
            recoverable=True,
            details=details
        )


def retry_with_exponential_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 30.0,
    jitter: bool = True,
    retryable_exceptions: Tuple = (ClientError,)
):
    """
    Decorator for retrying operations with exponential backoff.

    Args:
        max_retries: Maximum number of retry attempts
        base_delay: Base delay in seconds between retries
        max_delay: Maximum delay in seconds
        jitter: Whether to add random jitter to delay
        retryable_exceptions: Tuple of exception types to retry

    Returns:
        Decorated function
    """
    def decorator(func):
        def wrapper(*args, **kwargs):
            last_exception = None

            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except retryable_exceptions as e:
                    # Check if it's a retryable DynamoDB error
                    if isinstance(e, ClientError):
                        error_code = e.response.get('Error', {}).get('Code', '')
                        if error_code not in [
                            'ProvisionedThroughputExceededException',
                            'ThrottlingException',
                            'InternalServerError',
                            'ServiceUnavailable'
                        ]:
                            # Non-retryable error
                            raise

                    last_exception = e

                    if attempt < max_retries:
                        # Calculate delay with exponential backoff
                        delay = min(base_delay * (2 ** attempt), max_delay)

                        # Add jitter if enabled
                        if jitter:
                            delay = delay * (0.5 + random.random())

                        logger.warning(
                            f"Retryable error on attempt {attempt + 1}/{max_retries + 1}: "
                            f"{str(e)}. Retrying in {delay:.2f}s",
                            extra={
                                'attempt': attempt + 1,
                                'maxRetries': max_retries + 1,
                                'delay': delay,
                                'errorType': type(e).__name__
                            }
                        )

                        time.sleep(delay)
                    else:
                        logger.error(
                            f"Max retries ({max_retries + 1}) exceeded: {str(e)}",
                            extra={
                                'maxRetries': max_retries + 1,
                                'errorType': type(e).__name__
                            }
                        )
                        raise DynamoDBWriteError(
                            message=f"DynamoDB write failed after {max_retries + 1} attempts: {str(e)}",
                            document_id=kwargs.get('document_id'),
                            details={'original_error': str(e)}
                        )

            # Should not reach here, but raise last exception if we do
            if last_exception:
                raise last_exception

        return wrapper
    return decorator


@retry_with_exponential_backoff(max_retries=3, base_delay=1.0)
def write_to_dynamodb(
    table_name: str,
    document_id: str,
    aggregated_result: Dict[str, Any]
) -> None:
    """
    Write aggregated result to DynamoDB with retry logic.

    Args:
        table_name: DynamoDB table name
        document_id: Document identifier
        aggregated_result: Aggregated result to store

    Raises:
        DynamoDBWriteError: If write fails after all retries
    """
    table = dynamodb.Table(table_name)

    table.update_item(
        Key={'DocumentId': document_id},
        UpdateExpression='SET AggregatedResult = :result, WorkflowStatus = :status',
        ExpressionAttributeValues={
            ':result': json.dumps(aggregated_result),
            ':status': 'complete'
        }
    )

    logger.info(
        f"Successfully wrote aggregated result to DynamoDB for document {document_id}",
        extra={'documentId': document_id}
    )


@metrics.log_metrics
def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
    """
    Lambda function handler for aggregating chunk results.

    Args:
        event: Lambda event payload containing:
            - documentId: Document identifier
            - chunkResults: Array of chunk processing results
            - aggregationStrategy: Strategy to use (default: 'majority-vote')
        context: Lambda context object

    Returns:
        AggregatedResult dictionary with:
        - documentId: Document identifier
        - classification: Aggregated classification
        - classificationConfidence: Confidence score (0-1)
        - entities: Deduplicated entities
        - chunksSummary: Summary of chunk processing
        - partialResult: Whether result is partial due to failures
    """
    start_time = time.time()

    # Set up structured logging context if available
    if USE_STRUCTURED_LOGGING and structured_logger:
        # Get correlation ID from event
        correlation_id = event.get('correlationId')
        structured_logger.set_correlation_id(correlation_id)

    try:
        # Parse event
        document_id = event.get('documentId')
        chunk_results = event.get('chunkResults', [])
        aggregation_strategy = event.get('aggregationStrategy', 'majority-vote')

        if not document_id:
            raise ValueError('Missing required field: documentId')

        if not chunk_results:
            raise ValueError('Missing required field: chunkResults')

        # Set document context for structured logging
        if USE_STRUCTURED_LOGGING and structured_logger:
            structured_logger.set_document_context(document_id=document_id)
            structured_logger.info(
                f'Aggregating results for document {document_id}',
                extra={
                    'event': 'aggregation_started',
                    'totalChunks': len(chunk_results),
                    'aggregationStrategy': aggregation_strategy
                }
            )
        else:
            logger.info(f'Aggregating results for document {document_id}', extra={
                'documentId': document_id,
                'totalChunks': len(chunk_results),
                'aggregationStrategy': aggregation_strategy
            })

        # Calculate chunks summary
        chunks_summary = calculate_chunks_summary(chunk_results)

        # Determine if result is partial (< 50% success threshold)
        success_rate = chunks_summary['successfulChunks'] / chunks_summary['totalChunks']
        partial_result = success_rate < 0.5

        # Handle insufficient successful chunks
        if partial_result:
            logger.warning(
                f'Insufficient successful chunks for document {document_id}: '
                f'{chunks_summary["successfulChunks"]}/{chunks_summary["totalChunks"]} '
                f'({success_rate:.1%})',
                extra={
                    'documentId': document_id,
                    'successfulChunks': chunks_summary['successfulChunks'],
                    'totalChunks': chunks_summary['totalChunks'],
                    'successRate': success_rate
                }
            )

        # Aggregate classifications
        classification, confidence = aggregate_classifications(chunk_results)

        # Deduplicate entities
        entities = deduplicate_entities(chunk_results)

        # Build aggregated result
        aggregated_result = {
            'documentId': document_id,
            'classification': classification,
            'classificationConfidence': confidence,
            'entities': entities,
            'chunksSummary': chunks_summary,
            'partialResult': partial_result
        }

        # Emit aggregation metrics (Requirements: 7.4)
        aggregation_time_ms = (time.time() - start_time) * 1000
        _emit_aggregation_metrics(
            document_id=document_id,
            aggregation_time_ms=aggregation_time_ms,
            total_chunks=chunks_summary['totalChunks'],
            failed_chunks=chunks_summary['failedChunks']
        )

        logger.info(
            f'Successfully aggregated results for document {document_id}',
            extra={
                'documentId': document_id,
                'classification': classification,
                'confidence': confidence,
                'entityCount': len(entities),
                'partialResult': partial_result,
                'aggregationTimeMs': aggregation_time_ms
            }
        )

        return aggregated_result

    except Exception as e:
        logger.error(
            f'Error aggregating results for document {event.get("documentId", "unknown")}',
            extra={
                'documentId': event.get('documentId', 'unknown'),
                'error': str(e),
                'errorType': type(e).__name__
            },
            exc_info=True
        )
        raise


def _emit_aggregation_metrics(
    document_id: str,
    aggregation_time_ms: float,
    total_chunks: int,
    failed_chunks: int
) -> None:
    """
    Emit CloudWatch metrics for aggregation operations.

    Only emits when observability is enabled (ENABLE_METRICS=true).

    Args:
        document_id: Document identifier
        aggregation_time_ms: Time taken for aggregation in milliseconds
        total_chunks: Total number of chunks processed
        failed_chunks: Number of failed chunks

    Requirements: 7.4
    """
    if not METRICS_ENABLED:
        return

    try:
        # Emit AggregationTime metric
        metrics.add_metric(
            name="AggregationTime",
            unit=MetricUnit.Milliseconds,
            value=aggregation_time_ms
        )

        # Emit ChunkFailureRate metric
        if total_chunks > 0:
            failure_rate = (failed_chunks / total_chunks) * 100
            metrics.add_metric(
                name="ChunkFailureRate",
                unit=MetricUnit.Percent,
                value=failure_rate
            )
            metrics.add_metric(
                name="FailedChunks",
                unit=MetricUnit.Count,
                value=failed_chunks
            )
            metrics.add_metric(
                name="TotalChunks",
                unit=MetricUnit.Count,
                value=total_chunks
            )

        logger.debug(
            f"Emitted aggregation metrics for document {document_id}",
            extra={
                'documentId': document_id,
                'aggregationTimeMs': aggregation_time_ms,
                'totalChunks': total_chunks,
                'failedChunks': failed_chunks
            }
        )

    except Exception as e:
        logger.warning(
            f"Failed to emit aggregation metrics: {str(e)}",
            extra={'documentId': document_id, 'error': str(e)}
        )


def calculate_chunks_summary(chunk_results: List[Dict[str, Any]]) -> Dict[str, int]:
    """
    Calculate summary statistics for chunk processing.

    Args:
        chunk_results: List of chunk processing results

    Returns:
        Dictionary with totalChunks, successfulChunks, failedChunks
    """
    total_chunks = len(chunk_results)
    failed_chunks = sum(1 for result in chunk_results if result.get('error'))
    successful_chunks = total_chunks - failed_chunks

    return {
        'totalChunks': total_chunks,
        'successfulChunks': successful_chunks,
        'failedChunks': failed_chunks
    }


def aggregate_classifications(chunk_results: List[Dict[str, Any]]) -> tuple[Optional[str], float]:
    """
    Aggregate classification results using majority voting.

    Strategy:
    - Count classification results from all chunks
    - Select the classification that appears most frequently
    - Calculate confidence as (count of majority / total chunks)
    - If tie, select first classification alphabetically

    Args:
        chunk_results: List of chunk processing results

    Returns:
        Tuple of (classification, confidence)
        Returns (None, 0.0) if no classifications found
    """
    classifications = []

    for result in chunk_results:
        # Skip failed chunks
        if result.get('error'):
            continue

        # Extract classification result
        classification_result = result.get('classificationResult')
        if classification_result:
            classification = classification_result.get('documentClassification')
            if classification:
                classifications.append(classification)

    if not classifications:
        logger.warning('No classification results found in chunk results')
        return None, 0.0

    # Count occurrences
    classification_counts = Counter(classifications)

    # Get most common classification
    # If tie, Counter.most_common() returns them in order of first occurrence
    # We'll sort alphabetically to ensure deterministic behavior
    max_count = max(classification_counts.values())
    most_common = [
        cls for cls, count in classification_counts.items()
        if count == max_count
    ]

    # Sort alphabetically to handle ties deterministically
    most_common.sort()
    majority_classification = most_common[0]

    # Calculate confidence
    confidence = classification_counts[majority_classification] / len(classifications)

    logger.info(
        f'Aggregated classification: {majority_classification} '
        f'(confidence: {confidence:.2%}, votes: {classification_counts[majority_classification]}/{len(classifications)})',
        extra={
            'classification': majority_classification,
            'confidence': confidence,
            'votes': classification_counts[majority_classification],
            'totalVotes': len(classifications),
            'allClassifications': dict(classification_counts)
        }
    )

    return majority_classification, confidence


def deduplicate_entities(chunk_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate entities from multiple chunks.

    Strategy:
    - Combine entities from all chunks
    - Remove exact duplicates by (type, value) for entities without page numbers
    - Preserve all entities with page numbers (may appear on multiple pages)
    - Sort entities by chunk index and page number

    Args:
        chunk_results: List of chunk processing results

    Returns:
        List of deduplicated entities
    """
    entities = []
    seen_without_page = set()

    for result in chunk_results:
        # Skip failed chunks
        if result.get('error'):
            continue

        # Extract processing result
        processing_result = result.get('processingResult')
        if not processing_result:
            continue

        chunk_index = result.get('chunkIndex', 0)
        chunk_entities = processing_result.get('entities', [])

        for entity in chunk_entities:
            entity_type = entity.get('type')
            entity_value = entity.get('value')

            if not entity_type or not entity_value:
                continue

            # Add chunk index to entity for sorting
            entity_with_chunk = {**entity, 'chunkIndex': chunk_index}

            # For entities without page numbers, deduplicate by (type, value)
            if 'page' not in entity:
                key = (entity_type, entity_value)
                if key not in seen_without_page:
                    entities.append(entity_with_chunk)
                    seen_without_page.add(key)
            else:
                # Keep all instances with page numbers
                entities.append(entity_with_chunk)

    # Sort entities by chunk index and page number
    def sort_key(entity):
        chunk_idx = entity.get('chunkIndex', 0)
        page = entity.get('page', 0)
        return (chunk_idx, page)

    entities.sort(key=sort_key)

    logger.info(
        f'Deduplicated entities: {len(entities)} total',
        extra={
            'totalEntities': len(entities),
            'entitiesWithoutPage': len(seen_without_page),
            'entitiesWithPage': len(entities) - len(seen_without_page)
        }
    )

    return entities
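
Editor's note: the sketch below is a minimal illustration of the event/result contract documented in handler.py above; it is not part of the package. Field names (documentId, chunkResults, chunkIndex, classificationResult.documentClassification, processingResult.entities) follow the docstrings, and the sample values are invented. Invoking handler() directly would additionally require boto3, aws-lambda-powertools, and an AWS region configured (a DynamoDB resource is created at import time), so the sketch only re-derives the majority-vote math.

# Illustrative only, not part of the package. Assumes three chunk results,
# two classified as "invoice" and one as "receipt".
from collections import Counter

sample_event = {
    'documentId': 'doc-123',
    'chunkResults': [
        {'chunkIndex': 0,
         'classificationResult': {'documentClassification': 'invoice'},
         'processingResult': {'entities': [{'type': 'invoiceNumber', 'value': 'INV-42', 'page': 1}]}},
        {'chunkIndex': 1,
         'classificationResult': {'documentClassification': 'invoice'},
         'processingResult': {'entities': [{'type': 'invoiceNumber', 'value': 'INV-42', 'page': 7}]}},
        {'chunkIndex': 2,
         'classificationResult': {'documentClassification': 'receipt'},
         'processingResult': {'entities': []}},
    ],
}

# Majority vote and confidence, as described in aggregate_classifications():
# count votes from successful chunks, break ties alphabetically, and divide
# the winning count by the total number of votes.
votes = Counter(
    r['classificationResult']['documentClassification']
    for r in sample_event['chunkResults']
    if not r.get('error')
)
max_count = max(votes.values())
classification = sorted(c for c, n in votes.items() if n == max_count)[0]
confidence = votes[classification] / sum(votes.values())

print(classification, round(confidence, 2))  # -> invoice 0.67

For this sample, the handler's aggregated result would report classification 'invoice' with classificationConfidence of roughly 0.67, partialResult false (3 of 3 chunks succeeded), and both invoice-number entities retained, since deduplicate_entities() never collapses entities that carry a page field.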