@cdklabs/cdk-appmod-catalog-blueprints 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.jsii +2579 -194
  2. package/lib/document-processing/adapter/adapter.d.ts +4 -2
  3. package/lib/document-processing/adapter/adapter.js +1 -1
  4. package/lib/document-processing/adapter/queued-s3-adapter.d.ts +9 -2
  5. package/lib/document-processing/adapter/queued-s3-adapter.js +29 -15
  6. package/lib/document-processing/agentic-document-processing.d.ts +4 -0
  7. package/lib/document-processing/agentic-document-processing.js +20 -10
  8. package/lib/document-processing/base-document-processing.d.ts +54 -2
  9. package/lib/document-processing/base-document-processing.js +136 -82
  10. package/lib/document-processing/bedrock-document-processing.d.ts +202 -2
  11. package/lib/document-processing/bedrock-document-processing.js +717 -77
  12. package/lib/document-processing/chunking-config.d.ts +614 -0
  13. package/lib/document-processing/chunking-config.js +5 -0
  14. package/lib/document-processing/default-document-processing-config.js +1 -1
  15. package/lib/document-processing/index.d.ts +1 -0
  16. package/lib/document-processing/index.js +2 -1
  17. package/lib/document-processing/resources/aggregation/handler.py +567 -0
  18. package/lib/document-processing/resources/aggregation/requirements.txt +7 -0
  19. package/lib/document-processing/resources/aggregation/test_handler.py +362 -0
  20. package/lib/document-processing/resources/cleanup/handler.py +276 -0
  21. package/lib/document-processing/resources/cleanup/requirements.txt +5 -0
  22. package/lib/document-processing/resources/cleanup/test_handler.py +436 -0
  23. package/lib/document-processing/resources/default-bedrock-invoke/index.py +85 -3
  24. package/lib/document-processing/resources/default-bedrock-invoke/test_index.py +622 -0
  25. package/lib/document-processing/resources/pdf-chunking/README.md +313 -0
  26. package/lib/document-processing/resources/pdf-chunking/chunking_strategies.py +460 -0
  27. package/lib/document-processing/resources/pdf-chunking/error_handling.py +491 -0
  28. package/lib/document-processing/resources/pdf-chunking/handler.py +958 -0
  29. package/lib/document-processing/resources/pdf-chunking/metrics.py +435 -0
  30. package/lib/document-processing/resources/pdf-chunking/requirements.txt +3 -0
  31. package/lib/document-processing/resources/pdf-chunking/strategy_selection.py +420 -0
  32. package/lib/document-processing/resources/pdf-chunking/structured_logging.py +457 -0
  33. package/lib/document-processing/resources/pdf-chunking/test_chunking_strategies.py +353 -0
  34. package/lib/document-processing/resources/pdf-chunking/test_error_handling.py +487 -0
  35. package/lib/document-processing/resources/pdf-chunking/test_handler.py +609 -0
  36. package/lib/document-processing/resources/pdf-chunking/test_integration.py +694 -0
  37. package/lib/document-processing/resources/pdf-chunking/test_metrics.py +532 -0
  38. package/lib/document-processing/resources/pdf-chunking/test_strategy_selection.py +471 -0
  39. package/lib/document-processing/resources/pdf-chunking/test_structured_logging.py +449 -0
  40. package/lib/document-processing/resources/pdf-chunking/test_token_estimation.py +374 -0
  41. package/lib/document-processing/resources/pdf-chunking/token_estimation.py +189 -0
  42. package/lib/document-processing/tests/agentic-document-processing-nag.test.js +4 -3
  43. package/lib/document-processing/tests/agentic-document-processing.test.js +488 -4
  44. package/lib/document-processing/tests/base-document-processing-nag.test.js +9 -2
  45. package/lib/document-processing/tests/base-document-processing-schema.test.d.ts +1 -0
  46. package/lib/document-processing/tests/base-document-processing-schema.test.js +337 -0
  47. package/lib/document-processing/tests/base-document-processing.test.js +114 -8
  48. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.d.ts +1 -0
  49. package/lib/document-processing/tests/bedrock-document-processing-chunking-nag.test.js +382 -0
  50. package/lib/document-processing/tests/bedrock-document-processing-nag.test.js +4 -3
  51. package/lib/document-processing/tests/bedrock-document-processing-security.test.d.ts +1 -0
  52. package/lib/document-processing/tests/bedrock-document-processing-security.test.js +389 -0
  53. package/lib/document-processing/tests/bedrock-document-processing.test.js +808 -8
  54. package/lib/document-processing/tests/chunking-config.test.d.ts +1 -0
  55. package/lib/document-processing/tests/chunking-config.test.js +238 -0
  56. package/lib/document-processing/tests/queued-s3-adapter-nag.test.js +9 -2
  57. package/lib/document-processing/tests/queued-s3-adapter.test.js +17 -6
  58. package/lib/framework/agents/base-agent.js +1 -1
  59. package/lib/framework/agents/batch-agent.js +1 -1
  60. package/lib/framework/agents/default-agent-config.js +1 -1
  61. package/lib/framework/bedrock/bedrock.js +1 -1
  62. package/lib/framework/custom-resource/default-runtimes.js +1 -1
  63. package/lib/framework/foundation/access-log.js +1 -1
  64. package/lib/framework/foundation/eventbridge-broker.js +1 -1
  65. package/lib/framework/foundation/network.d.ts +4 -2
  66. package/lib/framework/foundation/network.js +52 -41
  67. package/lib/framework/tests/access-log.test.js +5 -2
  68. package/lib/framework/tests/batch-agent.test.js +5 -2
  69. package/lib/framework/tests/bedrock.test.js +5 -2
  70. package/lib/framework/tests/eventbridge-broker.test.js +5 -2
  71. package/lib/framework/tests/framework-nag.test.js +26 -7
  72. package/lib/framework/tests/network.test.js +30 -2
  73. package/lib/tsconfig.tsbuildinfo +1 -1
  74. package/lib/utilities/data-loader.js +1 -1
  75. package/lib/utilities/lambda-iam-utils.js +1 -1
  76. package/lib/utilities/observability/cloudfront-distribution-observability-property-injector.js +1 -1
  77. package/lib/utilities/observability/default-observability-config.js +1 -1
  78. package/lib/utilities/observability/lambda-observability-property-injector.js +1 -1
  79. package/lib/utilities/observability/log-group-data-protection-utils.js +1 -1
  80. package/lib/utilities/observability/powertools-config.d.ts +10 -1
  81. package/lib/utilities/observability/powertools-config.js +19 -3
  82. package/lib/utilities/observability/state-machine-observability-property-injector.js +1 -1
  83. package/lib/utilities/test-utils.d.ts +43 -0
  84. package/lib/utilities/test-utils.js +56 -0
  85. package/lib/utilities/tests/data-loader-nag.test.js +3 -2
  86. package/lib/utilities/tests/data-loader.test.js +3 -2
  87. package/lib/webapp/frontend-construct.js +1 -1
  88. package/lib/webapp/tests/frontend-construct-nag.test.js +3 -2
  89. package/lib/webapp/tests/frontend-construct.test.js +3 -2
  90. package/package.json +6 -5
  91. package/lib/document-processing/resources/default-error-handler/index.js +0 -46
  92. package/lib/document-processing/resources/default-pdf-processor/index.js +0 -46
  93. package/lib/document-processing/resources/default-pdf-validator/index.js +0 -36
package/lib/document-processing/resources/aggregation/test_handler.py
@@ -0,0 +1,362 @@
+"""
+Unit tests for aggregation Lambda handler
+"""
+
+import unittest
+from unittest.mock import MagicMock
+from handler import (
+    handler,
+    aggregate_classifications,
+    deduplicate_entities,
+    calculate_chunks_summary
+)
+
+
+class TestAggregationHandler(unittest.TestCase):
+    """Test cases for the main handler function"""
+
+    def test_handler_success(self):
+        """Test successful aggregation"""
+        event = {
+            'documentId': 'doc-123',
+            'chunkResults': [
+                {
+                    'chunkIndex': 0,
+                    'classificationResult': {
+                        'documentClassification': 'INVOICE'
+                    },
+                    'processingResult': {
+                        'entities': [
+                            {'type': 'AMOUNT', 'value': '100.00'},
+                            {'type': 'DATE', 'value': '2024-01-01', 'page': 1}
+                        ]
+                    }
+                },
+                {
+                    'chunkIndex': 1,
+                    'classificationResult': {
+                        'documentClassification': 'INVOICE'
+                    },
+                    'processingResult': {
+                        'entities': [
+                            {'type': 'AMOUNT', 'value': '100.00'},  # Duplicate without page
+                            {'type': 'VENDOR', 'value': 'Acme Corp'}
+                        ]
+                    }
+                }
+            ]
+        }
+
+        context = MagicMock()
+        result = handler(event, context)
+
+        self.assertEqual(result['documentId'], 'doc-123')
+        self.assertEqual(result['classification'], 'INVOICE')
+        self.assertEqual(result['classificationConfidence'], 1.0)
+        self.assertEqual(len(result['entities']), 3)  # Deduplicated AMOUNT
+        self.assertFalse(result['partialResult'])
+        self.assertEqual(result['chunksSummary']['totalChunks'], 2)
+        self.assertEqual(result['chunksSummary']['successfulChunks'], 2)
+        self.assertEqual(result['chunksSummary']['failedChunks'], 0)
+
+    def test_handler_missing_document_id(self):
+        """Test handler with missing documentId"""
+        event = {
+            'chunkResults': []
+        }
+
+        context = MagicMock()
+
+        with self.assertRaises(ValueError) as cm:
+            handler(event, context)
+
+        self.assertIn('documentId', str(cm.exception))
+
+    def test_handler_missing_chunk_results(self):
+        """Test handler with missing chunkResults"""
+        event = {
+            'documentId': 'doc-123'
+        }
+
+        context = MagicMock()
+
+        with self.assertRaises(ValueError) as cm:
+            handler(event, context)
+
+        self.assertIn('chunkResults', str(cm.exception))
+
+    def test_handler_partial_result(self):
+        """Test handler with partial results (< 50% success)"""
+        event = {
+            'documentId': 'doc-123',
+            'chunkResults': [
+                {
+                    'chunkIndex': 0,
+                    'classificationResult': {
+                        'documentClassification': 'INVOICE'
+                    },
+                    'processingResult': {
+                        'entities': []
+                    }
+                },
+                {
+                    'chunkIndex': 1,
+                    'error': 'Processing failed'
+                },
+                {
+                    'chunkIndex': 2,
+                    'error': 'Processing failed'
+                }
+            ]
+        }
+
+        context = MagicMock()
+        result = handler(event, context)
+
+        self.assertTrue(result['partialResult'])
+        self.assertEqual(result['chunksSummary']['successfulChunks'], 1)
+        self.assertEqual(result['chunksSummary']['failedChunks'], 2)
+
+
+class TestAggregateClassifications(unittest.TestCase):
+    """Test cases for classification aggregation"""
+
+    def test_majority_voting_clear_majority(self):
+        """Test majority voting with clear majority"""
+        chunk_results = [
+            {'classificationResult': {'documentClassification': 'INVOICE'}},
+            {'classificationResult': {'documentClassification': 'INVOICE'}},
+            {'classificationResult': {'documentClassification': 'RECEIPT'}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        self.assertEqual(classification, 'INVOICE')
+        self.assertAlmostEqual(confidence, 2/3, places=2)
+
+    def test_majority_voting_tie_alphabetical(self):
+        """Test majority voting with tie (alphabetical selection)"""
+        chunk_results = [
+            {'classificationResult': {'documentClassification': 'RECEIPT'}},
+            {'classificationResult': {'documentClassification': 'INVOICE'}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        # Should select 'INVOICE' (alphabetically first)
+        self.assertEqual(classification, 'INVOICE')
+        self.assertAlmostEqual(confidence, 0.5, places=2)
+
+    def test_no_classifications(self):
+        """Test with no classification results"""
+        chunk_results = [
+            {'error': 'Failed'},
+            {'processingResult': {'entities': []}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        self.assertIsNone(classification)
+        self.assertEqual(confidence, 0.0)
+
+    def test_skip_failed_chunks(self):
+        """Test that failed chunks are skipped"""
+        chunk_results = [
+            {'classificationResult': {'documentClassification': 'INVOICE'}},
+            {'error': 'Processing failed'},
+            {'classificationResult': {'documentClassification': 'INVOICE'}}
+        ]
+
+        classification, confidence = aggregate_classifications(chunk_results)
+
+        self.assertEqual(classification, 'INVOICE')
+        self.assertEqual(confidence, 1.0)  # 2/2 successful chunks
+
+
+class TestDeduplicateEntities(unittest.TestCase):
+    """Test cases for entity deduplication"""
+
+    def test_deduplicate_exact_duplicates(self):
+        """Test deduplication of exact duplicates without page numbers"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'},
+                        {'type': 'VENDOR', 'value': 'Acme Corp'}
+                    ]
+                }
+            },
+            {
+                'chunkIndex': 1,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'},  # Duplicate
+                        {'type': 'DATE', 'value': '2024-01-01'}
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Should have 3 unique entities (AMOUNT deduplicated)
+        self.assertEqual(len(entities), 3)
+
+        # Check that AMOUNT appears only once
+        amount_entities = [e for e in entities if e['type'] == 'AMOUNT']
+        self.assertEqual(len(amount_entities), 1)
+
+    def test_preserve_entities_with_page_numbers(self):
+        """Test that entities with page numbers are preserved"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'NAME', 'value': 'John Doe', 'page': 1},
+                        {'type': 'NAME', 'value': 'John Doe', 'page': 2}
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Both entities should be preserved
+        self.assertEqual(len(entities), 2)
+        self.assertEqual(entities[0]['page'], 1)
+        self.assertEqual(entities[1]['page'], 2)
+
+    def test_sort_by_chunk_and_page(self):
+        """Test that entities are sorted by chunk index and page number"""
+        chunk_results = [
+            {
+                'chunkIndex': 1,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'NAME', 'value': 'Jane Doe', 'page': 5}
+                    ]
+                }
+            },
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'NAME', 'value': 'John Doe', 'page': 2}
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Should be sorted by chunk index first
+        self.assertEqual(entities[0]['chunkIndex'], 0)
+        self.assertEqual(entities[1]['chunkIndex'], 1)
+
+    def test_skip_failed_chunks(self):
+        """Test that failed chunks are skipped"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'}
+                    ]
+                }
+            },
+            {
+                'chunkIndex': 1,
+                'error': 'Processing failed'
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        self.assertEqual(len(entities), 1)
+        self.assertEqual(entities[0]['type'], 'AMOUNT')
+
+    def test_skip_invalid_entities(self):
+        """Test that entities without type or value are skipped"""
+        chunk_results = [
+            {
+                'chunkIndex': 0,
+                'processingResult': {
+                    'entities': [
+                        {'type': 'AMOUNT', 'value': '100.00'},
+                        {'type': 'VENDOR'},  # Missing value
+                        {'value': 'Acme Corp'},  # Missing type
+                        {}  # Missing both
+                    ]
+                }
+            }
+        ]
+
+        entities = deduplicate_entities(chunk_results)
+
+        # Only the valid entity should be included
+        self.assertEqual(len(entities), 1)
+        self.assertEqual(entities[0]['type'], 'AMOUNT')
+
+
+class TestCalculateChunksSummary(unittest.TestCase):
+    """Test cases for chunks summary calculation"""
+
+    def test_all_successful(self):
+        """Test summary with all successful chunks"""
+        chunk_results = [
+            {'chunkIndex': 0, 'classificationResult': {}},
+            {'chunkIndex': 1, 'classificationResult': {}}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+
+        self.assertEqual(summary['totalChunks'], 2)
+        self.assertEqual(summary['successfulChunks'], 2)
+        self.assertEqual(summary['failedChunks'], 0)
+
+    def test_all_failed(self):
+        """Test summary with all failed chunks"""
+        chunk_results = [
+            {'chunkIndex': 0, 'error': 'Failed'},
+            {'chunkIndex': 1, 'error': 'Failed'}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+
+        self.assertEqual(summary['totalChunks'], 2)
+        self.assertEqual(summary['successfulChunks'], 0)
+        self.assertEqual(summary['failedChunks'], 2)
+
+    def test_mixed_results(self):
+        """Test summary with mixed success and failure"""
+        chunk_results = [
+            {'chunkIndex': 0, 'classificationResult': {}},
+            {'chunkIndex': 1, 'error': 'Failed'},
+            {'chunkIndex': 2, 'classificationResult': {}}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+
+        self.assertEqual(summary['totalChunks'], 3)
+        self.assertEqual(summary['successfulChunks'], 2)
+        self.assertEqual(summary['failedChunks'], 1)
+
+    def test_50_percent_threshold(self):
+        """Test partial results with 50% failure threshold"""
+        chunk_results = [
+            {'chunkIndex': 0, 'classificationResult': {}},
+            {'chunkIndex': 1, 'error': 'Failed'}
+        ]
+
+        summary = calculate_chunks_summary(chunk_results)
+        success_rate = summary['successfulChunks'] / summary['totalChunks']
+
+        # Exactly 50% success rate
+        self.assertEqual(success_rate, 0.5)
+
+
+if __name__ == '__main__':
+    unittest.main()
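Note: the aggregation handler.py these tests import is part of this release (+567 lines, file 17 in the list above) but its body is not reproduced in this excerpt. As orientation only, here is a minimal Python sketch of two of the imported helpers, inferred from the assertions above rather than taken from the published source; treat the exact logic and edge-case behavior as assumptions.

# Hypothetical reconstruction inferred from the tests above --
# NOT the published aggregation/handler.py implementation.
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple


def aggregate_classifications(
    chunk_results: List[Dict[str, Any]]
) -> Tuple[Optional[str], float]:
    """Majority-vote the per-chunk classifications, skipping failed chunks."""
    votes = [
        r['classificationResult']['documentClassification']
        for r in chunk_results
        if 'error' not in r and 'classificationResult' in r
    ]
    if not votes:
        return None, 0.0
    counts = Counter(votes)
    top = max(counts.values())
    # Ties break alphabetically, per test_majority_voting_tie_alphabetical.
    winner = sorted(label for label, n in counts.items() if n == top)[0]
    return winner, counts[winner] / len(votes)


def deduplicate_entities(chunk_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Merge entities across chunks; exact (type, value) duplicates without
    a page number collapse, while page-tagged entities are all preserved."""
    seen = set()
    merged = []
    for result in chunk_results:
        if 'error' in result:
            continue  # skip failed chunks
        for entity in result.get('processingResult', {}).get('entities', []):
            if 'type' not in entity or 'value' not in entity:
                continue  # skip invalid entities
            key = (entity['type'], entity['value'], entity.get('page'))
            if key in seen:
                continue
            seen.add(key)
            merged.append({**entity, 'chunkIndex': result.get('chunkIndex', 0)})
    # Sort by chunk index, then page, per test_sort_by_chunk_and_page.
    return sorted(merged, key=lambda e: (e['chunkIndex'], e.get('page', 0)))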
package/lib/document-processing/resources/cleanup/handler.py
@@ -0,0 +1,276 @@
+"""
+Cleanup Lambda Handler
+
+This Lambda function removes temporary chunk files from S3 after successful aggregation.
+It uses batch delete for efficiency and handles errors gracefully without failing the workflow.
+
+Requirements: 8.4, 7.5
+"""
+
+import json
+import logging
+import os
+import sys
+from typing import Any, Dict, List
+
+import boto3
+from botocore.exceptions import ClientError
+
+# Try to import structured logging from pdf-chunking module
+# Fall back to standard logging if not available
+try:
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'pdf-chunking'))
+    from structured_logging import (
+        get_logger,
+        log_chunking_operation,
+        with_correlation_id,
+        is_observability_enabled
+    )
+    structured_logger = get_logger(__name__)
+    USE_STRUCTURED_LOGGING = True
+except ImportError:
+    USE_STRUCTURED_LOGGING = False
+    structured_logger = None
+
+# Configure standard logging as fallback
+logger = logging.getLogger()
+logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
+
+# Initialize S3 client
+s3_client = boto3.client('s3')
+
+
+def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    Lambda handler for cleaning up temporary chunk files from S3.
+
+    Args:
+        event: Lambda event payload containing:
+            - documentId: Document identifier
+            - chunks: Array of chunk metadata with bucket and key information
+        context: Lambda context object
+
+    Returns:
+        CleanupResponse with deletedChunks count and errors array
+    """
+    # Set up structured logging context if available
+    if USE_STRUCTURED_LOGGING and structured_logger:
+        correlation_id = event.get('correlationId')
+        structured_logger.set_correlation_id(correlation_id)
+
+    try:
+        # Parse event
+        document_id = event.get('documentId')
+        chunks = event.get('chunks', [])
+
+        if not document_id:
+            error_msg = "Missing required field: documentId"
+            if USE_STRUCTURED_LOGGING and structured_logger:
+                structured_logger.error(error_msg, extra={'event': 'cleanup_error'})
+            else:
+                logger.error(error_msg)
+            return {
+                'documentId': None,
+                'deletedChunks': 0,
+                'errors': [error_msg]
+            }
+
+        # Set document context for structured logging
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.set_document_context(document_id=document_id)
+
+        if not chunks:
+            if USE_STRUCTURED_LOGGING and structured_logger:
+                structured_logger.info(
+                    f"No chunks to clean up for document {document_id}",
+                    extra={'event': 'cleanup_skipped', 'reason': 'no_chunks'}
+                )
+            else:
+                logger.info(f"No chunks to clean up for document {document_id}")
+            return {
+                'documentId': document_id,
+                'deletedChunks': 0,
+                'errors': []
+            }
+
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.info(
+                f"Starting cleanup for document {document_id}",
+                extra={
+                    'event': 'cleanup_started',
+                    'chunkCount': len(chunks)
+                }
+            )
+        else:
+            logger.info(f"Starting cleanup for document {document_id} with {len(chunks)} chunks")
+
+        # Extract S3 keys for all chunks
+        chunk_keys = []
+        for chunk in chunks:
+            bucket = chunk.get('bucket')
+            key = chunk.get('key')
+
+            if bucket and key:
+                chunk_keys.append({
+                    'bucket': bucket,
+                    'key': key,
+                    'chunkId': chunk.get('chunkId', 'unknown')
+                })
+            else:
+                if USE_STRUCTURED_LOGGING and structured_logger:
+                    structured_logger.warning(
+                        f"Chunk missing bucket or key",
+                        extra={'chunk': chunk, 'event': 'invalid_chunk'}
+                    )
+                else:
+                    logger.warning(f"Chunk missing bucket or key: {chunk}")
+
+        if not chunk_keys:
+            if USE_STRUCTURED_LOGGING and structured_logger:
+                structured_logger.warning(
+                    f"No valid chunk keys found for document {document_id}",
+                    extra={'event': 'cleanup_skipped', 'reason': 'no_valid_keys'}
+                )
+            else:
+                logger.warning(f"No valid chunk keys found for document {document_id}")
+            return {
+                'documentId': document_id,
+                'deletedChunks': 0,
+                'errors': ['No valid chunk keys found']
+            }
+
+        # Delete chunks using batch delete
+        deleted_count, errors = delete_chunks_batch(chunk_keys, document_id)
+
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.info(
+                f"Cleanup completed for document {document_id}",
+                extra={
+                    'event': 'cleanup_completed',
+                    'deletedChunks': deleted_count,
+                    'totalChunks': len(chunk_keys),
+                    'errorCount': len(errors)
+                }
+            )
+        else:
+            logger.info(
+                f"Cleanup completed for document {document_id}: "
+                f"deleted {deleted_count}/{len(chunk_keys)} chunks, "
+                f"{len(errors)} errors"
+            )
+
+        return {
+            'documentId': document_id,
+            'deletedChunks': deleted_count,
+            'errors': errors
+        }
+
+    except Exception as e:
+        if USE_STRUCTURED_LOGGING and structured_logger:
+            structured_logger.error(
+                f"Unexpected error during cleanup: {str(e)}",
+                extra={'event': 'cleanup_error', 'errorType': type(e).__name__},
+                exc_info=True
+            )
+        else:
+            logger.error(f"Unexpected error during cleanup: {str(e)}", exc_info=True)
+        return {
+            'documentId': event.get('documentId'),
+            'deletedChunks': 0,
+            'errors': [f"Unexpected error: {str(e)}"]
+        }
+
+
+def delete_chunks_batch(
+    chunk_keys: List[Dict[str, str]],
+    document_id: str
+) -> tuple[int, List[str]]:
+    """
+    Delete chunks from S3 using batch delete operations.
+
+    S3 batch delete supports up to 1000 objects per request.
+    This function groups chunks by bucket and processes them in batches.
+
+    Args:
+        chunk_keys: List of dicts with 'bucket', 'key', and 'chunkId'
+        document_id: Document identifier for logging
+
+    Returns:
+        Tuple of (deleted_count, errors_list)
+    """
+    deleted_count = 0
+    errors = []
+
+    # Group chunks by bucket
+    chunks_by_bucket: Dict[str, List[Dict[str, str]]] = {}
+    for chunk in chunk_keys:
+        bucket = chunk['bucket']
+        if bucket not in chunks_by_bucket:
+            chunks_by_bucket[bucket] = []
+        chunks_by_bucket[bucket].append(chunk)
+
+    # Process each bucket
+    for bucket, chunks in chunks_by_bucket.items():
+        logger.info(f"Deleting {len(chunks)} chunks from bucket {bucket}")
+
+        # Process in batches of 1000 (S3 limit)
+        batch_size = 1000
+        for i in range(0, len(chunks), batch_size):
+            batch = chunks[i:i + batch_size]
+
+            # Prepare delete request
+            objects_to_delete = [{'Key': chunk['key']} for chunk in batch]
+
+            try:
+                response = s3_client.delete_objects(
+                    Bucket=bucket,
+                    Delete={'Objects': objects_to_delete}
+                )
+
+                # Count successful deletions
+                deleted = response.get('Deleted', [])
+                deleted_count += len(deleted)
+
+                # Log any errors from S3
+                s3_errors = response.get('Errors', [])
+                for error in s3_errors:
+                    error_key = error.get('Key', 'unknown')
+                    error_code = error.get('Code', 'unknown')
+                    error_message = error.get('Message', 'unknown')
+
+                    error_msg = (
+                        f"Failed to delete {error_key}: "
+                        f"{error_code} - {error_message}"
+                    )
+                    logger.warning(error_msg)
+                    errors.append(error_msg)
+
+                logger.info(
+                    f"Batch delete completed: {len(deleted)} deleted, "
+                    f"{len(s3_errors)} errors"
+                )
+
+            except ClientError as e:
+                error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+                error_message = e.response.get('Error', {}).get('Message', str(e))
+
+                error_msg = (
+                    f"S3 batch delete failed for bucket {bucket}: "
+                    f"{error_code} - {error_message}"
+                )
+                logger.error(error_msg)
+                errors.append(error_msg)
+
+                # Log which chunks failed
+                for chunk in batch:
+                    logger.error(
+                        f"Failed to delete chunk {chunk['chunkId']} "
+                        f"at s3://{bucket}/{chunk['key']}"
+                    )
+
+            except Exception as e:
+                error_msg = f"Unexpected error during batch delete: {str(e)}"
+                logger.error(error_msg, exc_info=True)
+                errors.append(error_msg)
+
+    return deleted_count, errors
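For a quick local smoke test, the cleanup handler can be invoked directly with an event of the shape it parses above. The bucket and keys below are placeholders, and the call issues a real S3 DeleteObjects request, so it needs AWS credentials and an existing bucket (or a stubbed client, e.g. via moto); this invocation pattern is an assumption, not documented usage.

# Hypothetical local invocation; bucket and keys are placeholders.
from handler import handler

event = {
    'documentId': 'doc-123',
    'correlationId': 'corr-abc',  # optional; only read when structured logging is importable
    'chunks': [
        {'chunkId': 'chunk-0', 'bucket': 'my-processing-bucket', 'key': 'chunks/doc-123/chunk-0.pdf'},
        {'chunkId': 'chunk-1', 'bucket': 'my-processing-bucket', 'key': 'chunks/doc-123/chunk-1.pdf'},
    ],
}

result = handler(event, None)  # the handler never dereferences the Lambda context
print(result)  # expected shape: {'documentId': 'doc-123', 'deletedChunks': 2, 'errors': []}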
package/lib/document-processing/resources/cleanup/requirements.txt
@@ -0,0 +1,5 @@
+# Cleanup Lambda Dependencies
+# Requirements: 8.4
+
+# AWS SDK for Python (S3 operations)
+boto3>=1.26.0