PyPI - document-analyzer - Versions diffs - 0.1.0__py3-none-any.whl - Mend

document-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

document_analyzer/__init__.py +31 -0
document_analyzer/__main__.py +5 -0
document_analyzer/analyzers/__init__.py +14 -0
document_analyzer/analyzers/cedula_analyzer.py +412 -0
document_analyzer/analyzers/document_analyzer.py +187 -0
document_analyzer/analyzers/passport_analyzer.py +294 -0
document_analyzer/cli.py +401 -0
document_analyzer/config/__init__.py +30 -0
document_analyzer/config/constants.py +230 -0
document_analyzer/config/logger.py +36 -0
document_analyzer/services/__init__.py +3 -0
document_analyzer/services/paddleocr_service.py +107 -0
document_analyzer/startup.py +24 -0
document_analyzer/utils/__init__.py +57 -0
document_analyzer/utils/cedula_utils.py +155 -0
document_analyzer/utils/common_utils.py +229 -0
document_analyzer/utils/extract_cedula_signature.py +431 -0
document_analyzer/utils/passport_language_detector.py +277 -0
document_analyzer/utils/passport_utils.py +260 -0
document_analyzer-0.1.0.dist-info/METADATA +520 -0
document_analyzer-0.1.0.dist-info/RECORD +25 -0
document_analyzer-0.1.0.dist-info/WHEEL +5 -0
document_analyzer-0.1.0.dist-info/entry_points.txt +2 -0
document_analyzer-0.1.0.dist-info/licenses/LICENSE +201 -0
document_analyzer-0.1.0.dist-info/top_level.txt +1 -0

document_analyzer/analyzers/passport_analyzer.py ADDED Viewed

@@ -0,0 +1,294 @@
+import cv2
+import time
+import numpy as np
+from ..config import DocumentAnalyzerLoggerAdapter, logger
+from ..utils import (
+    ensure_bytesio,
+    preprocess_image,
+    extract_mrz_data,
+    extract_place_of_birth,
+    extract_data_with_boxes,
+    detect_passport_language,
+)
+# Banner messages logged by PassportAnalyzer.analyze_passport() to delimit a run:
+# START_MSG on entry, END_MSG after the passport fields have been parsed, and
+# ERROR_END_MSG when an exception is caught.
+START_MSG = "======= PassportAnalyzer Started ======="
+END_MSG = "======= PassportAnalyzer Ended ======="
+ERROR_END_MSG = "======= PassportAnalyzer Ended With Error ======="
+class PassportAnalyzer:
+    """A comprehensive analyzer for passport documents.
+    This class provides complete functionality for analyzing passports including
+    OCR text extraction, MRZ parsing, and field extraction.
+    The analyzer can process various image formats and extract key information:
+    - Personal details (dates, places, nationality)
+    - Document identifiers (passport numbers, expiry dates)
+    - MRZ data parsing
+    Attributes:
+        user_email (str): Optional user email for logging context.
+        logger (DocumentAnalyzerLoggerAdapter): Custom logger with user context.
+        passport_np (np.ndarray): Image data as OpenCV-compatible numpy array.
+        ocr (PaddleOCR): OCR engine instance configured for detected language.
+    Examples:
+        >>> analyzer = PassportAnalyzer("passport_image.jpg", "user@example.com")
+        >>> results = analyzer.analyze_passport()
+        >>> print(results['passport_info']['id_number'])
+    """
+    def __init__(
+        self,
+        passport_file,
+        user_email=None,
+        ocr_instance=None,
+        lang_detector_instance=None,
+        normalize_input=True,
+        preprocess_image=True,
+    ):
+        """Initialize the PassportAnalyzer with an image file.
+        Args:
+            passport_file: Input passport image in various formats:
+                - File path (str)
+                - File-like object (Django upload, etc.)
+                - BytesIO object
+            user_email (str, optional): User email for logging context.
+            ocr_instance (PaddleOCR, optional): Pre-initialized PaddleOCR instance.
+                If not provided, language will be detected and appropriate model used.
+            lang_detector_instance: Language detector instance for passport language detection.
+            normalize_input (bool): Whether to normalize input.
+            preprocess_image (bool): Whether to preprocess image.
+        Raises:
+            ValueError: If image cannot be decoded or is corrupted.
+            IOError: If file path cannot be read.
+        """
+        self.start_time = time.time()
+        # Custom logger adapter
+        self.logger = DocumentAnalyzerLoggerAdapter(logger, {"user_email": user_email})
+        try:
+            # Convert to BytesIO (if not already)
+            passport_stream = ensure_bytesio(passport_file)
+            # Read image bytes into OpenCV-compatible format (BGR)
+            self.passport_np = cv2.imdecode(
+                np.frombuffer(passport_stream.read(), np.uint8), cv2.IMREAD_COLOR
+            )
+            if self.passport_np is None:
+                raise ValueError(
+                    "Could not decode image - invalid format or corrupted file"
+                )
+        except Exception as e:
+            self.logger.error(f"Failed to load input file: {e}")
+            raise
+        # Use provided OCR instance or detect language and use appropriate model
+        if ocr_instance is not None:
+            # Explicit OCR instance takes priority
+            self.ocr = ocr_instance
+            self.logger.debug("Using provided PaddleOCR instance for passport analysis")
+        else:
+            # Always run language detection (except if explicit OCR provided)
+            passport_file.seek(0)
+            detected_lang = detect_passport_language(
+                passport_file, ocr_instance=lang_detector_instance, logger=self.logger
+            )
+            self.logger.info(f"Detected passport language: '{detected_lang}'")
+            from ..services.paddleocr_service import PaddleOCRService
+            self.ocr = PaddleOCRService.get_instance(detected_lang)
+            self.logger.debug(
+                f"Using PaddleOCR instance for language: '{detected_lang}'"
+            )
+    def parse_passport_information(self, extracted_data):
+        """Parse required passport fields from OCR extracted data.
+        Extracts and parses specific information fields from the OCR text data
+        including MRZ data (dates, nationality, passport number) and place of birth
+        using pattern matching and contextual analysis.
+        Args:
+            extracted_data (list): List of text data dictionaries from OCR
+                extraction, each containing text, bbox, confidence, and position information.
+        Returns:
+            dict: Dictionary containing parsed passport information with keys:
+                - date_of_birth (str): Birth date in DD-MMM-YYYY format
+                - place_of_birth (str): Place of birth
+                - nationality (str): Nationality (3-letter code)
+                - expiry_date (str): Document expiry date in DD-MMM-YYYY format
+                - passport_number (str): Passport number
+        Note:
+            - Dates are converted from MRZ format (YYMMDD) to DD-MMM-YYYY
+            - MRZ data is parsed from the machine-readable zone at bottom of passport
+            - Place of birth is extracted from text fields using indicators
+        Examples:
+            >>> extracted = analyzer.extract_data_with_boxes(image)
+            >>> info = analyzer.parse_passport_information(extracted)
+            >>> print(info['passport_number'])  # e.g., "AB1234567"
+            >>> print(info['date_of_birth'])    # e.g., "15-MAR-1985"
+        """
+        self.logger.debug("Starting passport information parsing")
+        # Extract MRZ data (passport number, nationality, dates)
+        mrz_data = extract_mrz_data(extracted_data, logger=self.logger)
+        # Extract place of birth from text fields
+        place_of_birth = extract_place_of_birth(extracted_data, logger=self.logger)
+        passport_info = {
+            "date_of_birth": mrz_data.get("date_of_birth", ""),
+            "place_of_birth": place_of_birth,
+            "nationality": mrz_data.get("nationality", ""),
+            "expiry_date": mrz_data.get("expiry_date", ""),
+            "passport_number": mrz_data.get("passport_number", ""),
+        }
+        self.logger.debug(f"Parsed passport info: {passport_info}")
+        return passport_info
+    def analyze_passport(self):
+        """Main function to analyze a passport image.
+        Orchestrates the complete analysis pipeline including image preprocessing,
+        OCR text extraction, MRZ parsing, information extraction, and result compilation.
+        This is the primary entry point for passport analysis.
+        Returns:
+            dict: Complete analysis results containing:
+                - success (str): Analysis status - "passport_info" or "none"
+                - passport_info (dict): Parsed document information
+                - signature (None): Always None for passports
+                - raw_extracted_data (list): OCR results for debugging
+                - error (str): Error message if analysis fails
+        Note:
+            Success status indicates what information was successfully extracted:
+                - "passport_info": All or most required fields extracted
+                - "none": Could not extract sufficient information
+        Raises:
+            Exception: Caught internally and returned in error field of result dict.
+        Examples:
+            >>> analyzer = PassportAnalyzer("passport.jpg")
+            >>> result = analyzer.analyze_passport()
+            >>> if result['success'] == 'passport_info':
+            ...     print("Analysis successful")
+            ...     print(f"Passport: {result['passport_info']['passport_number']}")
+            ...     print(f"DOB: {result['passport_info']['dob']}")
+            ...     print(f"POB: {result['passport_info']['pob']}")
+        """
+        try:
+            self.logger.info(START_MSG)
+            # Load image if path is provided
+            image_path_or_array = self.passport_np
+            if isinstance(image_path_or_array, str):
+                self.logger.debug(f"Loading image from path: '{image_path_or_array}'")
+                image = cv2.imread(image_path_or_array)
+                if image is None:
+                    self.logger.error(
+                        f"Couldn't load image from '{image_path_or_array}'"
+                    )
+                    raise ValueError(
+                        f"Couldn't load image from '{image_path_or_array}'"
+                    )
+            else:
+                self.logger.debug("Using provided image array")
+                image = image_path_or_array.copy()
+            # Preprocess the image
+            processed_image = preprocess_image(image, logger=self.logger)
+            # Extract data with bounding boxes
+            extracted_data = extract_data_with_boxes(
+                processed_image, ocr=self.ocr, logger=self.logger
+            )
+            if not extracted_data:
+                self.logger.warning("Couldn't extract data from the passport image")
+                return {
+                    "success": "none",
+                    "passport_info": {},
+                    "signature": None,
+                    "raw_extracted_data": [],
+                }
+            self.logger.info(f"Extracted {len(extracted_data)} data boxes")
+            # Parse passport information
+            raw_passport_info = self.parse_passport_information(extracted_data)
+            # Convert to desired field names (matching cedula format)
+            passport_info = {
+                "type": "passport",
+                "dob": raw_passport_info.get("date_of_birth", ""),
+                "pob": raw_passport_info.get("place_of_birth", ""),
+                "nationality": raw_passport_info.get("nationality", ""),
+                "expiry": raw_passport_info.get("expiry_date", ""),
+                "id_number": raw_passport_info.get("passport_number", ""),
+            }
+            success_status = "passport_info"
+            self.logger.info(
+                f"Success: '{success_status.capitalize()}' | Date of Birth: '{passport_info['dob']}' "
+                f"| Place of Birth: '{passport_info['pob']}' | Nationality: '{passport_info['nationality']}' "
+                f"| Expiry: '{passport_info['expiry']}' | Passport Number: '{passport_info['id_number']}'"
+            )
+            end_time = time.time()
+            elapsed_time = end_time - self.start_time
+            self.logger.info(f"Passport analysis took: {elapsed_time:.2f} seconds")
+            self.logger.info(END_MSG)
+            return {
+                "success": success_status,
+                "passport_info": passport_info,
+                "signature": None,
+                "raw_extracted_data": extracted_data,  # For debugging
+            }
+        except Exception as e:
+            self.logger.error(f"Error in PassportAnalyzer: {str(e)}")
+            self.logger.info(ERROR_END_MSG)
+            return {
+                "success": "none",
+                "passport_info": {},
+                "signature": None,
+                "raw_extracted_data": [],
+            }
+# Convenience function for easy import and use
+def analyze_passport(
+    passport_file, user_email=None, ocr_instance=None, lang_detector_instance=True
+):
+    """Convenience function for passport analysis using PassportAnalyzer.
+    Args:
+        passport_file: Input passport image.
+        user_email (str, optional): User email for logging.
+        ocr_instance (PaddleOCR, optional): Pre-initialized OCR instance.
+        lang_detector_instance: Language detector instance.
+    Returns:
+        dict: Analysis results.
+    """
+    analyzer = PassportAnalyzer(
+        passport_file, user_email, ocr_instance, lang_detector_instance
+    )
+    return analyzer.analyze_passport()

document_analyzer/cli.py ADDED Viewed

@@ -0,0 +1,401 @@
+import sys
+import json
+import logging
+import argparse
+from pathlib import Path
+from typing import Optional, Dict, Any
+from importlib.metadata import version, PackageNotFoundError
+from .analyzers import DocumentAnalyzer, CedulaAnalyzer, PassportAnalyzer
+from .config import logger as project_logger
+# Resolve the installed distribution's version for the --version flag; fall back
+# to "unknown" when the package metadata cannot be found.
+try:
+    __version__ = version("document-analyzer")
+except PackageNotFoundError:
+    __version__ = "unknown"
+# File suffixes accepted by validate_input_file(); the suffix is lower-cased
+# before the membership test.
+# NOTE(review): ".pdf" and ".gif" are listed, but PassportAnalyzer decodes its
+# input with cv2.imdecode - confirm these formats are decodable downstream.
+SUPPORTED_FORMATS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".gif", ".pdf"}
+class CLIError(Exception):
+    """Custom exception for CLI errors."""
+    def __init__(self, message: str, exit_code: int = 1):
+        self.message = message
+        self.exit_code = exit_code
+        super().__init__(self.message)
+def validate_input_file(file_path: str) -> Path:
+    """
+    Validate that input file exists, is readable, and has supported format.
+    Args:
+        file_path: Path to the input file
+    Returns:
+        Path object if valid
+    Raises:
+        CLIError: If file is invalid or unsupported
+    """
+    try:
+        path = Path(file_path)
+        # Check if file exists
+        if not path.exists():
+            raise CLIError(f"Error: File not found: {file_path}", exit_code=1)
+        # Check if it's a file (not a directory)
+        if not path.is_file():
+            raise CLIError(f"Error: Path is not a file: {file_path}", exit_code=1)
+        # Check if file is empty
+        if path.stat().st_size == 0:
+            raise CLIError(f"Error: File is empty: {file_path}", exit_code=1)
+        # Check file format
+        if path.suffix.lower() not in SUPPORTED_FORMATS:
+            raise CLIError(
+                f"Error: Unsupported file format '{path.suffix}'. "
+                f"Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}",
+                exit_code=1,
+            )
+        # Check if file is readable
+        try:
+            with open(path, "rb") as f:
+                f.read(1)
+        except PermissionError:
+            raise CLIError(
+                f"Error: Permission denied reading file: {file_path}", exit_code=1
+            )
+        return path
+    except CLIError:
+        raise
+    except Exception as e:
+        raise CLIError(f"Error: Failed to validate file: {str(e)}", exit_code=1)
+def validate_output_path(output_path: str) -> Path:
+    """
+    Validate that output path directory is writable.
+    Args:
+        output_path: Path to the output file
+    Returns:
+        Path object if valid
+    Raises:
+        CLIError: If output directory is not writable
+    """
+    try:
+        path = Path(output_path)
+        output_dir = path.parent
+        # Create parent directories if they don't exist
+        if not output_dir.exists():
+            try:
+                output_dir.mkdir(parents=True, exist_ok=True)
+            except PermissionError:
+                raise CLIError(
+                    f"Error: Permission denied creating directory: {output_dir}",
+                    exit_code=1,
+                )
+        # Check if directory is writable by attempting to write a test file
+        test_file = output_dir / ".write_test"
+        try:
+            test_file.touch()
+            test_file.unlink()
+        except PermissionError:
+            raise CLIError(
+                f"Error: Output directory is not writable: {output_dir}", exit_code=1
+            )
+        return path
+    except CLIError:
+        raise
+    except Exception as e:
+        raise CLIError(f"Error: Failed to validate output path: {str(e)}", exit_code=1)
+def detect_and_analyze(
+    file_path: Path, user_email: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Auto-detect document type and analyze it.
+    Args:
+        file_path: Path to the document file
+        user_email: Optional user email for logging
+    Returns:
+        Dictionary with analysis results
+    Raises:
+        CLIError: If detection or analysis fails
+    """
+    try:
+        project_logger.debug(f"Attempting auto-detection on: {file_path}")
+        # Use DocumentAnalyzer for auto-detection
+        analyzer = DocumentAnalyzer(str(file_path), user_email=user_email)
+        doc_type = analyzer.detect_document_type()
+        project_logger.debug(f"Detected document type: {doc_type}")
+        if doc_type == "unknown":
+            raise CLIError(
+                "Error: Could not determine document type. "
+                "Please specify --type (cedula or passport).",
+                exit_code=1,
+            )
+        # Now analyze with the appropriate analyzer
+        if doc_type == "cedula":
+            cedula_analyzer = CedulaAnalyzer(str(file_path), user_email=user_email)
+            result = cedula_analyzer.analyze_cedula()
+            result["document_type"] = "cedula"
+            return result
+        elif doc_type == "passport":
+            passport_analyzer = PassportAnalyzer(str(file_path), user_email=user_email)
+            result = passport_analyzer.analyze_passport()
+            result["document_type"] = "passport"
+            return result
+        else:
+            raise CLIError("Error: Unknown document type after detection.", exit_code=2)
+    except CLIError:
+        raise
+    except ValueError as e:
+        # Invalid image format or corrupted file
+        raise CLIError(f"Error: Invalid or corrupted image file: {str(e)}", exit_code=2)
+    except Exception as e:
+        project_logger.error(f"Auto-detection failed: {str(e)}", exc_info=True)
+        raise CLIError(f"Error: Failed to analyze document: {str(e)}", exit_code=2)
+def analyze_cedula(file_path: Path, user_email: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Analyze a cédula document.
+    Args:
+        file_path: Path to the cédula image
+        user_email: Optional user email for logging
+    Returns:
+        Dictionary with analysis results
+    Raises:
+        CLIError: If analysis fails
+    """
+    try:
+        project_logger.debug(f"Analyzing cédula: {file_path}")
+        analyzer = CedulaAnalyzer(str(file_path), user_email=user_email)
+        result = analyzer.analyze_cedula()
+        result["document_type"] = "cedula"
+        return result
+    except ValueError as e:
+        raise CLIError(f"Error: Invalid or corrupted image file: {str(e)}", exit_code=2)
+    except Exception as e:
+        project_logger.error(f"Cédula analysis failed: {str(e)}", exc_info=True)
+        raise CLIError(f"Error: Failed to analyze cédula: {str(e)}", exit_code=2)
+def analyze_passport(
+    file_path: Path, user_email: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Analyze a passport document.
+    Args:
+        file_path: Path to the passport image
+        user_email: Optional user email for logging
+    Returns:
+        Dictionary with analysis results
+    Raises:
+        CLIError: If analysis fails
+    """
+    try:
+        project_logger.debug(f"Analyzing passport: {file_path}")
+        analyzer = PassportAnalyzer(str(file_path), user_email=user_email)
+        result = analyzer.analyze_passport()
+        result["document_type"] = "passport"
+        return result
+    except ValueError as e:
+        raise CLIError(f"Error: Invalid or corrupted image file: {str(e)}", exit_code=2)
+    except Exception as e:
+        project_logger.error(f"Passport analysis failed: {str(e)}", exc_info=True)
+        raise CLIError(f"Error: Failed to analyze passport: {str(e)}", exit_code=2)
+def format_result_json(result: Dict[str, Any]) -> str:
+    """Format analysis result as JSON string."""
+    return json.dumps(result, indent=4, default=str)
+def setup_logging(verbose: bool) -> None:
+    """Configure logging for the entire package.
+    With -v: Show all DEBUG logs from the document_analyzer package.
+    Without -v: Show only WARNING and ERROR logs from the document_analyzer package.
+    This is scoped to document_analyzer only to avoid capturing logs from
+    other libraries.
+    """
+    level = logging.DEBUG if verbose else logging.WARNING
+    # Get the package logger (scoped to document_analyzer)
+    package_logger = logging.getLogger("document_analyzer")
+    # Prevent duplicate handlers if setup_logging is called multiple times
+    if not package_logger.handlers:
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setLevel(level)
+        formatter = logging.Formatter("%(levelname)s: %(message)s")
+        handler.setFormatter(formatter)
+        package_logger.addHandler(handler)
+    else:
+        # Update existing handlers to use the new level
+        for handler in package_logger.handlers:
+            handler.setLevel(level)
+    package_logger.setLevel(level)
+def create_parser() -> argparse.ArgumentParser:
+    """Create and configure the argument parser."""
+    parser = argparse.ArgumentParser(
+        prog="document-analyzer",
+        description="Analyze Panamanian identity cards (cédulas) and passports using OCR.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s analyze doc.jpg
+  %(prog)s analyze cedula.jpg --type cedula
+  %(prog)s analyze doc.jpg --save result.json -v
+        """,
+    )
+    # Global options
+    parser.add_argument(
+        "--version", action="version", version=f"%(prog)s {__version__}"
+    )
+    # Subcommands
+    subparsers = parser.add_subparsers(dest="command", help="Commands")
+    # analyze command
+    analyze_cmd = subparsers.add_parser("analyze", help="Analyze a document")
+    analyze_cmd.add_argument("path", help="Path to the document image file")
+    analyze_cmd.add_argument(
+        "--type",
+        choices=["auto", "cedula", "passport"],
+        default="auto",
+        help="Document type (default: auto-detect)",
+    )
+    analyze_cmd.add_argument(
+        "--save",
+        metavar="FILE",
+        help="Save result to file instead of printing to stdout",
+    )
+    analyze_cmd.add_argument(
+        "-v", "--verbose", action="store_true", help="Enable debug-level logging"
+    )
+    return parser
+def main(argv: Optional[list] = None) -> int:
+    """
+    Main entry point for the CLI.
+    Args:
+        argv: Command-line arguments (default: sys.argv[1:])
+    Returns:
+        Exit code (0 for success, 1 for user error, 2 for processing error)
+    """
+    parser = create_parser()
+    try:
+        args = parser.parse_args(argv)
+        # Handle no command
+        if not args.command:
+            parser.print_help()
+            return 0
+        # Setup logging
+        setup_logging(args.verbose)
+        # Validate input file
+        project_logger.debug(f"Validating input file: {args.path}")
+        input_path = validate_input_file(args.path)
+        # Analyze document based on type
+        if args.type == "auto":
+            project_logger.debug("Using auto-detection for document type")
+            result = detect_and_analyze(input_path)
+        elif args.type == "cedula":
+            result = analyze_cedula(input_path)
+        elif args.type == "passport":
+            result = analyze_passport(input_path)
+        else:
+            raise CLIError(f"Unknown document type: {args.type}", exit_code=1)
+        # Format output as JSON
+        output_json = format_result_json(result)
+        # Handle output destination
+        if args.save:
+            project_logger.debug(f"Validating output path: {args.save}")
+            output_path = validate_output_path(args.save)
+            try:
+                with open(output_path, "w") as f:
+                    f.write(output_json)
+                print(f"Result saved to: {output_path}")
+                project_logger.debug(f"Result saved to: {output_path}")
+            except PermissionError:
+                raise CLIError(
+                    f"Error: Permission denied writing to: {args.save}", exit_code=1
+                )
+            except Exception as e:
+                raise CLIError(
+                    f"Error: Failed to write output file: {str(e)}", exit_code=1
+                )
+        else:
+            # Print to stdout
+            print(output_json)
+        return 0
+    except CLIError as e:
+        print(e.message, file=sys.stderr)
+        return e.exit_code
+    except KeyboardInterrupt:
+        print("\nOperation cancelled by user.", file=sys.stderr)
+        return 1
+    except Exception as e:
+        project_logger.error(f"Unexpected error: {str(e)}", exc_info=True)
+        print(f"Error: An unexpected error occurred: {str(e)}", file=sys.stderr)
+        return 2
+# Script entry point: use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())