PyPI - document-analyzer - Versions diffs - 0.1.0__py3-none-any.whl - Mend

document-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

document_analyzer/__init__.py +31 -0
document_analyzer/__main__.py +5 -0
document_analyzer/analyzers/__init__.py +14 -0
document_analyzer/analyzers/cedula_analyzer.py +412 -0
document_analyzer/analyzers/document_analyzer.py +187 -0
document_analyzer/analyzers/passport_analyzer.py +294 -0
document_analyzer/cli.py +401 -0
document_analyzer/config/__init__.py +30 -0
document_analyzer/config/constants.py +230 -0
document_analyzer/config/logger.py +36 -0
document_analyzer/services/__init__.py +3 -0
document_analyzer/services/paddleocr_service.py +107 -0
document_analyzer/startup.py +24 -0
document_analyzer/utils/__init__.py +57 -0
document_analyzer/utils/cedula_utils.py +155 -0
document_analyzer/utils/common_utils.py +229 -0
document_analyzer/utils/extract_cedula_signature.py +431 -0
document_analyzer/utils/passport_language_detector.py +277 -0
document_analyzer/utils/passport_utils.py +260 -0
document_analyzer-0.1.0.dist-info/METADATA +520 -0
document_analyzer-0.1.0.dist-info/RECORD +25 -0
document_analyzer-0.1.0.dist-info/WHEEL +5 -0
document_analyzer-0.1.0.dist-info/entry_points.txt +2 -0
document_analyzer-0.1.0.dist-info/licenses/LICENSE +201 -0
document_analyzer-0.1.0.dist-info/top_level.txt +1 -0

document_analyzer/utils/passport_language_detector.py ADDED Viewed

@@ -0,0 +1,277 @@
+import re
+import cv2
+import numpy as np
+from typing import Optional, Dict, Tuple, Union
+from .common_utils import ensure_bytesio, preprocess_image
+from ..config import (
+    logger,
+    SPANISH_COUNTRIES,
+    SPANISH_KEYWORDS,
+    COUNTRY_NAME_MAPPINGS,
+)
+class PassportLanguageDetector:
+    """
+    Passport language detector for determining optimal OCR language.
+    This class analyzes passport documents to determine whether Spanish or English
+    OCR would provide better results based on document content and country indicators.
+    Example:
+        >>> detector = PassportLanguageDetector()
+        >>> language = detector.detect_language(passport_file)
+        >>> print(f"Use {language} OCR for this passport")
+        # Or use class methods directly:
+        >>> language = PassportLanguageDetector.detect_passport_language(passport_file)
+    """
+    def __init__(self, default_confidence_threshold=3):
+        """
+        Initialize the detector.
+        Args:
+            default_confidence_threshold: Minimum score required to detect Spanish
+        """
+        self.confidence_threshold = default_confidence_threshold
+        self.logger = logger
+    def _prepare_image(self, passport_file):
+        """Convert various input types to OpenCV image."""
+        try:
+            if isinstance(passport_file, np.ndarray):
+                return passport_file.copy()
+            elif isinstance(passport_file, str):
+                # File path
+                return cv2.imread(passport_file)
+            else:
+                # File object or BytesIO
+                passport_stream = ensure_bytesio(passport_file)
+                image_data = np.frombuffer(passport_stream.read(), np.uint8)
+                return cv2.imdecode(image_data, cv2.IMREAD_COLOR)
+        except Exception as e:
+            self.logger.error(f"Error preparing image: {e}")
+            return None
+    def _get_ocr_instance(self, ocr_instance):
+        """Get or create OCR instance."""
+        if ocr_instance is not None:
+            return ocr_instance
+        from ..services.paddleocr_service import PaddleOCRService
+        return PaddleOCRService.get_instance("es")
+    def _extract_text_from_image(self, image: np.ndarray, ocr):
+        """Extract all text from image using OCR."""
+        try:
+            # Preprocess image for better OCR
+            processed_image = preprocess_image(image)
+            # Run OCR
+            results = ocr.predict(processed_image)
+            if not results or not results[0]:
+                return ""
+            # Combine all text
+            all_text = ""
+            for line in results[0]:
+                if len(line) >= 2:
+                    text = line[1][0].upper().strip()
+                    all_text += " " + text
+            return all_text.strip()
+        except Exception as e:
+            self.logger.error(f"Error extracting text: {e}")
+            return ""
+    def _analyze_language_indicators(self, text):
+        """Analyze text for Spanish language indicators."""
+        spanish_score = 0
+        total_indicators = 0
+        found_indicators = {
+            "spanish_keywords": [],
+            "spanish_countries": [],
+            "spanish_chars": [],
+            "spanish_patterns": [],
+            "country_names": [],
+        }
+        # Method 1: Spanish keywords
+        words = text.split()
+        for word in words:
+            if word in SPANISH_KEYWORDS:
+                spanish_score += 3
+                total_indicators += 1
+                found_indicators["spanish_keywords"].append(word)
+        # Method 2: Spanish country codes
+        for country in SPANISH_COUNTRIES:
+            if country in text:
+                spanish_score += 5
+                total_indicators += 1
+                found_indicators["spanish_countries"].append(country)
+        # Method 3: Spanish-specific characters
+        spanish_chars = ["Ñ", "Á", "É", "Í", "Ó", "Ú", "Ü"]
+        for char in spanish_chars:
+            if char in text:
+                spanish_score += 2
+                total_indicators += 1
+                found_indicators["spanish_chars"].append(char)
+        # Method 4: Spanish phrase patterns
+        spanish_patterns = [
+            r"REPUBLICA\s+DE",
+            r"REPÚBLICA\s+DE",
+            r"LUGAR\s+DE\s+NAC",
+            r"FECHA\s+DE\s+NAC",
+            r"DOCUMENTO\s+DE\s+IDENTIDAD",
+            r"CEDULA\s+DE",
+            r"CÉDULA\s+DE",
+            r"PASAPORTE\s+DE",
+        ]
+        for pattern in spanish_patterns:
+            matches = re.findall(pattern, text)
+            if matches:
+                spanish_score += 3
+                total_indicators += len(matches)
+                found_indicators["spanish_patterns"].extend(matches)
+        # Method 5: Country names
+        for country_name, code in COUNTRY_NAME_MAPPINGS.items():
+            if country_name in text and code in SPANISH_COUNTRIES:
+                spanish_score += 4
+                total_indicators += 1
+                found_indicators["country_names"].append(country_name)
+        return {
+            "spanish_score": spanish_score,
+            "total_indicators": total_indicators,
+            "found_indicators": found_indicators,
+        }
+    def _calculate_confidence(self, spanish_score, total_indicators):
+        """Calculate confidence level based on detection metrics."""
+        if spanish_score >= 8 or total_indicators >= 4:
+            return "high"
+        elif spanish_score >= 3 or total_indicators >= 2:
+            return "medium"
+        else:
+            return "low"
+    def detect_with_details(self, passport_file, ocr_instance=None):
+        """
+        Detect language with detailed analysis information.
+        Args:
+            passport_file: File object, BytesIO, image path, or numpy array
+            ocr_instance: Optional pre-initialized OCR instance
+        Returns:
+            Tuple[str, Dict]: (detected_language, analysis_details)
+        """
+        try:
+            # Convert input to OpenCV image
+            image = self._prepare_image(passport_file)
+            if image is None:
+                return "en", {"error": "Could not decode image", "confidence": "low"}
+            # Get OCR instance
+            ocr = self._get_ocr_instance(ocr_instance)
+            # Extract text
+            all_text = self._extract_text_from_image(image, ocr)
+            if not all_text:
+                return "en", {"error": "No text extracted", "confidence": "low"}
+            # Analyze for language indicators
+            analysis = self._analyze_language_indicators(all_text)
+            # Make decision
+            detected_language = (
+                "es" if analysis["spanish_score"] >= self.confidence_threshold else "en"
+            )
+            confidence = self._calculate_confidence(
+                analysis["spanish_score"], analysis["total_indicators"]
+            )
+            # Prepare detailed results
+            details = {
+                "detected_language": detected_language,
+                "confidence": confidence,
+                "spanish_score": analysis["spanish_score"],
+                "total_indicators": analysis["total_indicators"],
+                "found_indicators": analysis["found_indicators"],
+                "text_sample": (
+                    all_text[:300] + "..." if len(all_text) > 300 else all_text
+                ),
+                "analysis_method": "PassportLanguageDetector",
+            }
+            self.logger.info(
+                f"Passport language detected: {detected_language} (confidence: {confidence})"
+            )
+            return detected_language, details
+        except Exception as e:
+            self.logger.error(f"Language detection failed: {e}")
+            return "en", {"error": str(e), "confidence": "low"}
+    @classmethod
+    def detect_passport_language(
+        cls, passport_file, ocr_instance=None, confidence_threshold=3
+    ):
+        """
+        Class method for detecting passport language.
+        Args:
+            passport_file: File object, BytesIO, or image array
+            ocr_instance: Optional pre-initialized OCR instance
+            confidence_threshold: Minimum score for Spanish detection
+        Returns:
+            Tuple[str, Dict]: (detected_language, analysis_details)
+        """
+        detector = cls(confidence_threshold)
+        return detector.detect_with_details(passport_file, ocr_instance)
+# Convenience functions for easy import and use
+def detect_passport_language(passport_file, ocr_instance=None):
+    """
+    Convenience function for detecting passport language.
+    Args:
+        passport_file: File object, BytesIO, or image array
+        ocr_instance: Optional pre-initialized OCR instance
+    Returns:
+        str: 'es' for Spanish, 'en' for English
+    """
+    detected_language, _ = PassportLanguageDetector.detect_with_details(
+        passport_file, ocr_instance
+    )
+    return detected_language
+def get_passport_language_details(passport_file, ocr_instance=None):
+    """
+    Get detailed passport language detection information.
+    Args:
+        passport_file: File object, BytesIO, or image array
+        ocr_instance: Optional pre-initialized OCR instance
+    Returns:
+        Dict: Detailed detection information including confidence and indicators
+    """
+    _, details = PassportLanguageDetector.detect_with_details(
+        passport_file, ocr_instance
+    )
+    return details

document_analyzer/utils/passport_utils.py ADDED Viewed

@@ -0,0 +1,260 @@
+import re
+from ..config import FORBIDDEN_TERMS, BIRTH_PLACE_INDICATORS, ENGLISH_MONTHS
+def clean_passport_number(raw_number):
+    """
+    Clean passport number by fixing OCR mistakes.
+    - Replace 0 with O when it should be a letter.
+    - Remove invalid characters.
+    """
+    number = raw_number.strip().replace("<", "")
+    # Replace 0 with O if surrounded by letters (common mistake)
+    number = re.sub(r"([A-Z])0([A-Z])", r"\1O\2", number)
+    number = re.sub(r"([A-Z])0", r"\1O", number)
+    number = re.sub(r"0([A-Z])", r"O\1", number)
+    # Keep only alphanumeric
+    number = re.sub(r"[^A-Z0-9]", "", number)
+    return number
+def parse_mrz_date(date_str, logger=None):
+    """Parse MRZ date format (YYMMDD) to DD-MMM-YYYY format."""
+    if len(date_str) != 6:
+        return ""
+    try:
+        year = int(date_str[:2])
+        month = int(date_str[2:4])
+        day = int(date_str[4:6])
+        # Fix: for expiry_date, YY >= 30 should still map to 2000+
+        # Assume all passport dates are between 1950–2099
+        if year >= 50:
+            year += 1900
+        else:
+            year += 2000
+        if 1 <= month <= 12:
+            return f"{day:02d}-{ENGLISH_MONTHS[month]}-{year}"
+    except (ValueError, IndexError):
+        if logger:
+            logger.warning(f"Invalid MRZ date format: {date_str}")
+    return ""
+def parse_mrz_lines(mrz_lines, logger=None):
+    """Parse MRZ lines to extract passport information."""
+    passport_info = {
+        "date_of_birth": "",
+        "nationality": "",
+        "expiry_date": "",
+        "passport_number": "",
+    }
+    if len(mrz_lines) < 2:
+        if logger:
+            logger.warning("Insufficient MRZ lines for parsing")
+        return passport_info
+    try:
+        line2 = mrz_lines[1]
+        # Passport Number
+        raw_passport_number = line2[0:9]
+        passport_info["passport_number"] = clean_passport_number(raw_passport_number)
+        # Nationality
+        passport_info["nationality"] = line2[10:13]
+        # DOB (YYMMDD at index 13–19)
+        dob_str = line2[13:19]
+        passport_info["date_of_birth"] = parse_mrz_date(dob_str, logger)
+        # Expiry Date (YYMMDD at index 21–27)
+        expiry_str = line2[21:27]
+        passport_info["expiry_date"] = parse_mrz_date(expiry_str, logger)
+        if logger:
+            logger.debug(f"Parsed MRZ data: {passport_info}")
+    except Exception as e:
+        if logger:
+            logger.error(f"Error parsing MRZ: {str(e)}")
+    return passport_info
+def aggressive_clean_pob(text):
+    """Aggressively clean POB text to remove document field contamination."""
+    if not text:
+        return ""
+    # First basic cleanup
+    cleaned = text.strip(" :/.,;-")
+    # Remove OCR artifacts
+    artifacts = ["<<<", ">>>", "<<", ">>", "||", "|"]
+    for artifact in artifacts:
+        cleaned = cleaned.replace(artifact, "")
+    # AGGRESSIVE: Split by common separators and take only the first meaningful part
+    separators = [
+        r"\s+m/",
+        r"\s+rte",
+        r"\s+/Place",
+        r"\s+Place\s+of",
+        r"\s+Lugar\s+de",
+        r"\s+Date\s+of",
+        r"\s+Authority",
+        r"\s+Fecha\s+de",
+        r"\s+SURAT",
+    ]
+    for separator in separators:
+        parts = re.split(separator, cleaned, flags=re.IGNORECASE)
+        if len(parts) > 1:
+            cleaned = parts[0].strip()
+            break
+    # Remove any trailing fragments that look like document fields
+    unwanted_endings = [
+        r"\s+m$",
+        r"\s+rt$",
+        r"\s+rte$",
+        r"\s+/P$",
+        r"\s+Pl$",
+        r"\s+Place$",
+        r"\s+of$",
+        r"\s+Issue$",
+        r"\s+Auth$",
+    ]
+    for ending in unwanted_endings:
+        cleaned = re.sub(ending, "", cleaned, flags=re.IGNORECASE)
+    # Final cleanup
+    cleaned = cleaned.strip(" /:-.,")
+    return cleaned
+def is_clean_place_name(text):
+    """Very strict validation for place names."""
+    if not text or len(text) < 3:
+        return False
+    text_upper = text.upper().strip()
+    # Reject if contains document field indicators
+    for term in FORBIDDEN_TERMS:
+        if term in text_upper:
+            return False
+    # Must be mostly alphabetic (allow spaces, commas, but not too many special chars)
+    alpha_chars = sum(1 for c in text if c.isalpha())
+    total_chars = len(text.replace(" ", "").replace(",", ""))
+    if total_chars > 0 and alpha_chars / total_chars < 0.7:  # At least 70% letters
+        return False
+    # Reasonable length for place names
+    if len(text) > 40:
+        return False
+    return True
+def extract_mrz_data(extracted_data, logger=None):
+    """Extract and parse MRZ (Machine Readable Zone) data.
+    Args:
+        extracted_data (list): List of text data from OCR extraction.
+        logger: Logger instance for logging.
+    Returns:
+        dict: Parsed MRZ data containing passport info.
+    """
+    if logger:
+        logger.debug("Starting MRZ data extraction")
+    mrz_lines = []
+    # Find MRZ lines (typically at bottom, contain mostly uppercase and special chars)
+    for item in extracted_data:
+        text = item["text"].strip()
+        # MRZ lines are typically long, contain < characters, and are mostly uppercase
+        if len(text) > 20 and "<" in text and text.isupper():
+            mrz_lines.append(text)
+    if not mrz_lines:
+        if logger:
+            logger.warning("No MRZ lines found")
+        return {}
+    # Sort MRZ lines by vertical position (top to bottom)
+    mrz_items = [
+        (item, item["text"])
+        for item in extracted_data
+        if len(item["text"]) > 20 and "<" in item["text"] and item["text"].isupper()
+    ]
+    mrz_items.sort(key=lambda x: x[0]["center_y"])
+    mrz_lines = [item[1] for item in mrz_items]
+    if logger:
+        logger.debug(f"Found {len(mrz_lines)} MRZ lines")
+    return parse_mrz_lines(mrz_lines, logger)
+def extract_place_of_birth(extracted_data, logger=None):
+    """Extract place of birth from passport OCR data.
+    Args:
+        extracted_data (list): List of text data from OCR extraction.
+        logger: Logger instance for logging.
+    Returns:
+        str: Extracted place of birth or empty string.
+    """
+    if logger:
+        logger.debug("Searching for place of birth")
+    # Method 1: Look for indicators and extract carefully
+    for i, item in enumerate(extracted_data):
+        text = item["text"].upper()
+        for indicator in BIRTH_PLACE_INDICATORS:
+            if indicator in text:
+                if logger:
+                    logger.debug(f"Found birth place indicator: '{indicator}'")
+                # Extract from same line (after indicator)
+                parts = text.split(indicator)
+                if len(parts) > 1 and parts[1].strip():
+                    candidate = aggressive_clean_pob(parts[1])
+                    if is_clean_place_name(candidate):
+                        if logger:
+                            logger.debug(f"Found POB on same line: '{candidate}'")
+                        return candidate
+                # Check next few lines with aggressive cleaning
+                for offset in range(1, min(4, len(extracted_data) - i)):
+                    if i + offset < len(extracted_data):
+                        next_text = extracted_data[i + offset]["text"]
+                        candidate = aggressive_clean_pob(next_text)
+                        if is_clean_place_name(candidate):
+                            if logger:
+                                logger.debug(
+                                    f"Found POB on line +{offset}: '{candidate}'"
+                                )
+                            return candidate
+    return ""