PyPI - document-analyzer - Versions diffs - 0.1.0__py3-none-any.whl - Mend

document-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

document_analyzer/__init__.py +31 -0
document_analyzer/__main__.py +5 -0
document_analyzer/analyzers/__init__.py +14 -0
document_analyzer/analyzers/cedula_analyzer.py +412 -0
document_analyzer/analyzers/document_analyzer.py +187 -0
document_analyzer/analyzers/passport_analyzer.py +294 -0
document_analyzer/cli.py +401 -0
document_analyzer/config/__init__.py +30 -0
document_analyzer/config/constants.py +230 -0
document_analyzer/config/logger.py +36 -0
document_analyzer/services/__init__.py +3 -0
document_analyzer/services/paddleocr_service.py +107 -0
document_analyzer/startup.py +24 -0
document_analyzer/utils/__init__.py +57 -0
document_analyzer/utils/cedula_utils.py +155 -0
document_analyzer/utils/common_utils.py +229 -0
document_analyzer/utils/extract_cedula_signature.py +431 -0
document_analyzer/utils/passport_language_detector.py +277 -0
document_analyzer/utils/passport_utils.py +260 -0
document_analyzer-0.1.0.dist-info/METADATA +520 -0
document_analyzer-0.1.0.dist-info/RECORD +25 -0
document_analyzer-0.1.0.dist-info/WHEEL +5 -0
document_analyzer-0.1.0.dist-info/entry_points.txt +2 -0
document_analyzer-0.1.0.dist-info/licenses/LICENSE +201 -0
document_analyzer-0.1.0.dist-info/top_level.txt +1 -0

document_analyzer/config/__init__.py ADDED Viewed

@@ -0,0 +1,30 @@
+from .logger import DocumentAnalyzerLoggerAdapter, logger
+from .constants import (
+    SPANISH_COUNTRIES,
+    SPANISH_KEYWORDS,
+    COUNTRY_NAME_MAPPINGS,
+    CEDULA_INDICATORS,
+    PASSPORT_INDICATORS,
+    BIRTH_PLACE_INDICATORS,
+    DOCUMENT_KEYWORDS_ES,
+    FORBIDDEN_TERMS,
+    SPANISH_TO_ENGLISH_MONTHS,
+    ENGLISH_MONTHS,
+)
+# Names that make up the public API of the config package. Every entry is
+# imported above from either .logger or .constants.
+__all__ = [
+    # Logging
+    "DocumentAnalyzerLoggerAdapter",
+    "logger",
+    # Constants
+    "SPANISH_COUNTRIES",
+    "SPANISH_KEYWORDS",
+    "COUNTRY_NAME_MAPPINGS",
+    "CEDULA_INDICATORS",
+    "PASSPORT_INDICATORS",
+    "BIRTH_PLACE_INDICATORS",
+    "DOCUMENT_KEYWORDS_ES",
+    "FORBIDDEN_TERMS",
+    "SPANISH_TO_ENGLISH_MONTHS",
+    "ENGLISH_MONTHS",
+]

document_analyzer/config/constants.py ADDED Viewed

@@ -0,0 +1,230 @@
+# ISO 3166-1 alpha-3 codes of the 19 countries this package treats as
+# Spanish-speaking. Stored as a set, so it is meant for membership tests
+# rather than ordered iteration.
+SPANISH_COUNTRIES = {
+    "ESP",
+    "ARG",
+    "BOL",
+    "CHL",
+    "COL",
+    "CRI",
+    "CUB",
+    "DOM",
+    "ECU",
+    "SLV",
+    "GTM",
+    "HND",
+    "MEX",
+    "NIC",
+    "PAN",
+    "PRY",
+    "PER",
+    "URY",
+    "VEN",
+}
+# Upper-case Spanish words found on identity documents: field labels
+# (passport and cédula alike) followed by feminine nationality adjectives.
+# Several words are listed both with and without diacritics
+# (e.g. REPUBLICA / REPÚBLICA), presumably to tolerate OCR output that
+# drops accents -- confirm against the caller.
+SPANISH_KEYWORDS = {
+    "REPUBLICA",
+    "REPÚBLICA",
+    "PASAPORTE",
+    "CEDULA",
+    "CÉDULA",
+    "NACIONALIDAD",
+    "FECHA",
+    "NACIMIENTO",
+    "LUGAR",
+    "EXPEDICION",
+    "EXPEDICIÓN",
+    "VENCIMIENTO",
+    "DOCUMENTO",
+    "IDENTIDAD",
+    "COLOMBIANA",
+    "PANAMENA",
+    "PANAMEÑA",
+    "ESPAÑOLA",
+    "ARGENTINA",
+    "MEXICANA",
+    "VENEZOLANA",
+    "PERUANA",
+    "CHILENA",
+    "ECUATORIANA",
+    "BOLIVIANA",
+    "COSTARRICENSE",
+    "CUBANA",
+    "DOMINICANA",
+    "SALVADOREÑA",
+    "GUATEMALTECA",
+    "HONDUREÑA",
+    "NICARAGÜENSE",
+    "PARAGUAYA",
+    "URUGUAYA",
+}
+# Maps upper-case country names, nationality adjectives and English formal
+# names ("REPUBLIC OF ...", "KINGDOM OF ...") to ISO 3166-1 alpha-3 codes.
+# Keys are exact strings; only some have an unaccented twin
+# (PANAMENA / PANAMEÑA), and MEXICO / PERU appear only without accents.
+# NOTE(review): seven codes present in SPANISH_COUNTRIES have no key here
+# (DOM, SLV, GTM, HND, NIC, PRY, URY) -- confirm whether that is intended.
+COUNTRY_NAME_MAPPINGS = {
+    "COLOMBIA": "COL",
+    "COLOMBIANA": "COL",
+    "REPUBLIC OF COLOMBIA": "COL",
+    "PANAMA": "PAN",
+    "PANAMENA": "PAN",
+    "PANAMEÑA": "PAN",
+    "REPUBLIC OF PANAMA": "PAN",
+    "ESPAÑA": "ESP",
+    "SPAIN": "ESP",
+    "ESPAÑOLA": "ESP",
+    "KINGDOM OF SPAIN": "ESP",
+    "ARGENTINA": "ARG",
+    "REPUBLIC OF ARGENTINA": "ARG",
+    "MEXICO": "MEX",
+    "MEXICANA": "MEX",
+    "UNITED MEXICAN STATES": "MEX",
+    "VENEZUELA": "VEN",
+    "VENEZOLANA": "VEN",
+    "REPUBLIC OF VENEZUELA": "VEN",
+    "PERU": "PER",
+    "PERUANA": "PER",
+    "REPUBLIC OF PERU": "PER",
+    "CHILE": "CHL",
+    "CHILENA": "CHL",
+    "REPUBLIC OF CHILE": "CHL",
+    "ECUADOR": "ECU",
+    "ECUATORIANA": "ECU",
+    "REPUBLIC OF ECUADOR": "ECU",
+    "BOLIVIA": "BOL",
+    "BOLIVIANA": "BOL",
+    "COSTA RICA": "CRI",
+    "COSTARRICENSE": "CRI",
+    "CUBA": "CUB",
+    "CUBANA": "CUB",
+    "REPUBLIC OF CUBA": "CUB",
+}
+# Upper-case phrases that point to a Panamanian cédula (national ID card).
+# Accented phrases are each paired with an unaccented spelling.
+# The list mixes long phrases with single words that they contain
+# ("REPÚBLICA DE PANAMÁ" vs "PANAMÁ").
+CEDULA_INDICATORS = [
+    "REPÚBLICA DE PANAMÁ",
+    "REPUBLICA DE PANAMA",
+    "TRIBUNAL ELECTORAL",
+    "CÉDULA",
+    "CEDULA",
+    "CARNÉ DE IDENTIDAD",
+    "CARNE DE IDENTIDAD",
+    "DOCUMENTO DE IDENTIDAD",
+    "PANAMÁ",
+    "PANAMA",
+]
+# Upper-case strings that point to a passport: the word "passport" in
+# English, French and Spanish, formal-name prefixes, and a handful of
+# English country names.
+# "P<" is presumably the document-type prefix of a passport's
+# machine-readable zone -- confirm against the MRZ parser.
+PASSPORT_INDICATORS = [
+    "PASSPORT",
+    "PASSEPORT",
+    "PASAPORTE",
+    "REPUBLIC OF",
+    "KINGDOM OF",
+    "UNITED STATES",
+    "CANADA",
+    "AUSTRALIA",
+    "GERMANY",
+    "FRANCE",
+    "SPAIN",
+    "ITALY",
+    "BRAZIL",
+    "MEXICO",
+    "INDIA",
+    "P<",
+]
+# Upper-case labels that introduce the place-of-birth field, in English,
+# Spanish and French, plus "JANM STHAN" (presumably a transliterated
+# Hindi label -- confirm).
+# NOTE(review): some entries are prefixes of later ones ("POB" / "POB:",
+# "LUGAR DE NAC" / "LUGAR DE NAC.:"); if the caller stops at the first
+# match, list order decides which label is stripped -- verify.
+BIRTH_PLACE_INDICATORS = [
+    "PLACE OF BIRTH",
+    "BIRTHPLACE",
+    "POB",
+    "BIRTH PLACE",
+    "P.O.B",
+    "LUGAR DE NACIMIENTO",
+    "LUGAR DE NAC",
+    "LUGAR NACIMIENTO",
+    "LIEU DE NAISSANCE",
+    "LIEU NAISSANCE",
+    "JANM STHAN",
+    "POB:",
+    "BIRTH:",
+    "LUGAR DE NAC.:",
+]
+# Single upper-case Spanish words that are commonly printed on Panamanian
+# documents such as the cédula and the passport (field labels and header
+# words). Only the accented spellings are listed here.
+DOCUMENT_KEYWORDS_ES = [
+    "REPÚBLICA",
+    "PANAMÁ",
+    "DOCUMENTO",
+    "IDENTIDAD",
+    "TRIBUNAL",
+    "ELECTORAL",
+    "CARNÉ",
+    "PERMANENTE",
+    "NOMBRE",
+    "USUAL",
+    "FECHA",
+    "LUGAR",
+    "NACIMIENTO",
+    "NACIONALIDAD",
+    "SEXO",
+    "TIPO",
+    "SANGRE",
+    "EXPEDIDA",
+    "EXPIRA",
+]
+# Upper-case terms that must not appear in a place name; they are labels
+# of other document fields (issue/expiry dates, authority, number, ...).
+# NOTE(review): "RTE" is very short and is contained in ordinary Spanish
+# words such as NORTE; if the caller matches by substring, legitimate
+# place names could be rejected -- confirm how these terms are compared.
+FORBIDDEN_TERMS = [
+    "ISSUE",
+    "EXPIRY",
+    "AUTHORITY",
+    "SIGNATURE",
+    "DATE",
+    "PASSPORT",
+    "PLACE OF ISSUE",
+    "LUGAR DE EXPEDICION",
+    "FECHA DE",
+    "RTE",
+    "DOCUMENT",
+    "VALID",
+    "HOLDER",
+    "TYPE",
+    "CODE",
+    "NUMBER",
+]
+# Maps three-letter upper-case Spanish month abbreviations to their
+# English counterparts. Months spelled the same in both languages map to
+# themselves, so all twelve months have an entry.
+SPANISH_TO_ENGLISH_MONTHS = {
+    "ENE": "JAN",
+    "FEB": "FEB",
+    "MAR": "MAR",
+    "ABR": "APR",
+    "MAY": "MAY",
+    "JUN": "JUN",
+    "JUL": "JUL",
+    "AGO": "AUG",
+    "SEP": "SEP",
+    "OCT": "OCT",
+    "NOV": "NOV",
+    "DIC": "DEC",
+}
+# Three-letter English month abbreviations indexed by month number:
+# index 0 holds an empty placeholder so that ENGLISH_MONTHS[1] == "JAN"
+# and ENGLISH_MONTHS[12] == "DEC".
+ENGLISH_MONTHS = [
+    "",
+    "JAN",
+    "FEB",
+    "MAR",
+    "APR",
+    "MAY",
+    "JUN",
+    "JUL",
+    "AUG",
+    "SEP",
+    "OCT",
+    "NOV",
+    "DEC",
+]

document_analyzer/config/logger.py ADDED Viewed

@@ -0,0 +1,36 @@
+import logging
+logger = logging.getLogger("document_analyzer")
+class DocumentAnalyzerLoggerAdapter(logging.LoggerAdapter):
+    """A logging adapter that adds user context to document_analyzer log messages.
+    This adapter extends the standard LoggerAdapter to automatically prepend
+    log messages with the DocumentAnalyzer prefix and user email information,
+    providing better traceability and context for debugging and monitoring
+    purposes.
+    The adapter formats log messages in the pattern:
+        [DocumentAnalyzer] (user_email) original_message
+    Args:
+        logger: The base logging.Logger instance to wrap.
+        extra (dict, optional): Context dictionary. Should contain 'user_email'
+            for user identification. Defaults to "unknown" if missing.
+    Example:
+        >>> extra_info = {"user_email": "user@example.com"}
+        >>> adapter = DocumentAnalyzerLoggerAdapter(logger, extra_info)
+        >>> adapter.info("Processing document")
+        # Output: [DocumentAnalyzer] (user@example.com) Processing document
+    """
+    def __init__(self, logger, extra=None):
+        if extra is None:
+            extra = {}
+        super().__init__(logger, extra)
+    def process(self, msg, kwargs):
+        user_email = self.extra.get("user_email", "unknown")
+        return f"[DocumentAnalyzer] ({user_email}) {msg}", kwargs

document_analyzer/services/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .paddleocr_service import PaddleOCRService
+# Public API of the services package.
+__all__ = ["PaddleOCRService"]

document_analyzer/services/paddleocr_service.py ADDED Viewed

@@ -0,0 +1,107 @@
+import os
+# Paddle / PaddleOCR runtime settings, assigned before paddleocr is
+# imported below.
+# NOTE(review): presumably these are read when the library loads, so the
+# assignments must stay ahead of the import -- confirm before reordering.
+# GLOG_minloglevel and FLAGS_print_model_stats quieten console output;
+# FLAGS_enable_pir_api does not look verbosity-related -- TODO confirm
+# why it is forced on.
+os.environ["GLOG_minloglevel"] = "2"
+os.environ["FLAGS_enable_pir_api"] = "true"
+os.environ["FLAGS_print_model_stats"] = "false"
+import threading
+from paddleocr import PaddleOCR
+from ..config import logger
+class PaddleOCRService:
+    """
+    Thread-safe manager for PaddleOCR instances with automatic language detection.
+    """
+    LANGUAGES = ("es", "en")
+    _ocr_instances = {}
+    _lock = threading.Lock()
+    @classmethod
+    def initialize(cls, langs=LANGUAGES):
+        """Initialize PaddleOCR for one or more languages - thread safe"""
+        with cls._lock:
+            for lang in langs:
+                logger.info(f"Initializing PaddleOCR for language '{lang}'...")
+                if lang not in cls.LANGUAGES:
+                    raise ValueError(
+                        f"Unsupported language: {lang}. Allowed: {cls.LANGUAGES}"
+                    )
+                if lang in cls._ocr_instances:
+                    logger.info(
+                        f"PaddleOCR for language '{lang}' is already initialized."
+                    )
+                    continue
+                logger.debug(f"Loading PaddleOCR models for language '{lang}'...")
+                cls._ocr_instances[lang] = PaddleOCR(
+                    lang=lang, use_textline_orientation=True
+                )
+            logger.info(
+                f"PaddleOCR initialized for language(s): {list(cls._ocr_instances.keys())}"
+            )
+            return cls._ocr_instances
+    @classmethod
+    def get_instance(cls, lang="es"):
+        """Return OCR instance for a specific language, initializing if necessary"""
+        if lang not in cls.LANGUAGES:
+            raise ValueError(f"Unsupported language: {lang}. Allowed: {cls.LANGUAGES}")
+        if lang not in cls._ocr_instances:
+            logger.info(f"OCR instance for {lang} not found, initializing now...")
+            cls.initialize([lang])
+        return cls._ocr_instances[lang]
+    @classmethod
+    def get_auto_instance(cls, passport_file):
+        """
+        Automatically detect optimal language and return OCR instance for passport.
+        Args:
+            passport_file: File object or BytesIO containing passport image
+        Returns:
+            tuple: (ocr_instance, detected_language)
+        """
+        try:
+            from ..utils.passport_language_detector import detect_passport_language
+            # Detect language using the PassportLanguageDetector
+            detected_lang, detection_details = detect_passport_language(passport_file)
+            logger.info(f"Auto-detected passport language: {detected_lang}")
+            # Get appropriate OCR instance
+            ocr_instance = cls.get_instance(detected_lang)
+            return ocr_instance, detected_lang, detection_details
+        except Exception as e:
+            logger.error(f"Language detection failed: {e}, defaulting to English")
+            return cls.get_instance("en"), "en", None
+    @classmethod
+    def is_ready(cls, lang=None):
+        """Check if PaddleOCR is ready (global or per language)"""
+        if lang:
+            return lang in cls._ocr_instances
+        return bool(cls._ocr_instances)
+    @classmethod
+    def list_loaded_languages(cls):
+        """Get list of currently loaded language models"""
+        return list(cls._ocr_instances.keys())
+    @classmethod
+    def clear_cache(cls):
+        """Clear all cached OCR instances (use with caution)"""
+        with cls._lock:
+            logger.warning("Clearing all PaddleOCR instances from cache")
+            cls._ocr_instances.clear()
+            logger.info("PaddleOCR cache cleared")

document_analyzer/startup.py ADDED Viewed

@@ -0,0 +1,24 @@
+import os
+import sys
+from .config import logger
+def startup_services():
+    try:
+        if any(cmd in sys.argv for cmd in ["runserver", "gunicorn", "uwsgi"]):
+            run_main = os.environ.get("RUN_MAIN")
+            is_dev_reloader = run_main == "true"
+            is_manual_runserver = "--noreload" in sys.argv
+            is_prod = "gunicorn" in sys.argv or "uwsgi" in sys.argv
+            if is_dev_reloader or is_manual_runserver or is_prod:
+                from .services import PaddleOCRService
+                if PaddleOCRService.is_ready():
+                    logger.info("PaddleOCR already initialized, reusing instance.")
+                else:
+                    PaddleOCRService.initialize()
+    except ImportError:
+        logger.error("Failed to import PaddleOCRService during app startup")
+    except Exception as e:
+        logger.error("Failed to initialize PaddleOCR during app startup: %s", e)

document_analyzer/utils/__init__.py ADDED Viewed

@@ -0,0 +1,57 @@
+from .common_utils import (
+    ensure_bytesio,
+    preprocess_image,
+    create_text_data,
+    extract_data_with_boxes,
+)
+from .cedula_utils import draw_bounding_boxes, convert_spanish_date_to_english
+from .extract_cedula_signature import (
+    find_expira_block,
+    fallback_signature_detection,
+    identify_signature_box,
+    process_signature_to_bw,
+    extract_signature_image,
+)
+from .passport_utils import (
+    clean_passport_number,
+    parse_mrz_date,
+    parse_mrz_lines,
+    aggressive_clean_pob,
+    is_clean_place_name,
+    extract_mrz_data,
+    extract_place_of_birth,
+)
+from .passport_language_detector import (
+    PassportLanguageDetector,
+    detect_passport_language,
+    get_passport_language_details,
+)
+# Names that make up the public API of the utils package, grouped by the
+# submodule they are imported from above.
+__all__ = [
+    # Common utilities
+    "ensure_bytesio",
+    "preprocess_image",
+    "create_text_data",
+    "extract_data_with_boxes",
+    # Cedula utilities
+    "draw_bounding_boxes",
+    "convert_spanish_date_to_english",
+    # Cedula signature extraction
+    "find_expira_block",
+    "fallback_signature_detection",
+    "identify_signature_box",
+    "process_signature_to_bw",
+    "extract_signature_image",
+    # Passport utilities
+    "clean_passport_number",
+    "parse_mrz_date",
+    "parse_mrz_lines",
+    "aggressive_clean_pob",
+    "is_clean_place_name",
+    "extract_mrz_data",
+    "extract_place_of_birth",
+    # Passport language detection
+    "PassportLanguageDetector",
+    "detect_passport_language",
+    "get_passport_language_details",
+]

document_analyzer/utils/cedula_utils.py ADDED Viewed

@@ -0,0 +1,155 @@
+import re
+import cv2
+import numpy as np
+from ..config import logger as default_logger
+# =============================================================================
+# VISUALIZATION UTILITY
+# =============================================================================
+def draw_bounding_boxes(image, extracted_data, signature_box=None, logger=None):
+    """Draw bounding boxes on the cedula image for visualization.
+    Creates a visualization of the OCR results by drawing bounding boxes around
+    detected text areas. Special highlighting is applied to signature areas.
+    Args:
+        image (np.ndarray): Original cédula image in BGR format.
+        extracted_data (list): List of text data dictionaries from OCR extraction.
+        signature_box (dict, optional): Specific text data dict identified as
+            signature area. Will be highlighted in red.
+        logger (logging.Logger, optional): Logger instance for debug messages.
+            Defaults to module's default logger.
+    Returns:
+        np.ndarray: Copy of input image with bounding boxes and labels drawn.
+            Regular text boxes are green, signature box is red.
+    Note:
+        - Text labels are truncated to prevent overcrowding
+        - Background rectangles ensure label readability
+        - Label positioning adapts to avoid image boundaries
+    Examples:
+        >>> vis_image = draw_bounding_boxes(image, extracted_data, signature_box)
+        >>> cv2.imshow("Visualization", vis_image)
+        >>> cv2.waitKey(0)
+    """
+    if logger is None:
+        logger = default_logger
+    logger.debug("Drawing bounding boxes for visualization")
+    # Create a copy of the image to draw on
+    vis_image = image.copy()
+    # Draw all text bounding boxes
+    for i, item in enumerate(extracted_data):
+        bbox = item["bbox"]
+        text = item["text"]
+        _ = item["confidence"]
+        # Convert bbox to integer coordinates
+        points = np.array(bbox, dtype=np.int32)
+        # Determine color based on whether this is the signature box
+        if signature_box and item is signature_box:
+            color = (0, 0, 255)  # Red for signature
+            thickness = 3
+            label = (
+                f"SIGNATURE: {text[:20]}..." if len(text) > 20 else f"SIGNATURE: {text}"
+            )
+        else:
+            color = (0, 255, 0)  # Green for regular text
+            thickness = 2
+            label = f"{i+1}: {text[:15]}..." if len(text) > 15 else f"{i+1}: {text}"
+        # Draw the bounding box
+        cv2.polylines(vis_image, [points], True, color, thickness)
+        # Draw text label above the box
+        label_y = int(min([p[1] for p in points])) - 10
+        if label_y < 20:
+            label_y = int(max([p[1] for p in points])) + 25
+        # Add background rectangle for text readability
+        (text_width, text_height), _ = cv2.getTextSize(
+            label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
+        )
+        cv2.rectangle(
+            vis_image,
+            (int(min([p[0] for p in points])), label_y - text_height - 5),
+            (int(min([p[0] for p in points])) + text_width, label_y + 5),
+            (255, 255, 255),
+            -1,
+        )
+        # Draw the text
+        cv2.putText(
+            vis_image,
+            label,
+            (int(min([p[0] for p in points])), label_y),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.5,
+            (0, 0, 0),
+            1,
+        )
+    logger.debug("Bounding box visualization completed")
+    return vis_image
+# =============================================================================
+# DATE CONVERSION UTILITY
+# =============================================================================
+def convert_spanish_date_to_english(date_str):
+    """Convert Spanish date format to English format.
+    Converts date strings from Spanish month abbreviations to English
+    equivalents while maintaining the DD-MMM-YYYY format commonly used
+    in Panamanian documents.
+    Args:
+        date_str (str): Date string in Spanish format (e.g., "14-AGO-1947").
+            Can also be None or non-string values.
+    Returns:
+        str: Date string in English format (e.g., "14-AUG-1947").
+            Returns original input if conversion is not possible.
+    Note:
+        - Only processes strings matching DD-MMM-YYYY pattern
+        - Case-insensitive matching for month abbreviations
+        - Preserves original format if no Spanish months are detected
+        - Handles edge cases gracefully by returning original input
+    Examples:
+        >>> convert_spanish_date_to_english("14-AGO-1947")
+        '14-AUG-1947'
+        >>> convert_spanish_date_to_english("23-MAR-1940")
+        '23-MAR-1940'  # MAR is same in both languages
+        >>> convert_spanish_date_to_english("invalid-date")
+        'invalid-date'  # Returns original if no match
+        >>> convert_spanish_date_to_english(None)
+        None  # Handles None input gracefully
+    """
+    if not date_str or not isinstance(date_str, str):
+        return date_str
+    date_pattern = r"(\d{1,2})-([A-Z]{3})-(\d{4})"
+    match = re.search(date_pattern, date_str.upper())
+    if not match:
+        return date_str
+    from ..config import SPANISH_TO_ENGLISH_MONTHS
+    day, spanish_month, year = match.groups()
+    english_month = SPANISH_TO_ENGLISH_MONTHS.get(spanish_month, spanish_month)
+    return f"{day}-{english_month}-{year}"