document-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ from .logger import DocumentAnalyzerLoggerAdapter, logger
2
+ from .constants import (
3
+ SPANISH_COUNTRIES,
4
+ SPANISH_KEYWORDS,
5
+ COUNTRY_NAME_MAPPINGS,
6
+ CEDULA_INDICATORS,
7
+ PASSPORT_INDICATORS,
8
+ BIRTH_PLACE_INDICATORS,
9
+ DOCUMENT_KEYWORDS_ES,
10
+ FORBIDDEN_TERMS,
11
+ SPANISH_TO_ENGLISH_MONTHS,
12
+ ENGLISH_MONTHS,
13
+ )
14
+
15
+ __all__ = [
16
+ # Logging
17
+ "DocumentAnalyzerLoggerAdapter",
18
+ "logger",
19
+ # Constants
20
+ "SPANISH_COUNTRIES",
21
+ "SPANISH_KEYWORDS",
22
+ "COUNTRY_NAME_MAPPINGS",
23
+ "CEDULA_INDICATORS",
24
+ "PASSPORT_INDICATORS",
25
+ "BIRTH_PLACE_INDICATORS",
26
+ "DOCUMENT_KEYWORDS_ES",
27
+ "FORBIDDEN_TERMS",
28
+ "SPANISH_TO_ENGLISH_MONTHS",
29
+ "ENGLISH_MONTHS",
30
+ ]
@@ -0,0 +1,230 @@
1
# ISO 3166-1 alpha-3 codes of the Spanish-speaking countries this package
# recognises. A set, so membership tests are O(1) and order is irrelevant.
SPANISH_COUNTRIES = {
    "ARG", "BOL", "CHL", "COL", "CRI", "CUB", "DOM",
    "ECU", "ESP", "GTM", "HND", "MEX", "NIC", "PAN",
    "PER", "PRY", "SLV", "URY", "VEN",
}

# Upper-case Spanish words used as evidence that a document is in Spanish.
# Words that carry an accent are listed both with and without it where the
# original table did so.
SPANISH_KEYWORDS = {
    # Document vocabulary (field labels and headings)
    "REPUBLICA", "REPÚBLICA",
    "PASAPORTE",
    "CEDULA", "CÉDULA",
    "NACIONALIDAD", "FECHA", "NACIMIENTO", "LUGAR",
    "EXPEDICION", "EXPEDICIÓN",
    "VENCIMIENTO", "DOCUMENTO", "IDENTIDAD",
    # Nationality adjectives
    "COLOMBIANA",
    "PANAMENA", "PANAMEÑA",
    "ESPAÑOLA", "ARGENTINA", "MEXICANA", "VENEZOLANA", "PERUANA",
    "CHILENA", "ECUATORIANA", "BOLIVIANA", "COSTARRICENSE", "CUBANA",
    "DOMINICANA", "SALVADOREÑA", "GUATEMALTECA", "HONDUREÑA",
    "NICARAGÜENSE", "PARAGUAYA", "URUGUAYA",
}
61
+
62
# Maps an upper-case country name, nationality adjective or formal state name
# to its ISO 3166-1 alpha-3 code.
#
# Every code in SPANISH_COUNTRIES has at least one entry here. The original
# entries come first and keep their order, so any consumer that scans the
# mapping in insertion order sees the same precedence as before; entries for
# the previously unmapped countries and the accented spellings are appended.
COUNTRY_NAME_MAPPINGS = {
    "COLOMBIA": "COL",
    "COLOMBIANA": "COL",
    "REPUBLIC OF COLOMBIA": "COL",
    "PANAMA": "PAN",
    "PANAMENA": "PAN",
    "PANAMEÑA": "PAN",
    "REPUBLIC OF PANAMA": "PAN",
    "ESPAÑA": "ESP",
    "SPAIN": "ESP",
    "ESPAÑOLA": "ESP",
    "KINGDOM OF SPAIN": "ESP",
    "ARGENTINA": "ARG",
    "REPUBLIC OF ARGENTINA": "ARG",
    "MEXICO": "MEX",
    "MEXICANA": "MEX",
    "UNITED MEXICAN STATES": "MEX",
    "VENEZUELA": "VEN",
    "VENEZOLANA": "VEN",
    "REPUBLIC OF VENEZUELA": "VEN",
    "PERU": "PER",
    "PERUANA": "PER",
    "REPUBLIC OF PERU": "PER",
    "CHILE": "CHL",
    "CHILENA": "CHL",
    "REPUBLIC OF CHILE": "CHL",
    "ECUADOR": "ECU",
    "ECUATORIANA": "ECU",
    "REPUBLIC OF ECUADOR": "ECU",
    "BOLIVIA": "BOL",
    "BOLIVIANA": "BOL",
    "COSTA RICA": "CRI",
    "COSTARRICENSE": "CRI",
    "CUBA": "CUB",
    "CUBANA": "CUB",
    "REPUBLIC OF CUBA": "CUB",
    # Accented spellings of names already mapped above.
    "PANAMÁ": "PAN",
    "MÉXICO": "MEX",
    "PERÚ": "PER",
    # Countries listed in SPANISH_COUNTRIES that previously had no mapping.
    "DOMINICAN REPUBLIC": "DOM",
    "DOMINICANA": "DOM",
    "EL SALVADOR": "SLV",
    "SALVADOREÑA": "SLV",
    "GUATEMALA": "GTM",
    "GUATEMALTECA": "GTM",
    "HONDURAS": "HND",
    "HONDUREÑA": "HND",
    "NICARAGUA": "NIC",
    "NICARAGÜENSE": "NIC",
    "PARAGUAY": "PRY",
    "PARAGUAYA": "PRY",
    "URUGUAY": "URY",
    "URUGUAYA": "URY",
}
100
+
101
# Phrases whose presence suggests a cedula (national ID card). Accented and
# unaccented spellings are listed side by side. Element order is kept exactly
# as originally defined for every list below, in case a consumer depends on it.
CEDULA_INDICATORS = [
    "REPÚBLICA DE PANAMÁ", "REPUBLICA DE PANAMA",
    "TRIBUNAL ELECTORAL",
    "CÉDULA", "CEDULA",
    "CARNÉ DE IDENTIDAD", "CARNE DE IDENTIDAD",
    "DOCUMENTO DE IDENTIDAD",
    "PANAMÁ", "PANAMA",
]

# Phrases whose presence suggests a passport: the word itself in three
# languages, formal state-name prefixes, a handful of country names, and "P<"
# (presumably the start of a passport machine-readable zone - confirm).
PASSPORT_INDICATORS = [
    "PASSPORT", "PASSEPORT", "PASAPORTE",
    "REPUBLIC OF", "KINGDOM OF",
    "UNITED STATES", "CANADA", "AUSTRALIA", "GERMANY", "FRANCE",
    "SPAIN", "ITALY", "BRAZIL", "MEXICO", "INDIA",
    "P<",
]

# Labels that introduce a place-of-birth field.
BIRTH_PLACE_INDICATORS = [
    "PLACE OF BIRTH", "BIRTHPLACE", "POB", "BIRTH PLACE", "P.O.B",
    "LUGAR DE NACIMIENTO", "LUGAR DE NAC", "LUGAR NACIMIENTO",
    "LIEU DE NAISSANCE", "LIEU NAISSANCE",
    "JANM STHAN",
    "POB:", "BIRTH:", "LUGAR DE NAC.:",
]

# Words commonly found in Panamanian documents such as the cedula and the
# passport.
DOCUMENT_KEYWORDS_ES = [
    "REPÚBLICA", "PANAMÁ", "DOCUMENTO", "IDENTIDAD",
    "TRIBUNAL", "ELECTORAL", "CARNÉ", "PERMANENTE",
    "NOMBRE", "USUAL", "FECHA", "LUGAR", "NACIMIENTO", "NACIONALIDAD",
    "SEXO", "TIPO", "SANGRE", "EXPEDIDA", "EXPIRA",
]

# Terms that should not appear in a place name; a candidate containing one of
# them is most likely a neighbouring field label rather than a place.
FORBIDDEN_TERMS = [
    "ISSUE", "EXPIRY", "AUTHORITY", "SIGNATURE", "DATE", "PASSPORT",
    "PLACE OF ISSUE", "LUGAR DE EXPEDICION", "FECHA DE", "RTE",
    "DOCUMENT", "VALID", "HOLDER", "TYPE", "CODE", "NUMBER",
]
197
+
198
# Spanish three-letter month abbreviation -> English three-letter abbreviation.
# Abbreviations that are spelled the same in both languages map to themselves.
SPANISH_TO_ENGLISH_MONTHS = {
    "ENE": "JAN", "FEB": "FEB", "MAR": "MAR", "ABR": "APR",
    "MAY": "MAY", "JUN": "JUN", "JUL": "JUL", "AGO": "AUG",
    "SEP": "SEP", "OCT": "OCT", "NOV": "NOV", "DIC": "DEC",
}

# English month abbreviations. Index 0 holds an empty string, so the
# abbreviation for month number m (1-12) is ENGLISH_MONTHS[m].
ENGLISH_MONTHS = [
    "",
    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
]
@@ -0,0 +1,36 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("document_analyzer")
4
+
5
+
6
class DocumentAnalyzerLoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that tags messages with the package name and user email.

    Every message passed through the adapter is rewritten as::

        [DocumentAnalyzer] (user_email) original_message

    Args:
        logger: The base ``logging.Logger`` instance to wrap.
        extra (dict, optional): Context dictionary. ``extra["user_email"]``
            identifies the user. When the key is missing, or its value is
            ``None`` or an empty string, the placeholder ``"unknown"`` is
            used so the log line never shows ``(None)`` or ``()``.

    Example:
        >>> extra_info = {"user_email": "user@example.com"}
        >>> adapter = DocumentAnalyzerLoggerAdapter(logger, extra_info)
        >>> adapter.info("Processing document")
        # Output: [DocumentAnalyzer] (user@example.com) Processing document
    """

    def __init__(self, logger, extra=None):
        # Normalise a missing context to an empty dict so process() can
        # always call .get() on it.
        super().__init__(logger, {} if extra is None else extra)

    def process(self, msg, kwargs):
        """Return ``(prefixed_msg, kwargs)``; ``kwargs`` is passed through as is."""
        # "or" (rather than a .get default) also covers a key that is present
        # but carries None or "".
        user_email = self.extra.get("user_email") or "unknown"
        return f"[DocumentAnalyzer] ({user_email}) {msg}", kwargs
@@ -0,0 +1,3 @@
1
+ from .paddleocr_service import PaddleOCRService
2
+
3
+ __all__ = ["PaddleOCRService"]
@@ -0,0 +1,107 @@
1
+ import os
2
+
3
+ # Suppress PaddleOCR verbose output
4
+ os.environ["GLOG_minloglevel"] = "2"
5
+ os.environ["FLAGS_enable_pir_api"] = "true"
6
+ os.environ["FLAGS_print_model_stats"] = "false"
7
+
8
+ import threading
9
+ from paddleocr import PaddleOCR
10
+
11
+ from ..config import logger
12
+
13
+
14
class PaddleOCRService:
    """Class-level cache of PaddleOCR engines, one per supported language.

    All state lives on the class (``_ocr_instances``), so every caller in the
    process shares the same engines. ``initialize`` and ``clear_cache`` mutate
    the cache while holding ``_lock``; the read-only helpers (``is_ready``,
    ``list_loaded_languages``) and the fast path of ``get_instance`` read it
    without taking the lock.
    """

    # Languages an engine may be created for.
    LANGUAGES = ("es", "en")
    # lang code -> PaddleOCR engine, shared by the whole process.
    _ocr_instances = {}
    # Serialises cache mutation so a model is never loaded twice concurrently.
    _lock = threading.Lock()

    @classmethod
    def _validate_language(cls, lang):
        """Raise ValueError unless *lang* is one of ``LANGUAGES``."""
        if lang not in cls.LANGUAGES:
            raise ValueError(f"Unsupported language: {lang}. Allowed: {cls.LANGUAGES}")

    @classmethod
    def initialize(cls, langs=LANGUAGES):
        """Load PaddleOCR engines for one or more languages.

        Args:
            langs: A language code or an iterable of language codes. Defaults
                to every supported language.

        Returns:
            dict: The shared ``{lang: engine}`` cache (the live object, not a
            copy).

        Raises:
            ValueError: If any requested language is unsupported. All codes
                are validated before anything is loaded, so a bad code never
                leaves the cache half-initialised after expensive model loads.
        """
        # A bare string would otherwise be iterated character by character.
        requested = (langs,) if isinstance(langs, str) else tuple(langs)
        for lang in requested:
            cls._validate_language(lang)

        with cls._lock:
            for lang in requested:
                logger.info("Initializing PaddleOCR for language '%s'...", lang)
                if lang in cls._ocr_instances:
                    logger.info(
                        "PaddleOCR for language '%s' is already initialized.", lang
                    )
                    continue

                logger.debug("Loading PaddleOCR models for language '%s'...", lang)
                cls._ocr_instances[lang] = PaddleOCR(
                    lang=lang, use_textline_orientation=True
                )

            logger.info(
                "PaddleOCR initialized for language(s): %s",
                list(cls._ocr_instances.keys()),
            )
            return cls._ocr_instances

    @classmethod
    def get_instance(cls, lang="es"):
        """Return the engine for *lang*, loading it first if necessary.

        Raises:
            ValueError: If *lang* is not one of ``LANGUAGES``.
        """
        cls._validate_language(lang)

        if lang not in cls._ocr_instances:
            logger.info("OCR instance for %s not found, initializing now...", lang)
            # initialize() re-checks under the lock, so two threads racing
            # here still load the model only once.
            cls.initialize([lang])

        return cls._ocr_instances[lang]

    @classmethod
    def get_auto_instance(cls, passport_file):
        """Detect the passport's language and return the matching engine.

        Args:
            passport_file: File object or BytesIO containing the passport image.

        Returns:
            tuple: ``(ocr_instance, detected_language, detection_details)``.
            If detection, or loading the detected language, fails for any
            reason, the English engine is returned instead as
            ``(english_instance, "en", None)``.
        """
        try:
            # Imported lazily, as in the original code (presumably to avoid an
            # import cycle between services and utils - confirm).
            from ..utils.passport_language_detector import detect_passport_language

            detected_lang, detection_details = detect_passport_language(passport_file)
            logger.info("Auto-detected passport language: %s", detected_lang)

            ocr_instance = cls.get_instance(detected_lang)
            return ocr_instance, detected_lang, detection_details

        except Exception as e:
            # Deliberate best-effort fallback; keep the traceback so the root
            # cause is not lost.
            logger.error(
                "Language detection failed: %s, defaulting to English",
                e,
                exc_info=True,
            )
            return cls.get_instance("en"), "en", None

    @classmethod
    def is_ready(cls, lang=None):
        """Return True if *lang* is loaded, or if any engine is loaded when *lang* is falsy."""
        if lang:
            return lang in cls._ocr_instances
        return bool(cls._ocr_instances)

    @classmethod
    def list_loaded_languages(cls):
        """Return a new list with the codes of the currently loaded engines."""
        return list(cls._ocr_instances.keys())

    @classmethod
    def clear_cache(cls):
        """Drop every cached engine (use with caution: later calls reload the models)."""
        with cls._lock:
            logger.warning("Clearing all PaddleOCR instances from cache")
            cls._ocr_instances.clear()
            logger.info("PaddleOCR cache cleared")
@@ -0,0 +1,24 @@
1
+ import os
2
+ import sys
3
+ from .config import logger
4
+
5
+
6
def _server_command_names(argv):
    """Return the names to test for server commands: basename of argv[0] plus the other args."""
    if not argv:
        return set()
    # argv[0] is usually a full path ("/usr/local/bin/gunicorn"), so compare
    # its basename; the remaining arguments are compared verbatim.
    return {os.path.basename(argv[0]), *argv[1:]}


def startup_services():
    """Pre-load the PaddleOCR engines when the process is a web server.

    The engines are loaded when any of the following holds:

    * the program is ``gunicorn`` or ``uwsgi``;
    * ``runserver`` was requested and the ``RUN_MAIN`` environment variable is
      ``"true"`` (presumably the child process of the development
      auto-reloader - confirm), so the parent watcher does not load models;
    * ``runserver`` was requested with ``--noreload``.

    Nothing happens for any other command. Failures are logged and swallowed
    so a broken OCR setup does not stop the application from starting.
    """
    try:
        invoked = _server_command_names(sys.argv)
        is_prod = "gunicorn" in invoked or "uwsgi" in invoked
        if not (is_prod or "runserver" in invoked):
            return

        is_dev_reloader = os.environ.get("RUN_MAIN") == "true"
        is_manual_runserver = "--noreload" in sys.argv
        if not (is_dev_reloader or is_manual_runserver or is_prod):
            return

        # Imported here so the OCR dependency is only imported by processes
        # that actually serve requests.
        from .services import PaddleOCRService

        if PaddleOCRService.is_ready():
            logger.info("PaddleOCR already initialized, reusing instance.")
        else:
            PaddleOCRService.initialize()
    except ImportError:
        logger.error("Failed to import PaddleOCRService during app startup")
    except Exception as e:
        logger.error("Failed to initialize PaddleOCR during app startup: %s", e)
@@ -0,0 +1,57 @@
1
+ from .common_utils import (
2
+ ensure_bytesio,
3
+ preprocess_image,
4
+ create_text_data,
5
+ extract_data_with_boxes,
6
+ )
7
+ from .cedula_utils import draw_bounding_boxes, convert_spanish_date_to_english
8
+ from .extract_cedula_signature import (
9
+ find_expira_block,
10
+ fallback_signature_detection,
11
+ identify_signature_box,
12
+ process_signature_to_bw,
13
+ extract_signature_image,
14
+ )
15
+ from .passport_utils import (
16
+ clean_passport_number,
17
+ parse_mrz_date,
18
+ parse_mrz_lines,
19
+ aggressive_clean_pob,
20
+ is_clean_place_name,
21
+ extract_mrz_data,
22
+ extract_place_of_birth,
23
+ )
24
+ from .passport_language_detector import (
25
+ PassportLanguageDetector,
26
+ detect_passport_language,
27
+ get_passport_language_details,
28
+ )
29
+
30
+ __all__ = [
31
+ # Common utilities
32
+ "ensure_bytesio",
33
+ "preprocess_image",
34
+ "create_text_data",
35
+ "extract_data_with_boxes",
36
+ # Cedula utilities
37
+ "draw_bounding_boxes",
38
+ "convert_spanish_date_to_english",
39
+ # Cedula signature extraction
40
+ "find_expira_block",
41
+ "fallback_signature_detection",
42
+ "identify_signature_box",
43
+ "process_signature_to_bw",
44
+ "extract_signature_image",
45
+ # Passport utilities
46
+ "clean_passport_number",
47
+ "parse_mrz_date",
48
+ "parse_mrz_lines",
49
+ "aggressive_clean_pob",
50
+ "is_clean_place_name",
51
+ "extract_mrz_data",
52
+ "extract_place_of_birth",
53
+ # Passport language detection
54
+ "PassportLanguageDetector",
55
+ "detect_passport_language",
56
+ "get_passport_language_details",
57
+ ]
@@ -0,0 +1,155 @@
1
+ import re
2
+ import cv2
3
+ import numpy as np
4
+
5
+ from ..config import logger as default_logger
6
+
7
+ # =============================================================================
8
+ # VISUALIZATION UTILITY
9
+ # =============================================================================
10
+
11
+
12
def _short_label(prefix, text, limit):
    """Return ``prefix + text``, cutting *text* to *limit* characters plus '...' when longer."""
    if len(text) > limit:
        return f"{prefix}{text[:limit]}..."
    return f"{prefix}{text}"


def draw_bounding_boxes(image, extracted_data, signature_box=None, logger=None):
    """Draw OCR bounding boxes and text labels on a copy of the cedula image.

    Args:
        image (np.ndarray): Original cedula image; it is copied, never modified.
        extracted_data (list): Dictionaries from OCR extraction. Each must
            provide ``"bbox"`` (a sequence of ``(x, y)`` points) and ``"text"``.
        signature_box (dict, optional): The element of ``extracted_data``
            identified as the signature area. It is matched by identity, so it
            must be the very same object, and is highlighted in red.
        logger (logging.Logger, optional): Logger for debug messages.
            Defaults to the module's default logger.

    Returns:
        np.ndarray: Copy of the input image with boxes and labels drawn.
        Regular boxes are green (thickness 2) and numbered from 1; the
        signature box is red (thickness 3) and labelled ``SIGNATURE:``.

    Note:
        - Label text is cut to 15 characters (20 for the signature) plus "...".
        - A filled white rectangle is drawn behind each label for readability.
        - The label sits 10 px above the box; if that would place it less than
          20 px from the top of the image it is drawn 25 px below the box.

    Examples:
        >>> vis_image = draw_bounding_boxes(image, extracted_data, signature_box)
        >>> cv2.imshow("Visualization", vis_image)
        >>> cv2.waitKey(0)
    """
    if logger is None:
        logger = default_logger

    logger.debug("Drawing bounding boxes for visualization")

    # Draw on a copy so the caller's image stays untouched.
    vis_image = image.copy()

    for number, item in enumerate(extracted_data, start=1):
        text = item["text"]
        points = np.array(item["bbox"], dtype=np.int32)

        if signature_box is not None and item is signature_box:
            color, thickness = (0, 0, 255), 3  # red (BGR)
            label = _short_label("SIGNATURE: ", text, 20)
        else:
            color, thickness = (0, 255, 0), 2  # green (BGR)
            label = _short_label(f"{number}: ", text, 15)

        cv2.polylines(vis_image, [points], True, color, thickness)

        # Box extents, computed once per box and reused for all label drawing.
        x_left = int(points[:, 0].min())
        label_y = int(points[:, 1].min()) - 10
        if label_y < 20:
            # Too close to the top edge: put the label under the box instead.
            label_y = int(points[:, 1].max()) + 25

        # White backdrop sized to the rendered label.
        (text_width, text_height), _ = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
        )
        cv2.rectangle(
            vis_image,
            (x_left, label_y - text_height - 5),
            (x_left + text_width, label_y + 5),
            (255, 255, 255),
            -1,
        )

        cv2.putText(
            vis_image,
            label,
            (x_left, label_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
        )

    logger.debug("Bounding box visualization completed")

    return vis_image
103
+
104
+
105
+ # =============================================================================
106
+ # DATE CONVERSION UTILITY
107
+ # =============================================================================
108
+
109
+
110
def convert_spanish_date_to_english(date_str):
    """Translate the Spanish month abbreviation in a DD-MMM-YYYY date to English.

    The first ``DD-MMM-YYYY`` token found in ``date_str`` (matched
    case-insensitively) is extracted and returned in upper case with its
    month abbreviation translated through ``SPANISH_TO_ENGLISH_MONTHS``.

    Args:
        date_str (str): Text containing a date such as "14-AGO-1947".
            ``None``, empty and non-string values are accepted.

    Returns:
        str: The date token with an English month, e.g. "14-AUG-1947".
        The input is returned unchanged when it is falsy, not a string, or
        contains no ``DD-MMM-YYYY`` token.

    Note:
        - Only the matched date token is returned; any text around it in
          ``date_str`` is dropped and the token is upper-cased.
        - A three-letter month that is not a known Spanish abbreviation
          (for example one that is already English) is passed through as is.
        - The day and year must not touch other digits, so "2014-AGO-1947"
          and "14-AGO-19471" are treated as "no date" rather than being
          silently truncated to a different date.

    Examples:
        >>> convert_spanish_date_to_english("14-AGO-1947")
        '14-AUG-1947'
        >>> convert_spanish_date_to_english("23-MAR-1940")
        '23-MAR-1940'  # MAR is the same in both languages
        >>> convert_spanish_date_to_english("invalid-date")
        'invalid-date'  # Returns original if no match
        >>> convert_spanish_date_to_english(None)
        None  # Handles None input gracefully
    """
    if not date_str or not isinstance(date_str, str):
        return date_str

    # The digit look-arounds stop the day/year from being carved out of a
    # longer run of digits.
    date_pattern = r"(?<!\d)(\d{1,2})-([A-Z]{3})-(\d{4})(?!\d)"
    match = re.search(date_pattern, date_str.upper())
    if match is None:
        return date_str

    from ..config import SPANISH_TO_ENGLISH_MONTHS

    day, month, year = match.groups()
    english_month = SPANISH_TO_ENGLISH_MONTHS.get(month, month)
    return f"{day}-{english_month}-{year}"