document-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,277 @@
1
+ import re
2
+ import cv2
3
+ import numpy as np
4
+ from typing import Optional, Dict, Tuple, Union
5
+
6
+ from .common_utils import ensure_bytesio, preprocess_image
7
+ from ..config import (
8
+ logger,
9
+ SPANISH_COUNTRIES,
10
+ SPANISH_KEYWORDS,
11
+ COUNTRY_NAME_MAPPINGS,
12
+ )
13
+
14
+
15
class PassportLanguageDetector:
    """
    Passport language detector for determining optimal OCR language.

    This class analyzes passport documents to determine whether Spanish or English
    OCR would provide better results based on document content and country indicators.

    Example:
        >>> detector = PassportLanguageDetector()
        >>> language, details = detector.detect_with_details(passport_file)
        >>> print(f"Use {language} OCR for this passport")

        # Or use the class method directly (it returns the same tuple):
        >>> language, details = PassportLanguageDetector.detect_passport_language(
        ...     passport_file
        ... )
    """

    # Characters whose presence in the (upper-cased) OCR text counts as a
    # Spanish indicator.
    _SPANISH_CHARS = ("Ñ", "Á", "É", "Í", "Ó", "Ú", "Ü")

    # Phrase patterns matched against the upper-cased OCR text; compiled once
    # at class creation instead of on every call.
    _SPANISH_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r"REPUBLICA\s+DE",
            r"REPÚBLICA\s+DE",
            r"LUGAR\s+DE\s+NAC",
            r"FECHA\s+DE\s+NAC",
            r"DOCUMENTO\s+DE\s+IDENTIDAD",
            r"CEDULA\s+DE",
            r"CÉDULA\s+DE",
            r"PASAPORTE\s+DE",
        )
    )

    def __init__(self, default_confidence_threshold=3):
        """
        Initialize the detector.

        Args:
            default_confidence_threshold: Minimum score required to detect Spanish
        """
        self.confidence_threshold = default_confidence_threshold
        self.logger = logger

    def _prepare_image(self, passport_file):
        """Convert various input types to an OpenCV image.

        Args:
            passport_file: A numpy array (copied), a file path string, or any
                object accepted by ``ensure_bytesio``.

        Returns:
            The image array, or None when an exception occurs while loading
            (the error is logged). ``cv2.imread``/``cv2.imdecode`` results are
            returned as-is, so the caller must also handle a None from them.
        """
        try:
            if isinstance(passport_file, np.ndarray):
                # Copy so later processing never mutates the caller's array.
                return passport_file.copy()
            if isinstance(passport_file, str):
                # File path
                return cv2.imread(passport_file)

            # File object or BytesIO
            passport_stream = ensure_bytesio(passport_file)
            image_data = np.frombuffer(passport_stream.read(), np.uint8)
            return cv2.imdecode(image_data, cv2.IMREAD_COLOR)
        except Exception as e:
            self.logger.error(f"Error preparing image: {e}")
            return None

    def _get_ocr_instance(self, ocr_instance):
        """Return the supplied OCR instance, or fetch the shared "es" one."""
        if ocr_instance is not None:
            return ocr_instance

        # Imported lazily so the OCR service is only loaded when no instance
        # was supplied by the caller.
        from ..services.paddleocr_service import PaddleOCRService

        return PaddleOCRService.get_instance("es")

    def _extract_text_from_image(self, image: np.ndarray, ocr):
        """Extract all text from image using OCR.

        Returns:
            The recognised text fragments, upper-cased and joined with single
            spaces, or "" when OCR yields nothing or raises (error is logged).
        """
        try:
            # Preprocess image for better OCR
            processed_image = preprocess_image(image)

            # Run OCR
            results = ocr.predict(processed_image)

            if not results or not results[0]:
                return ""

            # Each usable entry is indexed as line[1][0] for its text; entries
            # with fewer than two elements are skipped.
            fragments = [
                line[1][0].upper().strip() for line in results[0] if len(line) >= 2
            ]
            return " ".join(fragments).strip()

        except Exception as e:
            self.logger.error(f"Error extracting text: {e}")
            return ""

    def _analyze_language_indicators(self, text):
        """Analyze text for Spanish language indicators.

        Args:
            text: Upper-cased OCR text.

        Returns:
            Dict with ``spanish_score`` (weighted sum), ``total_indicators``
            (count of hits) and ``found_indicators`` (hits grouped by method).
        """
        spanish_score = 0
        total_indicators = 0
        found_indicators = {
            "spanish_keywords": [],
            "spanish_countries": [],
            "spanish_chars": [],
            "spanish_patterns": [],
            "country_names": [],
        }

        # Method 1: Spanish keywords (whole whitespace-separated tokens, +3 each)
        for word in text.split():
            if word in SPANISH_KEYWORDS:
                spanish_score += 3
                total_indicators += 1
                found_indicators["spanish_keywords"].append(word)

        # Method 2: Spanish country codes (substring match, +5 each)
        for country in SPANISH_COUNTRIES:
            if country in text:
                spanish_score += 5
                total_indicators += 1
                found_indicators["spanish_countries"].append(country)

        # Method 3: Spanish-specific characters (+2 per distinct character)
        for char in self._SPANISH_CHARS:
            if char in text:
                spanish_score += 2
                total_indicators += 1
                found_indicators["spanish_chars"].append(char)

        # Method 4: Spanish phrase patterns (+3 per pattern that matches at
        # least once; every individual match counts as an indicator)
        for pattern in self._SPANISH_PATTERNS:
            matches = pattern.findall(text)
            if matches:
                spanish_score += 3
                total_indicators += len(matches)
                found_indicators["spanish_patterns"].extend(matches)

        # Method 5: Country names that map to a Spanish country code (+4 each)
        for country_name, code in COUNTRY_NAME_MAPPINGS.items():
            if country_name in text and code in SPANISH_COUNTRIES:
                spanish_score += 4
                total_indicators += 1
                found_indicators["country_names"].append(country_name)

        return {
            "spanish_score": spanish_score,
            "total_indicators": total_indicators,
            "found_indicators": found_indicators,
        }

    def _calculate_confidence(self, spanish_score, total_indicators):
        """Calculate confidence level ("high"/"medium"/"low") from the metrics."""
        if spanish_score >= 8 or total_indicators >= 4:
            return "high"
        if spanish_score >= 3 or total_indicators >= 2:
            return "medium"
        return "low"

    def detect_with_details(self, passport_file, ocr_instance=None) -> Tuple[str, Dict]:
        """
        Detect language with detailed analysis information.

        Args:
            passport_file: File object, BytesIO, image path, or numpy array
            ocr_instance: Optional pre-initialized OCR instance

        Returns:
            Tuple[str, Dict]: (detected_language, analysis_details). On any
            failure the language falls back to "en" and the dict carries an
            "error" message with "low" confidence.
        """
        try:
            # Convert input to OpenCV image
            image = self._prepare_image(passport_file)
            if image is None:
                return "en", {"error": "Could not decode image", "confidence": "low"}

            # Get OCR instance
            ocr = self._get_ocr_instance(ocr_instance)

            # Extract text
            all_text = self._extract_text_from_image(image, ocr)
            if not all_text:
                return "en", {"error": "No text extracted", "confidence": "low"}

            # Analyze for language indicators
            analysis = self._analyze_language_indicators(all_text)

            # Make decision
            detected_language = (
                "es" if analysis["spanish_score"] >= self.confidence_threshold else "en"
            )
            confidence = self._calculate_confidence(
                analysis["spanish_score"], analysis["total_indicators"]
            )

            # Prepare detailed results
            details = {
                "detected_language": detected_language,
                "confidence": confidence,
                "spanish_score": analysis["spanish_score"],
                "total_indicators": analysis["total_indicators"],
                "found_indicators": analysis["found_indicators"],
                "text_sample": (
                    all_text[:300] + "..." if len(all_text) > 300 else all_text
                ),
                "analysis_method": "PassportLanguageDetector",
            }

            self.logger.info(
                f"Passport language detected: {detected_language} (confidence: {confidence})"
            )
            return detected_language, details

        except Exception as e:
            self.logger.error(f"Language detection failed: {e}")
            return "en", {"error": str(e), "confidence": "low"}

    @classmethod
    def detect_passport_language(
        cls, passport_file, ocr_instance=None, confidence_threshold=3
    ):
        """
        Class method for detecting passport language.

        Args:
            passport_file: File object, BytesIO, or image array
            ocr_instance: Optional pre-initialized OCR instance
            confidence_threshold: Minimum score for Spanish detection

        Returns:
            Tuple[str, Dict]: (detected_language, analysis_details)
        """
        detector = cls(confidence_threshold)
        return detector.detect_with_details(passport_file, ocr_instance)
243
+
244
+
245
+ # Convenience functions for easy import and use
246
def detect_passport_language(passport_file, ocr_instance=None):
    """
    Convenience function for detecting passport language.

    Args:
        passport_file: File object, BytesIO, or image array
        ocr_instance: Optional pre-initialized OCR instance

    Returns:
        str: 'es' for Spanish, 'en' for English
    """
    # detect_with_details is an instance method: calling it on the class would
    # bind passport_file as `self` and shift every argument, so create a
    # detector (default threshold) first.
    detector = PassportLanguageDetector()
    detected_language, _ = detector.detect_with_details(passport_file, ocr_instance)
    return detected_language
261
+
262
+
263
def get_passport_language_details(passport_file, ocr_instance=None):
    """
    Get detailed passport language detection information.

    Args:
        passport_file: File object, BytesIO, or image array
        ocr_instance: Optional pre-initialized OCR instance

    Returns:
        Dict: Detailed detection information including confidence and indicators
    """
    # detect_with_details is an instance method: calling it on the class would
    # bind passport_file as `self` and shift every argument, so create a
    # detector (default threshold) first.
    detector = PassportLanguageDetector()
    _, details = detector.detect_with_details(passport_file, ocr_instance)
    return details
@@ -0,0 +1,260 @@
1
+ import re
2
+
3
+ from ..config import FORBIDDEN_TERMS, BIRTH_PLACE_INDICATORS, ENGLISH_MONTHS
4
+
5
+
6
def clean_passport_number(raw_number):
    """
    Clean passport number by fixing OCR mistakes.

    - Replace 0 with O only when the zero sits between two letters, where it
      is almost certainly a misread letter O.
    - Remove MRZ filler (<) and any other non-alphanumeric characters.

    Args:
        raw_number: Raw passport-number field text (may be empty or None).

    Returns:
        str: The cleaned number containing only A-Z and 0-9 ("" for empty input).
    """
    if not raw_number:
        return ""

    number = raw_number.strip().replace("<", "")

    # Lookarounds keep the neighbouring letters out of the match, so runs such
    # as "A0B0C" are fully corrected. A zero that touches a digit on either
    # side is left alone: rewriting it would corrupt valid numbers such as
    # "C01X00T47".
    number = re.sub(r"(?<=[A-Z])0(?=[A-Z])", "O", number)

    # Keep only alphanumeric
    return re.sub(r"[^A-Z0-9]", "", number)
23
+
24
+
25
def parse_mrz_date(date_str, logger=None):
    """Parse MRZ date format (YYMMDD) to DD-MMM-YYYY format.

    Args:
        date_str: Six-character MRZ date field.
        logger: Optional logger; a warning is emitted for malformed dates.

    Returns:
        str: The formatted date, or "" when the field is not exactly six
        digits or does not describe a real calendar date.
    """
    # Local import keeps this function self-contained.
    from datetime import date

    if not isinstance(date_str, str) or len(date_str) != 6:
        return ""

    try:
        # int() alone would accept signs, whitespace and non-ASCII digits;
        # an MRZ date consists strictly of the characters 0-9.
        if not re.fullmatch(r"[0-9]{6}", date_str):
            raise ValueError("MRZ date must contain six digits")

        year = int(date_str[:2])
        month = int(date_str[2:4])
        day = int(date_str[4:6])

        # Two-digit year pivot: assume all passport dates are between
        # 1950 and 2049 (YY >= 50 -> 19YY, otherwise 20YY).
        year += 1900 if year >= 50 else 2000

        # Raises ValueError for impossible dates (month 13, day 00, Feb 30...).
        date(year, month, day)

        return f"{day:02d}-{ENGLISH_MONTHS[month]}-{year}"

    except (ValueError, IndexError, KeyError):
        if logger:
            logger.warning(f"Invalid MRZ date format: {date_str}")

    return ""
50
+
51
+
52
def parse_mrz_lines(mrz_lines, logger=None):
    """Parse MRZ lines to extract passport information.

    Only the second MRZ line is read, using fixed character positions:
    passport number [0:9], nationality [10:13], date of birth [13:19] and
    expiry date [21:27].

    Args:
        mrz_lines: MRZ text lines ordered top to bottom.
        logger: Optional logger for diagnostics.

    Returns:
        dict: Keys "date_of_birth", "nationality", "expiry_date" and
        "passport_number"; values stay "" for anything that cannot be parsed.
    """
    passport_info = {
        "date_of_birth": "",
        "nationality": "",
        "expiry_date": "",
        "passport_number": "",
    }

    if len(mrz_lines) < 2:
        if logger:
            logger.warning("Insufficient MRZ lines for parsing")
        return passport_info

    try:
        # The MRZ alphabet has no whitespace, so any space or tab is an OCR
        # artifact that would shift the fixed-position fields below.
        line2 = "".join(mrz_lines[1].split())

        if len(line2) < 27 and logger:
            logger.warning(
                f"MRZ line 2 has only {len(line2)} characters; some fields may be empty"
            )

        # Passport Number
        passport_info["passport_number"] = clean_passport_number(line2[0:9])

        # Nationality: codes shorter than three letters are padded with the
        # "<" filler (e.g. "D<<"), which is not part of the code itself.
        passport_info["nationality"] = line2[10:13].replace("<", "")

        # DOB (YYMMDD at index 13-19)
        passport_info["date_of_birth"] = parse_mrz_date(line2[13:19], logger)

        # Expiry Date (YYMMDD at index 21-27)
        passport_info["expiry_date"] = parse_mrz_date(line2[21:27], logger)

        if logger:
            logger.debug(f"Parsed MRZ data: {passport_info}")

    except Exception as e:
        # Best-effort parsing: return whatever was filled in so far.
        if logger:
            logger.error(f"Error parsing MRZ: {str(e)}")

    return passport_info
92
+
93
+
94
def aggressive_clean_pob(text):
    """Aggressively clean POB text to remove document field contamination.

    Steps: trim punctuation, drop OCR artifacts, cut the text at the earliest
    field-label separator, remove trailing label fragments, trim again.

    Args:
        text: Raw place-of-birth candidate (may be empty or None).

    Returns:
        str: The cleaned candidate, or "" for empty input.
    """
    if not text:
        return ""

    # First basic cleanup
    cleaned = text.strip(" :/.,;-")

    # Remove OCR artifacts (longer sequences first so they are removed whole)
    for artifact in ("<<<", ">>>", "<<", ">>", "||", "|"):
        cleaned = cleaned.replace(artifact, "")

    # AGGRESSIVE: cut at the EARLIEST separator found in the text. Searching
    # the alternation once finds the leftmost match; trying separators one by
    # one in list order could cut at a later one and keep contamination.
    separators = [
        r"\s+m/",
        r"\s+rte",
        r"\s+/Place",
        r"\s+Place\s+of",
        r"\s+Lugar\s+de",
        r"\s+Date\s+of",
        r"\s+Authority",
        r"\s+Fecha\s+de",
        r"\s+SURAT",
    ]
    first_separator = re.search("|".join(separators), cleaned, flags=re.IGNORECASE)
    if first_separator:
        cleaned = cleaned[: first_separator.start()].strip()

    # Remove any trailing fragments that look like document fields. The
    # repeated group strips stacked fragments ("... Pl of") in one pass.
    unwanted_endings = ["m", "rt", "rte", "/P", "Pl", "Place", "of", "Issue", "Auth"]
    trailing_fragments = r"(?:\s+(?:" + "|".join(unwanted_endings) + r"))+$"
    cleaned = re.sub(trailing_fragments, "", cleaned, flags=re.IGNORECASE)

    # Final cleanup
    return cleaned.strip(" /:-.,")
146
+
147
+
148
def is_clean_place_name(text):
    """Very strict validation for place names.

    A candidate is accepted only when, after trimming surrounding whitespace,
    it is 3-40 characters long, at least 70% of its characters (ignoring
    spaces and commas) are letters, and it contains no forbidden
    document-field term.

    Args:
        text: Candidate place name (may be empty or None).

    Returns:
        bool: True when the candidate looks like a clean place name.
    """
    if not text:
        return False

    # Validate the trimmed text so padding cannot satisfy the length check.
    candidate = text.strip()

    # Reasonable length for place names
    if len(candidate) < 3 or len(candidate) > 40:
        return False

    # Must be mostly alphabetic (allow spaces, commas, but not too many special chars)
    alpha_chars = sum(1 for c in candidate if c.isalpha())
    total_chars = len(candidate.replace(" ", "").replace(",", ""))

    # A candidate made only of spaces/commas has nothing to validate: reject
    # it instead of skipping the ratio test.
    if total_chars == 0 or alpha_chars / total_chars < 0.7:  # At least 70% letters
        return False

    # Reject if contains document field indicators
    candidate_upper = candidate.upper()
    return not any(term in candidate_upper for term in FORBIDDEN_TERMS)
172
+
173
+
174
def extract_mrz_data(extracted_data, logger=None):
    """Extract and parse MRZ (Machine Readable Zone) data.

    Args:
        extracted_data (list): List of text data from OCR extraction. Each
            item is read via its "text" key; MRZ candidates are additionally
            read via "center_y".
        logger: Logger instance for logging.

    Returns:
        dict: Parsed MRZ data containing passport info, or {} when no MRZ
        line is found.
    """
    if logger:
        logger.debug("Starting MRZ data extraction")

    # Find MRZ lines in a single pass. MRZ lines are typically long, contain
    # < characters, and are mostly uppercase. The stripped text is kept so the
    # fixed-position parsing downstream is not shifted by OCR padding.
    mrz_candidates = []
    for item in extracted_data:
        text = item["text"].strip()
        if len(text) > 20 and "<" in text and text.isupper():
            mrz_candidates.append((item["center_y"], text))

    if not mrz_candidates:
        if logger:
            logger.warning("No MRZ lines found")
        return {}

    # Sort MRZ lines by vertical position (top to bottom); sorting on the
    # position alone keeps the original order for equal positions.
    mrz_candidates.sort(key=lambda candidate: candidate[0])
    mrz_lines = [text for _, text in mrz_candidates]

    if logger:
        logger.debug(f"Found {len(mrz_lines)} MRZ lines")

    return parse_mrz_lines(mrz_lines, logger)
214
+
215
+
216
def extract_place_of_birth(extracted_data, logger=None):
    """Extract place of birth from passport OCR data.

    For each OCR line containing a birth-place indicator, the text following
    the indicator on that line is tried first (for every matching indicator,
    which matters for bilingual labels); only then are the next three lines
    tried.

    Args:
        extracted_data (list): List of text data from OCR extraction; each
            item is read via its "text" key.
        logger: Logger instance for logging.

    Returns:
        str: Extracted place of birth or empty string. A same-line match is
        returned upper-cased; a following-line match keeps its original case.
    """
    if logger:
        logger.debug("Searching for place of birth")

    for i, item in enumerate(extracted_data):
        text = item["text"].upper()

        matching_indicators = [
            indicator for indicator in BIRTH_PLACE_INDICATORS if indicator in text
        ]
        if not matching_indicators:
            continue

        # Method 1: extract from the same line (after the indicator). Every
        # matching indicator is tried before falling back to later lines, so
        # a second label on the same line cannot be skipped.
        for indicator in matching_indicators:
            if logger:
                logger.debug(f"Found birth place indicator: '{indicator}'")

            parts = text.split(indicator)
            if len(parts) > 1 and parts[1].strip():
                candidate = aggressive_clean_pob(parts[1])
                if is_clean_place_name(candidate):
                    if logger:
                        logger.debug(f"Found POB on same line: '{candidate}'")
                    return candidate

        # Method 2: check next few lines with aggressive cleaning. The slice
        # stops at the end of the list, so no extra bounds check is needed.
        following_items = extracted_data[i + 1 : i + 4]
        for offset, following in enumerate(following_items, start=1):
            candidate = aggressive_clean_pob(following["text"])

            if is_clean_place_name(candidate):
                if logger:
                    logger.debug(f"Found POB on line +{offset}: '{candidate}'")
                return candidate

    return ""