idvpackage 3.0.11__py3-none-any.whl → 3.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idvpackage/common.py +8 -966
- idvpackage/iraq_id_extraction_withopenai.py +374 -893
- idvpackage/jor_passport_extraction.py +1 -6
- idvpackage/liveness_spoofing_v2.py +2 -45
- idvpackage/ocr.py +1016 -2430
- idvpackage/ocr_utils.py +148 -489
- idvpackage/pse_passport_extraction.py +18 -292
- idvpackage/qatar_id_extraction.py +4 -956
- idvpackage/sudan_passport_extraction.py +0 -928
- idvpackage/syr_passport_extraction.py +27 -402
- idvpackage/uae_id_extraction.py +87 -151
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/METADATA +1 -1
- idvpackage-3.0.13.dist-info/RECORD +34 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/WHEEL +1 -1
- idvpackage/ekyc.py +0 -78
- idvpackage/genai_utils.py +0 -309
- idvpackage/iraq_id_extraction.py +0 -992
- idvpackage/iraq_passport_extraction.py +0 -588
- idvpackage/lazy_imports.py +0 -44
- idvpackage/lebanon_passport_extraction.py +0 -161
- idvpackage/sau_id_extraction.py +0 -248
- idvpackage/sudan_id_extraction.py +0 -764
- idvpackage-3.0.11.dist-info/RECORD +0 -42
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/licenses/LICENSE +0 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/top_level.txt +0 -0
|
@@ -1,931 +1,3 @@
|
|
|
1
|
-
# import re
|
|
2
|
-
# from datetime import datetime
|
|
3
|
-
# import pycountry
|
|
4
|
-
# from rapidfuzz import fuzz
|
|
5
|
-
# from idvpackage.common import *
|
|
6
|
-
# import json
|
|
7
|
-
# import time
|
|
8
|
-
# import openai
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# def convert_expiry_date(input_date):
|
|
12
|
-
# day = input_date[4:6]
|
|
13
|
-
# month = input_date[2:4]
|
|
14
|
-
# year = input_date[0:2]
|
|
15
|
-
|
|
16
|
-
# current_year = datetime.now().year
|
|
17
|
-
# current_century = current_year // 100
|
|
18
|
-
# current_year_last_two_digits = current_year % 100
|
|
19
|
-
# century = current_century
|
|
20
|
-
|
|
21
|
-
# if int(year) <= current_year_last_two_digits:
|
|
22
|
-
# century = current_century
|
|
23
|
-
# else:
|
|
24
|
-
# century = current_century
|
|
25
|
-
# final_date = f"{day}/{month}/{century}{year}"
|
|
26
|
-
|
|
27
|
-
# return final_date
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
# def get_dates_to_generic_format(date):
|
|
31
|
-
# formats = ["%d/%m/%Y", "%Y/%m/%d"]
|
|
32
|
-
# for fmt in formats:
|
|
33
|
-
# try:
|
|
34
|
-
# return datetime.strptime(date, fmt).strftime("%d/%m/%Y")
|
|
35
|
-
# except ValueError:
|
|
36
|
-
# pass
|
|
37
|
-
# return None
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# def validate_date(date):
|
|
41
|
-
# try:
|
|
42
|
-
# date = datetime.strptime(date, "%d-%m-%Y")
|
|
43
|
-
# return date.strftime("%d-%m-%Y")
|
|
44
|
-
# except ValueError:
|
|
45
|
-
# try:
|
|
46
|
-
# date = datetime.strptime(date, "%d/%m/%Y")
|
|
47
|
-
# return date.strftime("%d/%m/%Y")
|
|
48
|
-
# except:
|
|
49
|
-
# return ''
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
# def load_nationality_keywords():
|
|
53
|
-
# countries = pycountry.countries
|
|
54
|
-
# nationality_keywords = set()
|
|
55
|
-
|
|
56
|
-
# # Common suffixes for demonyms
|
|
57
|
-
# demonym_suffixes = ['ian', 'ese', 'ish', 'i', 'ic', 'an', 'nian']
|
|
58
|
-
|
|
59
|
-
# for country in countries:
|
|
60
|
-
# nationality_keywords.add(country.name.upper())
|
|
61
|
-
# nationality_keywords.add(country.alpha_3.upper())
|
|
62
|
-
|
|
63
|
-
# # Adding guessed demonyms
|
|
64
|
-
# for suffix in demonym_suffixes:
|
|
65
|
-
# demonym = country.name.upper() + suffix
|
|
66
|
-
# nationality_keywords.add(demonym.upper())
|
|
67
|
-
|
|
68
|
-
# # Add common demonyms if the official name is available
|
|
69
|
-
# if hasattr(country, 'official_name'):
|
|
70
|
-
# nationality_keywords.add(country.official_name.upper())
|
|
71
|
-
# for suffix in demonym_suffixes:
|
|
72
|
-
# demonym = country.official_name.upper() + suffix
|
|
73
|
-
# nationality_keywords.add(demonym.upper())
|
|
74
|
-
|
|
75
|
-
# return nationality_keywords
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
# def convert_to_mrz_date(date_str):
|
|
79
|
-
# if date_str:
|
|
80
|
-
# try:
|
|
81
|
-
# month, day, year = date_str.split('/')
|
|
82
|
-
|
|
83
|
-
# year_last_two_digits = year[-2:]
|
|
84
|
-
|
|
85
|
-
# mrz_date = year_last_two_digits + month.zfill(2) + day.zfill(2)
|
|
86
|
-
|
|
87
|
-
# return mrz_date
|
|
88
|
-
# except:
|
|
89
|
-
# return ''
|
|
90
|
-
# else:
|
|
91
|
-
# return ''
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
# def find_nationality_in_text(text, nationality_keywords):
|
|
95
|
-
# import re
|
|
96
|
-
# for keyword in nationality_keywords:
|
|
97
|
-
# if re.search(r'\b' + re.escape(keyword) + r'\b', text):
|
|
98
|
-
# return keyword
|
|
99
|
-
# return None
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# def extract_pob_and_poi(passport_data, dob_for_match, passport_number_mrz):
|
|
103
|
-
# place_of_birth, place_of_issue = '', ''
|
|
104
|
-
# try:
|
|
105
|
-
# pattern = re.compile(rf"Nationality(.*?){dob_for_match}|Nation(.*?){dob_for_match}", re.DOTALL)
|
|
106
|
-
# match = pattern.search(passport_data)
|
|
107
|
-
# if match:
|
|
108
|
-
# substring = match.group(1) or match.group(2)
|
|
109
|
-
# if substring:
|
|
110
|
-
# capital_letters = re.findall(r'[A-Z]{2,}', substring)
|
|
111
|
-
# if not capital_letters or len(capital_letters) <= 1:
|
|
112
|
-
# match, substring = '', ''
|
|
113
|
-
|
|
114
|
-
# if not match:
|
|
115
|
-
# pattern = re.compile(rf"Place(.*?)PCSDN", re.DOTALL)
|
|
116
|
-
# match = pattern.search(passport_data)
|
|
117
|
-
|
|
118
|
-
# if not match:
|
|
119
|
-
# pattern_phsdn = re.compile(rf"{dob_for_match}(.*?)PHSDN", re.DOTALL)
|
|
120
|
-
# match = pattern_phsdn.search(passport_data)
|
|
121
|
-
|
|
122
|
-
# if match:
|
|
123
|
-
# substring = match.group(1)
|
|
124
|
-
# if substring:
|
|
125
|
-
# capital_letters = re.findall(r'[A-Z]{2,}', substring)
|
|
126
|
-
# if not capital_letters or len(capital_letters) <= 1:
|
|
127
|
-
# match, substring = '', ''
|
|
128
|
-
|
|
129
|
-
# if not match:
|
|
130
|
-
# pattern = re.compile(rf"{passport_number_mrz}(.*?){dob_for_match}", re.DOTALL)
|
|
131
|
-
# match = pattern.search(passport_data.replace('O', '0'))
|
|
132
|
-
|
|
133
|
-
# if match:
|
|
134
|
-
# substring_orig = match.group(1)
|
|
135
|
-
# lines = substring_orig.split('\n')
|
|
136
|
-
# arabic_and_english_pattern = re.compile(r'[\u0600-\u06FF].*[A-Z]|[A-Z].*[\u0600-\u06FF]')
|
|
137
|
-
# filtered_lines = [line for line in lines if
|
|
138
|
-
# arabic_and_english_pattern.search(line) and 'SDN' not in line]
|
|
139
|
-
# substring = '\n'.join(filtered_lines)
|
|
140
|
-
# substring = substring.replace('0', 'O')
|
|
141
|
-
# if substring:
|
|
142
|
-
# capital_letters = re.findall(r'[A-Z\d]{2,}', substring)
|
|
143
|
-
# if capital_letters and len(capital_letters) < 2:
|
|
144
|
-
# capital_letters = re.findall(r'[A-Z\d]{2,}', substring_orig)
|
|
145
|
-
# substring = '\n'.join(capital_letters)
|
|
146
|
-
# else:
|
|
147
|
-
# substring = substring
|
|
148
|
-
|
|
149
|
-
# if substring:
|
|
150
|
-
# # print(f'SUBSTRING: {substring}')
|
|
151
|
-
# capital_letters = re.findall(r'[A-Z]{2,}', substring)
|
|
152
|
-
# capital_letters = [re.sub(r'\d+', '', i) for i in capital_letters]
|
|
153
|
-
# capital_letters = [i for i in capital_letters if not (len(i) <= 2 or i == 'SDN' or i == '') or i == 'AL']
|
|
154
|
-
# # print(f'CAPS NEW: {capital_letters}')
|
|
155
|
-
# # for item in ['SDN', 'MI', 'MY', 'MA', 'SS', 'MS', 'ME']:
|
|
156
|
-
# # if item in capital_letters:
|
|
157
|
-
# # capital_letters.remove(item)
|
|
158
|
-
|
|
159
|
-
# if len(capital_letters) > 2 and ('AL' in capital_letters or 'NEW' in capital_letters) and (
|
|
160
|
-
# capital_letters[0] == 'AL' or capital_letters[0] == 'NEW'):
|
|
161
|
-
# place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
|
|
162
|
-
# place_of_issue = capital_letters[2]
|
|
163
|
-
# elif len(capital_letters) <= 3:
|
|
164
|
-
# if len(capital_letters) > 2:
|
|
165
|
-
# place_of_birth = capital_letters[0]
|
|
166
|
-
# place_of_issue = capital_letters[1] + ' ' + capital_letters[2]
|
|
167
|
-
|
|
168
|
-
# else:
|
|
169
|
-
# place_of_birth = capital_letters[0]
|
|
170
|
-
# place_of_issue = capital_letters[1]
|
|
171
|
-
# else:
|
|
172
|
-
# place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
|
|
173
|
-
# place_of_issue = capital_letters[2] + ' ' + capital_letters[3]
|
|
174
|
-
# except:
|
|
175
|
-
# place_of_birth, place_of_issue = '', ''
|
|
176
|
-
|
|
177
|
-
# return place_of_birth, place_of_issue
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
# def find_names_with_context(text, keywords):
|
|
181
|
-
# lines = text.strip().split('\n')
|
|
182
|
-
# keyword_set = set(keywords)
|
|
183
|
-
|
|
184
|
-
# arabic_word_pattern = re.compile(r'[\u0600-\u06FF]+')
|
|
185
|
-
# english_word_pattern = re.compile(r'[A-Za-z]+')
|
|
186
|
-
# arabic_name_candidates = []
|
|
187
|
-
|
|
188
|
-
# def contains_three_arabic_words(line):
|
|
189
|
-
# return len(arabic_word_pattern.findall(line)) >= 3
|
|
190
|
-
|
|
191
|
-
# def is_mixed_language(line):
|
|
192
|
-
# return bool(arabic_word_pattern.search(line)) and bool(english_word_pattern.search(line))
|
|
193
|
-
|
|
194
|
-
# for i, line in enumerate(lines):
|
|
195
|
-
# words = set(line.split())
|
|
196
|
-
# if words & keyword_set:
|
|
197
|
-
# if i > 0 and contains_three_arabic_words(lines[i - 1]) and not is_mixed_language(lines[i - 1]):
|
|
198
|
-
# arabic_name_candidates.append((line, lines[i - 1]))
|
|
199
|
-
# elif i < len(lines) - 1 and contains_three_arabic_words(lines[i + 1]) and not is_mixed_language(
|
|
200
|
-
# lines[i + 1]):
|
|
201
|
-
# arabic_name_candidates.append((line, lines[i + 1]))
|
|
202
|
-
|
|
203
|
-
# return arabic_name_candidates
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
# def sdn_passport_extraction(passport_text):
|
|
207
|
-
# passport_details = {}
|
|
208
|
-
|
|
209
|
-
# patterns = {
|
|
210
|
-
# 'passport_number': (r"([A-Za-z]\d{8}|[A-Za-z]\d{7})", lambda match: match.group(1) if match else ''),
|
|
211
|
-
# 'passport_number_mrz': (r"([A-Za-z]\d{8}|[A-Za-z]\d{7})", lambda match: match.group(1) if match else ''),
|
|
212
|
-
# 'dob_mrz': (r'(\d+)[MF]', lambda match: convert_dob(match.group(1)) if match else ''),
|
|
213
|
-
# 'expiry_date_mrz': (r'[MF](\d+)', lambda match: convert_expiry_date(match.group(1)) if match else ''),
|
|
214
|
-
# 'gender': (r'(\d)([A-Za-z])(\d)', lambda match: match.group(2) if match else '')
|
|
215
|
-
# }
|
|
216
|
-
|
|
217
|
-
# passport_text_clean = passport_text.replace(" ", "")
|
|
218
|
-
|
|
219
|
-
# mrz1_pattern = r"PCSDN[A-Z<]+<<[A-Z<]+<"
|
|
220
|
-
# matches = re.findall(mrz1_pattern, passport_text_clean)
|
|
221
|
-
|
|
222
|
-
# try:
|
|
223
|
-
# mrz1 = matches[0]
|
|
224
|
-
# except:
|
|
225
|
-
# try:
|
|
226
|
-
# mrz1_pattern = r"PHSDN[A-Z<]+<<[A-Z<]+<"
|
|
227
|
-
# matches = re.findall(mrz1_pattern, passport_text_clean)
|
|
228
|
-
# mrz1 = matches[0]
|
|
229
|
-
# except:
|
|
230
|
-
# mrz1 = ''
|
|
231
|
-
|
|
232
|
-
# name_dict = {}
|
|
233
|
-
|
|
234
|
-
# try:
|
|
235
|
-
# pattern = r"(PC([A-Z]{3})((?:[<A-Z]+)+)<)"
|
|
236
|
-
# matches = re.findall(pattern, passport_text_clean)
|
|
237
|
-
|
|
238
|
-
# if matches:
|
|
239
|
-
# mrz1, raw_names = matches[0][0], matches[0][2]
|
|
240
|
-
# processed_names = raw_names.replace('<', ' ').strip()
|
|
241
|
-
# # name_parts = processed_names.split()
|
|
242
|
-
|
|
243
|
-
# # if len(name_parts) > 1 and re.search(r'\b(al|el)\b', name_parts[1].lower()):
|
|
244
|
-
# # surname = ' '.join(name_parts[:2])
|
|
245
|
-
# # given_names = ' '.join(name_parts[2:])
|
|
246
|
-
# # else:
|
|
247
|
-
# # surname = name_parts[0]
|
|
248
|
-
# # given_names = ' '.join(name_parts[1:])
|
|
249
|
-
|
|
250
|
-
# # print(f'\nNAME DICT HERE 2: {processed_names}\n')
|
|
251
|
-
|
|
252
|
-
# passport_details['full_name'] = processed_names
|
|
253
|
-
# else:
|
|
254
|
-
# pattern = r"(PH([A-Z]{3})((?:[<A-Z]+)+)<)"
|
|
255
|
-
# matches = re.findall(pattern, passport_text_clean)
|
|
256
|
-
|
|
257
|
-
# if matches:
|
|
258
|
-
# mrz1, raw_names = matches[0][0], matches[0][2]
|
|
259
|
-
# processed_names = raw_names.replace('<', ' ').strip()
|
|
260
|
-
# # name_parts = processed_names.split()
|
|
261
|
-
|
|
262
|
-
# # if len(name_parts) > 1 and re.search(r'\b(al|el)\b', name_parts[1].lower()):
|
|
263
|
-
# # surname = ' '.join(name_parts[:2])
|
|
264
|
-
# # given_names = ' '.join(name_parts[2:])
|
|
265
|
-
# # else:
|
|
266
|
-
# # surname = name_parts[0]
|
|
267
|
-
# # given_names = ' '.join(name_parts[1:])
|
|
268
|
-
|
|
269
|
-
# # print(f'\nNAME DICT HERE 2: {processed_names}\n')
|
|
270
|
-
|
|
271
|
-
# passport_details['full_name'] = processed_names
|
|
272
|
-
# except:
|
|
273
|
-
# passport_details['full_name'] = ''
|
|
274
|
-
|
|
275
|
-
# if not passport_details.get('full_name', ''):
|
|
276
|
-
# pattern = r"SDN(((?:[<A-Z]+)+)<)"
|
|
277
|
-
# matches = re.findall(pattern, passport_text_clean)
|
|
278
|
-
|
|
279
|
-
# if matches:
|
|
280
|
-
# raw_names = matches[0][0]
|
|
281
|
-
# processed_names = raw_names.replace('<', ' ').strip()
|
|
282
|
-
# passport_details['full_name'] = processed_names
|
|
283
|
-
|
|
284
|
-
# mrz2_pattern = r"\n[A-Z]\d+.*?(?=[<]{2,})"
|
|
285
|
-
# mrz2_matches = re.findall(mrz2_pattern, passport_text_clean)
|
|
286
|
-
|
|
287
|
-
# if mrz2_matches:
|
|
288
|
-
# mrz2 = mrz2_matches[0][1:]
|
|
289
|
-
# else:
|
|
290
|
-
# mrz2 = ''
|
|
291
|
-
|
|
292
|
-
# ## EXTRACTING FIELDS FROM MRZ2
|
|
293
|
-
# mrz2_keys = ['gender', 'passport_number_mrz', 'dob_mrz', 'expiry_date_mrz']
|
|
294
|
-
|
|
295
|
-
# for key, value in patterns.items():
|
|
296
|
-
# pattern = value[0]
|
|
297
|
-
# transform_func = value[1]
|
|
298
|
-
|
|
299
|
-
# text = passport_text
|
|
300
|
-
# if key in mrz2_keys:
|
|
301
|
-
# text = mrz2
|
|
302
|
-
|
|
303
|
-
# match = re.search(pattern, text)
|
|
304
|
-
# passport_details[key] = transform_func(match) if match else ''
|
|
305
|
-
|
|
306
|
-
# if passport_details['passport_number_mrz'] and (
|
|
307
|
-
# passport_details['passport_number_mrz'] != passport_details['passport_number']):
|
|
308
|
-
# passport_details['passport_number'] = passport_details['passport_number_mrz']
|
|
309
|
-
|
|
310
|
-
# ## HANDLE PASSPORT NO FROM MRZ
|
|
311
|
-
|
|
312
|
-
# if not passport_details.get('passport_number_mrz'):
|
|
313
|
-
# passport_number_pattern = r"([A-Za-z]\d{8,}[A-Za-z]{2,}.*?|[A-Za-z]*\d{8,}[A-Za-z]{2,}.*?)"
|
|
314
|
-
# passport_number_match = re.search(passport_number_pattern, passport_text_clean)
|
|
315
|
-
# if passport_number_match:
|
|
316
|
-
# passport_number = passport_number_match.group(1)
|
|
317
|
-
# passport_details['passport_number_mrz'] = passport_number[:9]
|
|
318
|
-
|
|
319
|
-
# ## HANDLE DOB DOE FROM MRZ
|
|
320
|
-
|
|
321
|
-
# if not (passport_details.get('dob_mrz') or passport_details.get('expiry_date_mrz')):
|
|
322
|
-
# dob_pattern = r"(\d{7})[MF]"
|
|
323
|
-
# dob_match = re.search(dob_pattern, passport_text_clean)
|
|
324
|
-
# if dob_match:
|
|
325
|
-
# dob = dob_match.group(1)
|
|
326
|
-
# passport_details['dob_mrz'] = convert_dob(dob)
|
|
327
|
-
# else:
|
|
328
|
-
# dob_pattern = r'.*?[\S]R[\S](\d{9,})\b'
|
|
329
|
-
# dob_match = re.search(dob_pattern, passport_text_clean)
|
|
330
|
-
# if dob_match:
|
|
331
|
-
# dob = dob_match.group(1)[:7]
|
|
332
|
-
# passport_details['dob_mrz'] = validate_date(convert_dob(dob))
|
|
333
|
-
|
|
334
|
-
# doe_pattern = r"[MF](\d+)"
|
|
335
|
-
# doe_match = re.search(doe_pattern, passport_text_clean)
|
|
336
|
-
# if doe_match:
|
|
337
|
-
# expiry = doe_match.group(1)
|
|
338
|
-
# passport_details['expiry_date_mrz'] = validate_date(convert_expiry_date(expiry))
|
|
339
|
-
# else:
|
|
340
|
-
# doe_pattern = r'.*?[\S]R[\S](\d{9,})\b'
|
|
341
|
-
# doe_match = re.search(doe_pattern, passport_text_clean)
|
|
342
|
-
# if doe_match:
|
|
343
|
-
# expiry = doe_match.group(1)[8:]
|
|
344
|
-
# passport_details['expiry_date_mrz'] = validate_date(convert_expiry_date(expiry))
|
|
345
|
-
|
|
346
|
-
# ## HANDLE DOB AND DOE CASES FROM GENERIC DATA FOR VALIDATION
|
|
347
|
-
|
|
348
|
-
# dob = ''
|
|
349
|
-
# expiry = ''
|
|
350
|
-
# issue_date = ''
|
|
351
|
-
|
|
352
|
-
# try:
|
|
353
|
-
# matches = re.findall(r'\d{2}-\d{2}-\d{4}', passport_text)
|
|
354
|
-
# date_objects = [datetime.strptime(date, '%d-%m-%Y') for date in matches]
|
|
355
|
-
# sorted_dates = sorted(set(date_objects))
|
|
356
|
-
# sorted_date_strings = [date.strftime('%d-%m-%Y') for date in sorted_dates]
|
|
357
|
-
|
|
358
|
-
# # print(f"DATES 3: {sorted_date_strings}")
|
|
359
|
-
|
|
360
|
-
# if len(sorted_date_strings) > 1:
|
|
361
|
-
# dob = sorted_date_strings[0].replace('-', '/')
|
|
362
|
-
# issue_date = sorted_date_strings[1].replace('-', '/')
|
|
363
|
-
# expiry = sorted_date_strings[2].replace('-', '/')
|
|
364
|
-
|
|
365
|
-
# else:
|
|
366
|
-
# matches = re.findall(r'\d{2}-\d{2}-\d{4}', passport_text)
|
|
367
|
-
# date_objects = [datetime.strptime(date, '%d-%m-%Y') for date in matches]
|
|
368
|
-
# sorted_dates = sorted(set(date_objects))
|
|
369
|
-
# sorted_date_strings = [date.strftime('%d-%m-%Y') for date in sorted_dates]
|
|
370
|
-
|
|
371
|
-
# # print(f"DATES 4: {sorted_date_strings}")
|
|
372
|
-
|
|
373
|
-
# if sorted_date_strings:
|
|
374
|
-
# dob = sorted_date_strings[0].replace('-', '/')
|
|
375
|
-
# issue_date = sorted_date_strings[1].replace('-', '/')
|
|
376
|
-
# expiry = sorted_date_strings[2].replace('-', '/')
|
|
377
|
-
|
|
378
|
-
# except:
|
|
379
|
-
# dob, issue_date, expiry = '', '', ''
|
|
380
|
-
|
|
381
|
-
# passport_details['dob'] = get_dates_to_generic_format(dob)
|
|
382
|
-
# passport_details['expiry_date'] = get_dates_to_generic_format(expiry)
|
|
383
|
-
# passport_details['issue_date'] = get_dates_to_generic_format(issue_date)
|
|
384
|
-
|
|
385
|
-
# ## HANDLE GENDER CASES EXCEPTIONS
|
|
386
|
-
# if not (passport_details['gender']):
|
|
387
|
-
# # print(f'inside gender case')
|
|
388
|
-
# gender_pattern = r'(\d)([MFmf])(\d)'
|
|
389
|
-
# gender_match = re.search(gender_pattern, passport_text_clean)
|
|
390
|
-
# if gender_match:
|
|
391
|
-
# passport_details['gender'] = gender_match.group(2)
|
|
392
|
-
|
|
393
|
-
# ## NATIONALITY FROM MRZ
|
|
394
|
-
# nationality_ptrn = r"PC([A-Z]{3})"
|
|
395
|
-
# matches = re.findall(nationality_ptrn, passport_text)
|
|
396
|
-
# # print(f'Matches nationality: {matches}')
|
|
397
|
-
# try:
|
|
398
|
-
# nationality = matches[0]
|
|
399
|
-
# passport_details['nationality'] = nationality
|
|
400
|
-
# except:
|
|
401
|
-
# nationality = ''
|
|
402
|
-
|
|
403
|
-
# if not passport_details.get('nationality', ''):
|
|
404
|
-
# nationality_ptrn = r"PH([A-Z]{3})"
|
|
405
|
-
# matches = re.findall(nationality_ptrn, passport_text)
|
|
406
|
-
# try:
|
|
407
|
-
# nationality = matches[0]
|
|
408
|
-
# passport_details['nationality'] = nationality
|
|
409
|
-
# except:
|
|
410
|
-
# nationality = ''
|
|
411
|
-
|
|
412
|
-
# ## NATIONALITY FROM GENERIC DATA
|
|
413
|
-
# if not passport_details.get('nationality', ''):
|
|
414
|
-
# nationality_keywords = load_nationality_keywords()
|
|
415
|
-
# nationality = find_nationality_in_text(passport_text, nationality_keywords)
|
|
416
|
-
|
|
417
|
-
# passport_details['nationality'] = nationality
|
|
418
|
-
|
|
419
|
-
# ## HANDLE NATIONA NUMBER HERE
|
|
420
|
-
# try:
|
|
421
|
-
# national_no_pattern = r'(\d{3}-\d{4}-\d{4})'
|
|
422
|
-
# national_no_match = re.search(national_no_pattern, passport_text)
|
|
423
|
-
# if national_no_match:
|
|
424
|
-
# passport_details['national_number'] = national_no_match.group(1)
|
|
425
|
-
# else:
|
|
426
|
-
# national_no_pattern = r'(\d{3}-\d{4}-\d{4})'
|
|
427
|
-
# national_no_match = re.search(national_no_pattern, passport_text_clean)
|
|
428
|
-
# if national_no_match:
|
|
429
|
-
# passport_details['national_number'] = national_no_match.group(1)
|
|
430
|
-
# except:
|
|
431
|
-
# passport_details['national_number'] = ''
|
|
432
|
-
|
|
433
|
-
# ## ELIMINATE DUPLICATED FIELDS AND KEEP ONLY ONE THAT HAS VALUE
|
|
434
|
-
# try:
|
|
435
|
-
# ### 1. Remove passport number and keep only passport number from mrz
|
|
436
|
-
# if not passport_details.get('passport_number_mrz', '') and passport_details.get('passport_number', ''):
|
|
437
|
-
# passport_details['passport_number_mrz'] = passport_details['passport_number']
|
|
438
|
-
|
|
439
|
-
# if passport_details.get('passport_number', ''):
|
|
440
|
-
# passport_details.pop('passport_number')
|
|
441
|
-
|
|
442
|
-
# ### 2. Remove dob from mrz and keep only dob from generic
|
|
443
|
-
# if not passport_details.get('dob', '') and passport_details.get('dob_mrz', ''):
|
|
444
|
-
# passport_details['dob'] = passport_details['dob_mrz']
|
|
445
|
-
|
|
446
|
-
# if passport_details.get('dob_mrz', ''):
|
|
447
|
-
# passport_details.pop('dob_mrz')
|
|
448
|
-
|
|
449
|
-
# ### 3. Remove expiry from mrz and keep only expiry from generic
|
|
450
|
-
# if not passport_details.get('expiry_date', '') and passport_details.get('expiry_date_mrz', ''):
|
|
451
|
-
# passport_details['expiry_date'] = passport_details['expiry_date_mrz']
|
|
452
|
-
|
|
453
|
-
# if passport_details.get('expiry_date_mrz'):
|
|
454
|
-
# passport_details.pop('expiry_date_mrz')
|
|
455
|
-
|
|
456
|
-
# if passport_details.get('passport_number_mrz', ''):
|
|
457
|
-
# passport_details['id_number'] = passport_details['passport_number_mrz']
|
|
458
|
-
|
|
459
|
-
# ### 4. Remove name from mrz and keep only name from generic
|
|
460
|
-
# except:
|
|
461
|
-
# pass
|
|
462
|
-
|
|
463
|
-
# ## HANDLE PLACE OF BIRTH AND PLACE OF ISSUE HERE
|
|
464
|
-
# # print(f"mrz: {passport_details.get('dob_mrz')}, dob: {passport_details.get('dob')}")
|
|
465
|
-
# dob_for_match = passport_details.get('dob', passport_details.get('dob_mrz')).replace('/', '-')
|
|
466
|
-
# pattern = re.compile(rf"{dob_for_match}(.*?)PCSDN|{dob_for_match}(.*?)SDN[A-Z]{{4,}}", re.DOTALL)
|
|
467
|
-
# match = pattern.search(passport_text)
|
|
468
|
-
|
|
469
|
-
# if not match:
|
|
470
|
-
# pattern_phsdn = re.compile(rf"{dob_for_match}(.*?)PHSDN", re.DOTALL)
|
|
471
|
-
# match = pattern_phsdn.search(passport_text)
|
|
472
|
-
|
|
473
|
-
# if match:
|
|
474
|
-
# substring = match.group(1) if match.group(1) is not None else match.group(2)
|
|
475
|
-
# name_list = passport_details.get('full_name', '').split(' ')
|
|
476
|
-
# capital_letters = re.findall(r'[A-Z]{2,}', substring)
|
|
477
|
-
# capital_letters = [re.sub(r'\d+', '', i) for i in capital_letters]
|
|
478
|
-
# capital_letters = [i for i in capital_letters if
|
|
479
|
-
# not (len(i) <= 2 or i == 'SDN' or i == '' or i in name_list) or i == 'AL']
|
|
480
|
-
# # print(f'CAPS: {capital_letters}')
|
|
481
|
-
# # for item in ['SDN', 'MI', 'MY', 'MA', 'SS', 'MS', 'ME', 'SU']:
|
|
482
|
-
# # if item in capital_letters:
|
|
483
|
-
# # capital_letters.remove(item)
|
|
484
|
-
|
|
485
|
-
# try:
|
|
486
|
-
# if len(capital_letters) > 2 and ('AL' in capital_letters or 'NEW' in capital_letters) and (
|
|
487
|
-
# capital_letters[0] == 'AL' or capital_letters[0] == 'NEW'):
|
|
488
|
-
# place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
|
|
489
|
-
# place_of_issue = capital_letters[2]
|
|
490
|
-
# elif len(capital_letters) <= 3:
|
|
491
|
-
# if len(capital_letters) > 2:
|
|
492
|
-
# place_of_birth = capital_letters[0]
|
|
493
|
-
# place_of_issue = capital_letters[1] + ' ' + capital_letters[2]
|
|
494
|
-
|
|
495
|
-
# else:
|
|
496
|
-
# place_of_birth = capital_letters[0]
|
|
497
|
-
# place_of_issue = capital_letters[1]
|
|
498
|
-
# else:
|
|
499
|
-
# place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
|
|
500
|
-
# place_of_issue = capital_letters[2] + ' ' + capital_letters[3]
|
|
501
|
-
# except:
|
|
502
|
-
# place_of_birth, place_of_issue = extract_pob_and_poi(passport_text, dob_for_match,
|
|
503
|
-
# passport_details.get('passport_number_mrz', ''))
|
|
504
|
-
|
|
505
|
-
# passport_details['place_of_birth'] = place_of_birth
|
|
506
|
-
# passport_details['place_of_issue'] = place_of_issue
|
|
507
|
-
# else:
|
|
508
|
-
# try:
|
|
509
|
-
# place_of_birth, place_of_issue = extract_pob_and_poi(passport_text, dob_for_match,
|
|
510
|
-
# passport_details.get('passport_number_mrz', ''))
|
|
511
|
-
# passport_details['place_of_birth'] = place_of_birth
|
|
512
|
-
# passport_details['place_of_issue'] = place_of_issue
|
|
513
|
-
# except:
|
|
514
|
-
# passport_details['place_of_birth'] = ''
|
|
515
|
-
# passport_details['place_of_issue'] = ''
|
|
516
|
-
|
|
517
|
-
# ## HANDLE ARABIC NAME FROM PASSPORT
|
|
518
|
-
# pattern = re.compile(rf"SDN(.*?){passport_details.get('passport_number_mrz', '')}", re.DOTALL)
|
|
519
|
-
# match = re.findall(pattern, passport_text)
|
|
520
|
-
|
|
521
|
-
# if match:
|
|
522
|
-
# substring = match[0]
|
|
523
|
-
# arabic_regex = re.compile(r'^[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]+$')
|
|
524
|
-
# result = []
|
|
525
|
-
|
|
526
|
-
# for line in substring.split('\n'):
|
|
527
|
-
# if arabic_regex.match(line) and len(line.split()) >= 4:
|
|
528
|
-
# result.append(line)
|
|
529
|
-
|
|
530
|
-
# name_ar = ''
|
|
531
|
-
# if result:
|
|
532
|
-
# name_ar = ' '.join(result)
|
|
533
|
-
# passport_details['name_ar'] = name_ar
|
|
534
|
-
|
|
535
|
-
# # print(f'\nARABIC NAME 1: {name_ar}\n')
|
|
536
|
-
|
|
537
|
-
# if not passport_details.get('name_ar', ''):
|
|
538
|
-
# name_keywords = passport_details.get('full_name', '').split(' ')
|
|
539
|
-
# results = find_names_with_context(passport_text, name_keywords)
|
|
540
|
-
# if results:
|
|
541
|
-
# passport_details['name_ar'] = results[0][1]
|
|
542
|
-
# # print(f'\nARABIC NAME 2: {results[0][1]}\n')
|
|
543
|
-
# else:
|
|
544
|
-
# name_keywords = ['Full', 'Name']
|
|
545
|
-
# results = find_names_with_context(passport_text, name_keywords)
|
|
546
|
-
# if results:
|
|
547
|
-
# passport_details['name_ar'] = results[0][1]
|
|
548
|
-
# # print(f'\nARABIC NAME 3: {results[0][1]}\n')
|
|
549
|
-
# else:
|
|
550
|
-
# name_keywords = ['الاسم']
|
|
551
|
-
# results = find_names_with_context(passport_text, name_keywords)
|
|
552
|
-
# if results:
|
|
553
|
-
# passport_details['name_ar'] = results[0][1]
|
|
554
|
-
# # print(f'\nARABIC NAME 4: {results[0][1]}\n')
|
|
555
|
-
|
|
556
|
-
# ## HANDLE MRZ1 IF NOT COMPLETE
|
|
557
|
-
# if len(mrz1) < 40:
|
|
558
|
-
# mrz1_pattern = r'PC[A-Z]{3}[A-Z0-9<]{5,44}'
|
|
559
|
-
# match = re.search(mrz1_pattern, passport_text.replace(" ", ""))
|
|
560
|
-
# if match:
|
|
561
|
-
# mrz1 = match.group(0)
|
|
562
|
-
|
|
563
|
-
# if not mrz1:
|
|
564
|
-
# mrz1 = ''
|
|
565
|
-
|
|
566
|
-
# if mrz1 and len(mrz1) < 44:
|
|
567
|
-
# mrz1 = mrz1 = f"{mrz1}{'<' * (44 - len(mrz1))}"
|
|
568
|
-
|
|
569
|
-
# ## HANDLE MRZ2 IF NOT COMPLETE
|
|
570
|
-
# if not mrz2:
|
|
571
|
-
# try:
|
|
572
|
-
# mrz2 = passport_details.get('passport_number_mrz', '') + passport_details.get('nationality',
|
|
573
|
-
# '') + convert_to_mrz_date(
|
|
574
|
-
# passport_details.get('dob_mrz', '')) + passport_details.get('gender', '') + convert_to_mrz_date(
|
|
575
|
-
# passport_details.get('expiry_date_mrz', ''))
|
|
576
|
-
# except:
|
|
577
|
-
# mrz2 = ''
|
|
578
|
-
|
|
579
|
-
# if len(mrz2) >= 28 and len(mrz2) < 40:
|
|
580
|
-
# mrz2 = mrz2 = f"{mrz2}{'<' * (44 - len(mrz2))}"
|
|
581
|
-
|
|
582
|
-
# passport_details['mrz'] = mrz1 + mrz2
|
|
583
|
-
# passport_details['mrz1'] = mrz1
|
|
584
|
-
# passport_details['mrz2'] = mrz2
|
|
585
|
-
|
|
586
|
-
# ## EXTRACT ENGLISH NAME FROM PASSPORT HERE
|
|
587
|
-
|
|
588
|
-
# # print(f"PASSPORT DETAILS HERE: {passport_details}")
|
|
589
|
-
|
|
590
|
-
# try:
|
|
591
|
-
# pattern_1 = re.compile(r'\b(SDN|THE|SUDAN|PC|OF|REPUBLIC|SON|TOKAR|AP|CT|PUR|\w)\b', re.IGNORECASE)
|
|
592
|
-
# pattern = re.compile(r'^[A-Z\s]{3,}[A-Z\s]{3,}[A-Z\s]{3,}$', re.MULTILINE)
|
|
593
|
-
# matches = pattern.findall(pattern_1.sub('', passport_text))
|
|
594
|
-
|
|
595
|
-
# if matches:
|
|
596
|
-
# # print(f'MATCHES: {matches}')
|
|
597
|
-
# # filtered_matches = [
|
|
598
|
-
# # match for match in matches
|
|
599
|
-
# # if ('REPUBLIC OF THE SUDAN' not in match.upper() and 'SUDAN' not in match.upper() and 'THE REPUBLIC' not in match.upper() and 'THE' not in match.upper() and 'PASSPORTS' not in match.upper())
|
|
600
|
-
# # and (len(match.replace('\n', ' ').strip().split(' ')) >= 3)
|
|
601
|
-
# # ]
|
|
602
|
-
# excluded_keywords = {'republic of the sudan', 'sudan', 'the republic', 'passports', 'republic of'}
|
|
603
|
-
# filtered_matches = [
|
|
604
|
-
# match for match in matches
|
|
605
|
-
# if not any(keyword.upper() in match.upper() for keyword in excluded_keywords)
|
|
606
|
-
# and len(match.replace('\n', ' ').strip().split()) >= 3
|
|
607
|
-
# ]
|
|
608
|
-
|
|
609
|
-
# if filtered_matches:
|
|
610
|
-
# def get_long_string(lst):
|
|
611
|
-
# if len(lst) > 1:
|
|
612
|
-
# return max(lst, key=len)
|
|
613
|
-
# else:
|
|
614
|
-
# return lst[0]
|
|
615
|
-
# return None
|
|
616
|
-
|
|
617
|
-
# result = get_long_string(filtered_matches)
|
|
618
|
-
# full_name_generic = result.strip().replace('\n', ' ')
|
|
619
|
-
# # full_name_generic = filtered_matches[0].strip().replace('\n', ' ')
|
|
620
|
-
# else:
|
|
621
|
-
# full_name_generic = ''
|
|
622
|
-
|
|
623
|
-
# passport_details['full_name_generic'] = full_name_generic
|
|
624
|
-
# else:
|
|
625
|
-
# passport_details['full_name_generic'] = passport_details.get('full_name', '')
|
|
626
|
-
# except:
|
|
627
|
-
# passport_details['full_name_generic'] = passport_details.get('full_name', '')
|
|
628
|
-
|
|
629
|
-
# if passport_details.get('full_name_generic', ''):
|
|
630
|
-
# passport_details['name'] = passport_details['full_name_generic']
|
|
631
|
-
# name_split = passport_details['full_name_generic'].split(' ')
|
|
632
|
-
# passport_details['first_name'] = name_split[0]
|
|
633
|
-
# passport_details['last_name'] = name_split[-1]
|
|
634
|
-
# passport_details['middle_name'] = ' '.join(name_split[1:-1])
|
|
635
|
-
# else:
|
|
636
|
-
# if passport_details.get('full_name', ''):
|
|
637
|
-
# name_split = passport_details['full_name'].split(' ')
|
|
638
|
-
# passport_details['first_name'] = name_split[0]
|
|
639
|
-
# passport_details['last_name'] = name_split[-1]
|
|
640
|
-
# passport_details['middle_name'] = ' '.join(name_split[1:-1])
|
|
641
|
-
# else:
|
|
642
|
-
# passport_details['first_name'] = ''
|
|
643
|
-
# passport_details['last_name'] = ''
|
|
644
|
-
# passport_details['middle_name'] = ''
|
|
645
|
-
|
|
646
|
-
# passport_details['issuing_country'] = 'SDN'
|
|
647
|
-
|
|
648
|
-
# if "gender" in passport_details:
|
|
649
|
-
# gender = passport_details["gender"].strip().upper()
|
|
650
|
-
# if gender == "F":
|
|
651
|
-
# passport_details["gender"] = "FEMALE"
|
|
652
|
-
# elif gender == "M":
|
|
653
|
-
# passport_details["gender"] = "MALE"
|
|
654
|
-
|
|
655
|
-
# if 'gender' in passport_details:
|
|
656
|
-
# passport_details["gender"] = passport_details["gender"].strip().upper()
|
|
657
|
-
|
|
658
|
-
# passport_details_genai = extract_passport_details_genai(passport_text)
|
|
659
|
-
|
|
660
|
-
# if passport_details_genai and passport_details_genai.get('name_ar', ''):
|
|
661
|
-
# passport_details['name_ar'] = passport_details_genai.get('name_ar', '')
|
|
662
|
-
|
|
663
|
-
# if passport_details_genai and passport_details_genai.get('name_en', ''):
|
|
664
|
-
# passport_details['name_en'] = passport_details_genai.get('name_en', '')
|
|
665
|
-
# passport_details['full_name_generic'] = passport_details_genai.get('name_en', '')
|
|
666
|
-
# passport_details['name'] = passport_details_genai.get('name_en', '')
|
|
667
|
-
|
|
668
|
-
# if passport_details_genai and passport_details_genai.get('place_of_birth', ''):
|
|
669
|
-
# passport_details['place_of_birth'] = passport_details_genai.get('place_of_birth', '')
|
|
670
|
-
|
|
671
|
-
# if passport_details_genai and passport_details_genai.get('place_of_issue', ''):
|
|
672
|
-
# passport_details['place_of_issue'] = passport_details_genai.get('place_of_issue', '')
|
|
673
|
-
|
|
674
|
-
# full_name_generic_2 = passport_details.get('full_name_generic', '')
|
|
675
|
-
# try:
|
|
676
|
-
# name_list = full_name_generic_2.split(' ')
|
|
677
|
-
# passport_details['first_name'] = name_list[0]
|
|
678
|
-
# passport_details['last_name'] = name_list[-1]
|
|
679
|
-
# passport_details['middle_name'] = ' '.join(name_list[1:-1])
|
|
680
|
-
|
|
681
|
-
# except Exception as e:
|
|
682
|
-
# if passport_details_genai and passport_details_genai.get('first_name', ''):
|
|
683
|
-
# passport_details['first_name'] = passport_details_genai.get('first_name', '')
|
|
684
|
-
|
|
685
|
-
# if passport_details_genai and passport_details_genai.get('middle_name', ''):
|
|
686
|
-
# passport_details['middle_name'] = passport_details_genai.get('middle_name', '')
|
|
687
|
-
|
|
688
|
-
# if passport_details_genai and passport_details_genai.get('last_name', ''):
|
|
689
|
-
# passport_details['last_name'] = passport_details_genai.get('last_name', '')
|
|
690
|
-
|
|
691
|
-
# full_name_generic_ar = passport_details.get('name_ar', '')
|
|
692
|
-
# try:
|
|
693
|
-
# name_parts = full_name_generic_ar.split(' ')
|
|
694
|
-
|
|
695
|
-
# # Handle compound names
|
|
696
|
-
# compound_prefixes = ['عبد', 'عبدال', 'فضل', 'بمسك']
|
|
697
|
-
# compound_suffixes = ['الدين', 'الله', 'الرحمن', 'الجنه']
|
|
698
|
-
|
|
699
|
-
# # Process first name
|
|
700
|
-
# if len(name_parts) >= 2:
|
|
701
|
-
# # Check for specific compound first names
|
|
702
|
-
# if name_parts[0] == 'عبد' or (name_parts[0] == 'فضل' and name_parts[1] == 'الله'):
|
|
703
|
-
# passport_details['first_name_ar'] = name_parts[0] + ' ' + name_parts[1]
|
|
704
|
-
# first_name_end_idx = 2
|
|
705
|
-
# # Handle the specific case of "بمسك الجنه"
|
|
706
|
-
# elif name_parts[0] == 'بمسك' and len(name_parts) >= 2 and name_parts[1] == 'الجنه':
|
|
707
|
-
# passport_details['first_name_ar'] = name_parts[0] + ' ' + name_parts[1]
|
|
708
|
-
# first_name_end_idx = 2
|
|
709
|
-
# else:
|
|
710
|
-
# # Regular first name
|
|
711
|
-
# passport_details['first_name_ar'] = name_parts[0]
|
|
712
|
-
# first_name_end_idx = 1
|
|
713
|
-
# else:
|
|
714
|
-
# passport_details['first_name_ar'] = name_parts[0] if name_parts else ''
|
|
715
|
-
# first_name_end_idx = 1
|
|
716
|
-
|
|
717
|
-
# # Process last name - check if last parts form a compound name
|
|
718
|
-
# if len(name_parts) >= 2:
|
|
719
|
-
# # Check for repeating patterns at the end (like "محمد خير")
|
|
720
|
-
# if len(name_parts) >= 4 and name_parts[-2] == name_parts[-4] and name_parts[-1] == name_parts[-3]:
|
|
721
|
-
# # We have a repeating two-word pattern at the end
|
|
722
|
-
# passport_details['last_name_ar'] = name_parts[-2] + ' ' + name_parts[-1]
|
|
723
|
-
# last_name_start_idx = len(name_parts) - 2
|
|
724
|
-
# # Check for compound last names
|
|
725
|
-
# elif len(name_parts) >= 3:
|
|
726
|
-
# # Check for عبد + something
|
|
727
|
-
# if name_parts[-2] == 'عبد':
|
|
728
|
-
# passport_details['last_name_ar'] = 'عبد ' + name_parts[-1]
|
|
729
|
-
# last_name_start_idx = len(name_parts) - 2
|
|
730
|
-
# # Check for something + الدين/الله/etc.
|
|
731
|
-
# elif name_parts[-1] in compound_suffixes:
|
|
732
|
-
# passport_details['last_name_ar'] = name_parts[-2] + ' ' + name_parts[-1]
|
|
733
|
-
# last_name_start_idx = len(name_parts) - 2
|
|
734
|
-
# else:
|
|
735
|
-
# passport_details['last_name_ar'] = name_parts[-1]
|
|
736
|
-
# last_name_start_idx = len(name_parts) - 1
|
|
737
|
-
# else:
|
|
738
|
-
# passport_details['last_name_ar'] = name_parts[-1]
|
|
739
|
-
# last_name_start_idx = len(name_parts) - 1
|
|
740
|
-
# else:
|
|
741
|
-
# passport_details['last_name_ar'] = ''
|
|
742
|
-
# last_name_start_idx = len(name_parts)
|
|
743
|
-
|
|
744
|
-
# # Middle name is everything between first and last name
|
|
745
|
-
# if first_name_end_idx < last_name_start_idx:
|
|
746
|
-
# passport_details['middle_name_ar'] = ' '.join(name_parts[first_name_end_idx:last_name_start_idx])
|
|
747
|
-
# else:
|
|
748
|
-
# passport_details['middle_name_ar'] = ''
|
|
749
|
-
# except Exception as e:
|
|
750
|
-
# if passport_details_genai and passport_details_genai.get('first_name_ar', ''):
|
|
751
|
-
# passport_details['first_name_ar'] = passport_details_genai.get('first_name_ar', '')
|
|
752
|
-
|
|
753
|
-
# if passport_details_genai and passport_details_genai.get('last_name_ar', ''):
|
|
754
|
-
# passport_details['last_name_ar'] = passport_details_genai.get('last_name_ar', '')
|
|
755
|
-
|
|
756
|
-
# if passport_details_genai and passport_details_genai.get('middle_name_ar', ''):
|
|
757
|
-
# passport_details['middle_name_ar'] = passport_details_genai.get('middle_name_ar', '')
|
|
758
|
-
|
|
759
|
-
# print(f"passport details: {passport_details}")
|
|
760
|
-
|
|
761
|
-
# if passport_details_genai and passport_details_genai.get('issue_date', ''):
|
|
762
|
-
# passport_details['issue_date'] = passport_details_genai.get('issue_date', '')
|
|
763
|
-
|
|
764
|
-
# if passport_details_genai and passport_details_genai.get('nationality', ''):
|
|
765
|
-
# if passport_details.get('nationality', '') != passport_details_genai.get('nationality', ''):
|
|
766
|
-
# passport_details['nationality'] = passport_details_genai.get('nationality', '')
|
|
767
|
-
|
|
768
|
-
# if passport_details_genai and passport_details_genai.get('mrz1', ''):
|
|
769
|
-
# passport_details['mrz1'] = passport_details_genai.get('mrz1', '')
|
|
770
|
-
|
|
771
|
-
# if passport_details_genai and passport_details_genai.get('mrz2', ''):
|
|
772
|
-
# passport_details['mrz2'] = passport_details_genai.get('mrz2', '')
|
|
773
|
-
|
|
774
|
-
# if passport_details_genai and passport_details_genai.get('mrz', ''):
|
|
775
|
-
# passport_details['mrz'] = passport_details_genai.get('mrz', '')
|
|
776
|
-
|
|
777
|
-
# if passport_details_genai and passport_details_genai.get('id_number', ''):
|
|
778
|
-
# passport_details['id_number'] = passport_details_genai.get('id_number', '')
|
|
779
|
-
|
|
780
|
-
# if passport_details_genai and passport_details_genai.get('passport_number_mrz', ''):
|
|
781
|
-
# if passport_details.get('passport_number_mrz', '') != passport_details_genai.get('id_number',
|
|
782
|
-
# passport_details_genai.get(
|
|
783
|
-
# 'passport_number',
|
|
784
|
-
# '')):
|
|
785
|
-
# passport_details['passport_number_mrz'] = passport_details_genai.get('id_number',
|
|
786
|
-
# passport_details_genai.get(
|
|
787
|
-
# 'passport_number', ''))
|
|
788
|
-
|
|
789
|
-
# if passport_details_genai and passport_details_genai.get('gender', ''):
|
|
790
|
-
# if passport_details.get('gender', '') != passport_details_genai.get('gender', ''):
|
|
791
|
-
# passport_details['gender'] = passport_details_genai.get('gender', '')
|
|
792
|
-
# # try:
|
|
793
|
-
# # full_name_generic = ''
|
|
794
|
-
# # passport_details['full_name_generic'] = full_name_generic
|
|
795
|
-
# # except:
|
|
796
|
-
# # passport_details['full_name_generic'] = ''
|
|
797
|
-
|
|
798
|
-
# return passport_details
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
# def make_api_request_with_retries(prompt: str, max_retries: int = 3, delay_seconds: float = 2):
|
|
802
|
-
# """
|
|
803
|
-
# Helper function to make API requests with retry logic using OpenAI
|
|
804
|
-
# """
|
|
805
|
-
# for attempt in range(max_retries):
|
|
806
|
-
# try:
|
|
807
|
-
# response = openai.ChatCompletion.create(
|
|
808
|
-
# model="gpt-4o",
|
|
809
|
-
# temperature=0.4,
|
|
810
|
-
# max_tokens=2000,
|
|
811
|
-
# messages=[
|
|
812
|
-
# {
|
|
813
|
-
# "role": "user",
|
|
814
|
-
# "content": prompt
|
|
815
|
-
# }
|
|
816
|
-
# ]
|
|
817
|
-
# )
|
|
818
|
-
# result = response.choices[0].message.content
|
|
819
|
-
|
|
820
|
-
# try:
|
|
821
|
-
# return json.loads(result)
|
|
822
|
-
# except json.JSONDecodeError:
|
|
823
|
-
# try:
|
|
824
|
-
# json_match = re.search(r'```(json|python|plaintext)?\s*(.*?)\s*```|\s*({.*?})', result, re.DOTALL)
|
|
825
|
-
# if json_match:
|
|
826
|
-
# json_str = json_match.group(2) or json_match.group(3)
|
|
827
|
-
# try:
|
|
828
|
-
# return json.loads(json_str)
|
|
829
|
-
# except:
|
|
830
|
-
# return eval(json_str.replace("'", '"'))
|
|
831
|
-
# except:
|
|
832
|
-
# pass
|
|
833
|
-
|
|
834
|
-
# return json.loads(result)
|
|
835
|
-
|
|
836
|
-
# except Exception as e:
|
|
837
|
-
# print(f"Error during API request (attempt {attempt + 1} of {max_retries}): {str(e)}")
|
|
838
|
-
# if attempt < max_retries - 1:
|
|
839
|
-
# time.sleep(delay_seconds)
|
|
840
|
-
# else:
|
|
841
|
-
# raise Exception(f"Max retries exceeded. Last error: {str(e)}")
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
# def extract_passport_details_genai(passport_data):
|
|
845
|
-
# """
|
|
846
|
-
# Function to extract passport details using OpenAI API
|
|
847
|
-
# """
|
|
848
|
-
# try:
|
|
849
|
-
# prompt = f"""From the attached text, please extract the data in a structured format. The response should be a dictionary containing:
|
|
850
|
-
# - name_ar (Arabic name if available)
|
|
851
|
-
# - name_en (English name)
|
|
852
|
-
# - place_of_birth (English place of birth)
|
|
853
|
-
# - place_of_issue (English place of issue)
|
|
854
|
-
# - gender (FEMALE or MALE)
|
|
855
|
-
# - mrz1 (first line of MRZ)
|
|
856
|
-
# - mrz2 (second line of MRZ)
|
|
857
|
-
# - passport_number (should be in format: letter followed by 8 digits)
|
|
858
|
-
# - dob (in format dd/mm/yyyy)
|
|
859
|
-
# - issue_date (in format dd/mm/yyyy)
|
|
860
|
-
# - expiry_date (in format dd/mm/yyyy)
|
|
861
|
-
# - nationality (ISO 3166-1 alpha-3 country code)
|
|
862
|
-
# - first_name (from English name)
|
|
863
|
-
# - middle_name (from English name)
|
|
864
|
-
# - last_name (from English name)
|
|
865
|
-
# - first_name_ar (Arabic first name)
|
|
866
|
-
# - last_name_ar (Arabic last name)
|
|
867
|
-
# - middle_name_ar (Arabic middle name)
|
|
868
|
-
|
|
869
|
-
# Make sure to extract the correct names from both Arabic and English text.
|
|
870
|
-
# Important NOTE: If the number of words in name_en and name_ar are not equal, translate the English name (name_en) into Arabic and update name_ar with the translated text.
|
|
871
|
-
|
|
872
|
-
# The MRZ lines should be complete.
|
|
873
|
-
# The response should only contain a dictionary with these fields.
|
|
874
|
-
|
|
875
|
-
# Here's the text: {passport_data}"""
|
|
876
|
-
|
|
877
|
-
# back_data = make_api_request_with_retries(prompt)
|
|
878
|
-
|
|
879
|
-
# if back_data:
|
|
880
|
-
# try:
|
|
881
|
-
# if back_data.get('passport_number', ''):
|
|
882
|
-
# back_data['id_number'] = back_data.pop('passport_number', '')
|
|
883
|
-
# except:
|
|
884
|
-
# pass
|
|
885
|
-
|
|
886
|
-
# try:
|
|
887
|
-
# if back_data.get('mrz1', '') and back_data.get('mrz2', ''):
|
|
888
|
-
# back_data['mrz'] = back_data.get('mrz1', '') + back_data.get('mrz2', '')
|
|
889
|
-
# except:
|
|
890
|
-
# pass
|
|
891
|
-
|
|
892
|
-
# back_data['issuing_country'] = 'SDN'
|
|
893
|
-
|
|
894
|
-
# try:
|
|
895
|
-
# if "gender" in back_data:
|
|
896
|
-
# gender = back_data["gender"].strip().upper()
|
|
897
|
-
# if gender == "F":
|
|
898
|
-
# back_data["gender"] = "FEMALE"
|
|
899
|
-
# elif gender == "M":
|
|
900
|
-
# back_data["gender"] = "MALE"
|
|
901
|
-
# elif gender in ['MALE', 'FEMALE']:
|
|
902
|
-
# back_data["gender"] = gender.upper()
|
|
903
|
-
|
|
904
|
-
# except:
|
|
905
|
-
# pass
|
|
906
|
-
# except Exception as e:
|
|
907
|
-
# print(f"Error in processing the extracted data: {e}")
|
|
908
|
-
# back_data = {
|
|
909
|
-
# 'name_ar': '',
|
|
910
|
-
# 'name_en': '',
|
|
911
|
-
# 'first_name': '',
|
|
912
|
-
# 'middle_name': '',
|
|
913
|
-
# 'last_name': '',
|
|
914
|
-
# 'first_name_ar': '',
|
|
915
|
-
# 'last_name_ar': '',
|
|
916
|
-
# 'middle_name_ar': '',
|
|
917
|
-
# 'dob': '',
|
|
918
|
-
# 'issue_date': '',
|
|
919
|
-
# 'expiry_date': '',
|
|
920
|
-
# 'place_of_birth': '',
|
|
921
|
-
# 'place_of_issue': '',
|
|
922
|
-
# 'nationality': '',
|
|
923
|
-
# 'mrz1': '',
|
|
924
|
-
# 'mrz2': '',
|
|
925
|
-
# 'mrz': ''
|
|
926
|
-
# }
|
|
927
|
-
|
|
928
|
-
# return back_data
|
|
929
1
|
|
|
930
2
|
import base64
|
|
931
3
|
import time
|