idvpackage 3.0.11__py3-none-any.whl → 3.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,931 +1,3 @@
1
- # import re
2
- # from datetime import datetime
3
- # import pycountry
4
- # from rapidfuzz import fuzz
5
- # from idvpackage.common import *
6
- # import json
7
- # import time
8
- # import openai
9
-
10
-
11
- # def convert_expiry_date(input_date):
12
- # day = input_date[4:6]
13
- # month = input_date[2:4]
14
- # year = input_date[0:2]
15
-
16
- # current_year = datetime.now().year
17
- # current_century = current_year // 100
18
- # current_year_last_two_digits = current_year % 100
19
- # century = current_century
20
-
21
- # if int(year) <= current_year_last_two_digits:
22
- # century = current_century
23
- # else:
24
- # century = current_century
25
- # final_date = f"{day}/{month}/{century}{year}"
26
-
27
- # return final_date
28
-
29
-
30
- # def get_dates_to_generic_format(date):
31
- # formats = ["%d/%m/%Y", "%Y/%m/%d"]
32
- # for fmt in formats:
33
- # try:
34
- # return datetime.strptime(date, fmt).strftime("%d/%m/%Y")
35
- # except ValueError:
36
- # pass
37
- # return None
38
-
39
-
40
- # def validate_date(date):
41
- # try:
42
- # date = datetime.strptime(date, "%d-%m-%Y")
43
- # return date.strftime("%d-%m-%Y")
44
- # except ValueError:
45
- # try:
46
- # date = datetime.strptime(date, "%d/%m/%Y")
47
- # return date.strftime("%d/%m/%Y")
48
- # except:
49
- # return ''
50
-
51
-
52
- # def load_nationality_keywords():
53
- # countries = pycountry.countries
54
- # nationality_keywords = set()
55
-
56
- # # Common suffixes for demonyms
57
- # demonym_suffixes = ['ian', 'ese', 'ish', 'i', 'ic', 'an', 'nian']
58
-
59
- # for country in countries:
60
- # nationality_keywords.add(country.name.upper())
61
- # nationality_keywords.add(country.alpha_3.upper())
62
-
63
- # # Adding guessed demonyms
64
- # for suffix in demonym_suffixes:
65
- # demonym = country.name.upper() + suffix
66
- # nationality_keywords.add(demonym.upper())
67
-
68
- # # Add common demonyms if the official name is available
69
- # if hasattr(country, 'official_name'):
70
- # nationality_keywords.add(country.official_name.upper())
71
- # for suffix in demonym_suffixes:
72
- # demonym = country.official_name.upper() + suffix
73
- # nationality_keywords.add(demonym.upper())
74
-
75
- # return nationality_keywords
76
-
77
-
78
- # def convert_to_mrz_date(date_str):
79
- # if date_str:
80
- # try:
81
- # month, day, year = date_str.split('/')
82
-
83
- # year_last_two_digits = year[-2:]
84
-
85
- # mrz_date = year_last_two_digits + month.zfill(2) + day.zfill(2)
86
-
87
- # return mrz_date
88
- # except:
89
- # return ''
90
- # else:
91
- # return ''
92
-
93
-
94
- # def find_nationality_in_text(text, nationality_keywords):
95
- # import re
96
- # for keyword in nationality_keywords:
97
- # if re.search(r'\b' + re.escape(keyword) + r'\b', text):
98
- # return keyword
99
- # return None
100
-
101
-
102
- # def extract_pob_and_poi(passport_data, dob_for_match, passport_number_mrz):
103
- # place_of_birth, place_of_issue = '', ''
104
- # try:
105
- # pattern = re.compile(rf"Nationality(.*?){dob_for_match}|Nation(.*?){dob_for_match}", re.DOTALL)
106
- # match = pattern.search(passport_data)
107
- # if match:
108
- # substring = match.group(1) or match.group(2)
109
- # if substring:
110
- # capital_letters = re.findall(r'[A-Z]{2,}', substring)
111
- # if not capital_letters or len(capital_letters) <= 1:
112
- # match, substring = '', ''
113
-
114
- # if not match:
115
- # pattern = re.compile(rf"Place(.*?)PCSDN", re.DOTALL)
116
- # match = pattern.search(passport_data)
117
-
118
- # if not match:
119
- # pattern_phsdn = re.compile(rf"{dob_for_match}(.*?)PHSDN", re.DOTALL)
120
- # match = pattern_phsdn.search(passport_data)
121
-
122
- # if match:
123
- # substring = match.group(1)
124
- # if substring:
125
- # capital_letters = re.findall(r'[A-Z]{2,}', substring)
126
- # if not capital_letters or len(capital_letters) <= 1:
127
- # match, substring = '', ''
128
-
129
- # if not match:
130
- # pattern = re.compile(rf"{passport_number_mrz}(.*?){dob_for_match}", re.DOTALL)
131
- # match = pattern.search(passport_data.replace('O', '0'))
132
-
133
- # if match:
134
- # substring_orig = match.group(1)
135
- # lines = substring_orig.split('\n')
136
- # arabic_and_english_pattern = re.compile(r'[\u0600-\u06FF].*[A-Z]|[A-Z].*[\u0600-\u06FF]')
137
- # filtered_lines = [line for line in lines if
138
- # arabic_and_english_pattern.search(line) and 'SDN' not in line]
139
- # substring = '\n'.join(filtered_lines)
140
- # substring = substring.replace('0', 'O')
141
- # if substring:
142
- # capital_letters = re.findall(r'[A-Z\d]{2,}', substring)
143
- # if capital_letters and len(capital_letters) < 2:
144
- # capital_letters = re.findall(r'[A-Z\d]{2,}', substring_orig)
145
- # substring = '\n'.join(capital_letters)
146
- # else:
147
- # substring = substring
148
-
149
- # if substring:
150
- # # print(f'SUBSTRING: {substring}')
151
- # capital_letters = re.findall(r'[A-Z]{2,}', substring)
152
- # capital_letters = [re.sub(r'\d+', '', i) for i in capital_letters]
153
- # capital_letters = [i for i in capital_letters if not (len(i) <= 2 or i == 'SDN' or i == '') or i == 'AL']
154
- # # print(f'CAPS NEW: {capital_letters}')
155
- # # for item in ['SDN', 'MI', 'MY', 'MA', 'SS', 'MS', 'ME']:
156
- # # if item in capital_letters:
157
- # # capital_letters.remove(item)
158
-
159
- # if len(capital_letters) > 2 and ('AL' in capital_letters or 'NEW' in capital_letters) and (
160
- # capital_letters[0] == 'AL' or capital_letters[0] == 'NEW'):
161
- # place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
162
- # place_of_issue = capital_letters[2]
163
- # elif len(capital_letters) <= 3:
164
- # if len(capital_letters) > 2:
165
- # place_of_birth = capital_letters[0]
166
- # place_of_issue = capital_letters[1] + ' ' + capital_letters[2]
167
-
168
- # else:
169
- # place_of_birth = capital_letters[0]
170
- # place_of_issue = capital_letters[1]
171
- # else:
172
- # place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
173
- # place_of_issue = capital_letters[2] + ' ' + capital_letters[3]
174
- # except:
175
- # place_of_birth, place_of_issue = '', ''
176
-
177
- # return place_of_birth, place_of_issue
178
-
179
-
180
- # def find_names_with_context(text, keywords):
181
- # lines = text.strip().split('\n')
182
- # keyword_set = set(keywords)
183
-
184
- # arabic_word_pattern = re.compile(r'[\u0600-\u06FF]+')
185
- # english_word_pattern = re.compile(r'[A-Za-z]+')
186
- # arabic_name_candidates = []
187
-
188
- # def contains_three_arabic_words(line):
189
- # return len(arabic_word_pattern.findall(line)) >= 3
190
-
191
- # def is_mixed_language(line):
192
- # return bool(arabic_word_pattern.search(line)) and bool(english_word_pattern.search(line))
193
-
194
- # for i, line in enumerate(lines):
195
- # words = set(line.split())
196
- # if words & keyword_set:
197
- # if i > 0 and contains_three_arabic_words(lines[i - 1]) and not is_mixed_language(lines[i - 1]):
198
- # arabic_name_candidates.append((line, lines[i - 1]))
199
- # elif i < len(lines) - 1 and contains_three_arabic_words(lines[i + 1]) and not is_mixed_language(
200
- # lines[i + 1]):
201
- # arabic_name_candidates.append((line, lines[i + 1]))
202
-
203
- # return arabic_name_candidates
204
-
205
-
206
- # def sdn_passport_extraction(passport_text):
207
- # passport_details = {}
208
-
209
- # patterns = {
210
- # 'passport_number': (r"([A-Za-z]\d{8}|[A-Za-z]\d{7})", lambda match: match.group(1) if match else ''),
211
- # 'passport_number_mrz': (r"([A-Za-z]\d{8}|[A-Za-z]\d{7})", lambda match: match.group(1) if match else ''),
212
- # 'dob_mrz': (r'(\d+)[MF]', lambda match: convert_dob(match.group(1)) if match else ''),
213
- # 'expiry_date_mrz': (r'[MF](\d+)', lambda match: convert_expiry_date(match.group(1)) if match else ''),
214
- # 'gender': (r'(\d)([A-Za-z])(\d)', lambda match: match.group(2) if match else '')
215
- # }
216
-
217
- # passport_text_clean = passport_text.replace(" ", "")
218
-
219
- # mrz1_pattern = r"PCSDN[A-Z<]+<<[A-Z<]+<"
220
- # matches = re.findall(mrz1_pattern, passport_text_clean)
221
-
222
- # try:
223
- # mrz1 = matches[0]
224
- # except:
225
- # try:
226
- # mrz1_pattern = r"PHSDN[A-Z<]+<<[A-Z<]+<"
227
- # matches = re.findall(mrz1_pattern, passport_text_clean)
228
- # mrz1 = matches[0]
229
- # except:
230
- # mrz1 = ''
231
-
232
- # name_dict = {}
233
-
234
- # try:
235
- # pattern = r"(PC([A-Z]{3})((?:[<A-Z]+)+)<)"
236
- # matches = re.findall(pattern, passport_text_clean)
237
-
238
- # if matches:
239
- # mrz1, raw_names = matches[0][0], matches[0][2]
240
- # processed_names = raw_names.replace('<', ' ').strip()
241
- # # name_parts = processed_names.split()
242
-
243
- # # if len(name_parts) > 1 and re.search(r'\b(al|el)\b', name_parts[1].lower()):
244
- # # surname = ' '.join(name_parts[:2])
245
- # # given_names = ' '.join(name_parts[2:])
246
- # # else:
247
- # # surname = name_parts[0]
248
- # # given_names = ' '.join(name_parts[1:])
249
-
250
- # # print(f'\nNAME DICT HERE 2: {processed_names}\n')
251
-
252
- # passport_details['full_name'] = processed_names
253
- # else:
254
- # pattern = r"(PH([A-Z]{3})((?:[<A-Z]+)+)<)"
255
- # matches = re.findall(pattern, passport_text_clean)
256
-
257
- # if matches:
258
- # mrz1, raw_names = matches[0][0], matches[0][2]
259
- # processed_names = raw_names.replace('<', ' ').strip()
260
- # # name_parts = processed_names.split()
261
-
262
- # # if len(name_parts) > 1 and re.search(r'\b(al|el)\b', name_parts[1].lower()):
263
- # # surname = ' '.join(name_parts[:2])
264
- # # given_names = ' '.join(name_parts[2:])
265
- # # else:
266
- # # surname = name_parts[0]
267
- # # given_names = ' '.join(name_parts[1:])
268
-
269
- # # print(f'\nNAME DICT HERE 2: {processed_names}\n')
270
-
271
- # passport_details['full_name'] = processed_names
272
- # except:
273
- # passport_details['full_name'] = ''
274
-
275
- # if not passport_details.get('full_name', ''):
276
- # pattern = r"SDN(((?:[<A-Z]+)+)<)"
277
- # matches = re.findall(pattern, passport_text_clean)
278
-
279
- # if matches:
280
- # raw_names = matches[0][0]
281
- # processed_names = raw_names.replace('<', ' ').strip()
282
- # passport_details['full_name'] = processed_names
283
-
284
- # mrz2_pattern = r"\n[A-Z]\d+.*?(?=[<]{2,})"
285
- # mrz2_matches = re.findall(mrz2_pattern, passport_text_clean)
286
-
287
- # if mrz2_matches:
288
- # mrz2 = mrz2_matches[0][1:]
289
- # else:
290
- # mrz2 = ''
291
-
292
- # ## EXTRACTING FIELDS FROM MRZ2
293
- # mrz2_keys = ['gender', 'passport_number_mrz', 'dob_mrz', 'expiry_date_mrz']
294
-
295
- # for key, value in patterns.items():
296
- # pattern = value[0]
297
- # transform_func = value[1]
298
-
299
- # text = passport_text
300
- # if key in mrz2_keys:
301
- # text = mrz2
302
-
303
- # match = re.search(pattern, text)
304
- # passport_details[key] = transform_func(match) if match else ''
305
-
306
- # if passport_details['passport_number_mrz'] and (
307
- # passport_details['passport_number_mrz'] != passport_details['passport_number']):
308
- # passport_details['passport_number'] = passport_details['passport_number_mrz']
309
-
310
- # ## HANDLE PASSPORT NO FROM MRZ
311
-
312
- # if not passport_details.get('passport_number_mrz'):
313
- # passport_number_pattern = r"([A-Za-z]\d{8,}[A-Za-z]{2,}.*?|[A-Za-z]*\d{8,}[A-Za-z]{2,}.*?)"
314
- # passport_number_match = re.search(passport_number_pattern, passport_text_clean)
315
- # if passport_number_match:
316
- # passport_number = passport_number_match.group(1)
317
- # passport_details['passport_number_mrz'] = passport_number[:9]
318
-
319
- # ## HANDLE DOB DOE FROM MRZ
320
-
321
- # if not (passport_details.get('dob_mrz') or passport_details.get('expiry_date_mrz')):
322
- # dob_pattern = r"(\d{7})[MF]"
323
- # dob_match = re.search(dob_pattern, passport_text_clean)
324
- # if dob_match:
325
- # dob = dob_match.group(1)
326
- # passport_details['dob_mrz'] = convert_dob(dob)
327
- # else:
328
- # dob_pattern = r'.*?[\S]R[\S](\d{9,})\b'
329
- # dob_match = re.search(dob_pattern, passport_text_clean)
330
- # if dob_match:
331
- # dob = dob_match.group(1)[:7]
332
- # passport_details['dob_mrz'] = validate_date(convert_dob(dob))
333
-
334
- # doe_pattern = r"[MF](\d+)"
335
- # doe_match = re.search(doe_pattern, passport_text_clean)
336
- # if doe_match:
337
- # expiry = doe_match.group(1)
338
- # passport_details['expiry_date_mrz'] = validate_date(convert_expiry_date(expiry))
339
- # else:
340
- # doe_pattern = r'.*?[\S]R[\S](\d{9,})\b'
341
- # doe_match = re.search(doe_pattern, passport_text_clean)
342
- # if doe_match:
343
- # expiry = doe_match.group(1)[8:]
344
- # passport_details['expiry_date_mrz'] = validate_date(convert_expiry_date(expiry))
345
-
346
- # ## HANDLE DOB AND DOE CASES FROM GENERIC DATA FOR VALIDATION
347
-
348
- # dob = ''
349
- # expiry = ''
350
- # issue_date = ''
351
-
352
- # try:
353
- # matches = re.findall(r'\d{2}-\d{2}-\d{4}', passport_text)
354
- # date_objects = [datetime.strptime(date, '%d-%m-%Y') for date in matches]
355
- # sorted_dates = sorted(set(date_objects))
356
- # sorted_date_strings = [date.strftime('%d-%m-%Y') for date in sorted_dates]
357
-
358
- # # print(f"DATES 3: {sorted_date_strings}")
359
-
360
- # if len(sorted_date_strings) > 1:
361
- # dob = sorted_date_strings[0].replace('-', '/')
362
- # issue_date = sorted_date_strings[1].replace('-', '/')
363
- # expiry = sorted_date_strings[2].replace('-', '/')
364
-
365
- # else:
366
- # matches = re.findall(r'\d{2}-\d{2}-\d{4}', passport_text)
367
- # date_objects = [datetime.strptime(date, '%d-%m-%Y') for date in matches]
368
- # sorted_dates = sorted(set(date_objects))
369
- # sorted_date_strings = [date.strftime('%d-%m-%Y') for date in sorted_dates]
370
-
371
- # # print(f"DATES 4: {sorted_date_strings}")
372
-
373
- # if sorted_date_strings:
374
- # dob = sorted_date_strings[0].replace('-', '/')
375
- # issue_date = sorted_date_strings[1].replace('-', '/')
376
- # expiry = sorted_date_strings[2].replace('-', '/')
377
-
378
- # except:
379
- # dob, issue_date, expiry = '', '', ''
380
-
381
- # passport_details['dob'] = get_dates_to_generic_format(dob)
382
- # passport_details['expiry_date'] = get_dates_to_generic_format(expiry)
383
- # passport_details['issue_date'] = get_dates_to_generic_format(issue_date)
384
-
385
- # ## HANDLE GENDER CASES EXCEPTIONS
386
- # if not (passport_details['gender']):
387
- # # print(f'inside gender case')
388
- # gender_pattern = r'(\d)([MFmf])(\d)'
389
- # gender_match = re.search(gender_pattern, passport_text_clean)
390
- # if gender_match:
391
- # passport_details['gender'] = gender_match.group(2)
392
-
393
- # ## NATIONALITY FROM MRZ
394
- # nationality_ptrn = r"PC([A-Z]{3})"
395
- # matches = re.findall(nationality_ptrn, passport_text)
396
- # # print(f'Matches nationality: {matches}')
397
- # try:
398
- # nationality = matches[0]
399
- # passport_details['nationality'] = nationality
400
- # except:
401
- # nationality = ''
402
-
403
- # if not passport_details.get('nationality', ''):
404
- # nationality_ptrn = r"PH([A-Z]{3})"
405
- # matches = re.findall(nationality_ptrn, passport_text)
406
- # try:
407
- # nationality = matches[0]
408
- # passport_details['nationality'] = nationality
409
- # except:
410
- # nationality = ''
411
-
412
- # ## NATIONALITY FROM GENERIC DATA
413
- # if not passport_details.get('nationality', ''):
414
- # nationality_keywords = load_nationality_keywords()
415
- # nationality = find_nationality_in_text(passport_text, nationality_keywords)
416
-
417
- # passport_details['nationality'] = nationality
418
-
419
- # ## HANDLE NATIONA NUMBER HERE
420
- # try:
421
- # national_no_pattern = r'(\d{3}-\d{4}-\d{4})'
422
- # national_no_match = re.search(national_no_pattern, passport_text)
423
- # if national_no_match:
424
- # passport_details['national_number'] = national_no_match.group(1)
425
- # else:
426
- # national_no_pattern = r'(\d{3}-\d{4}-\d{4})'
427
- # national_no_match = re.search(national_no_pattern, passport_text_clean)
428
- # if national_no_match:
429
- # passport_details['national_number'] = national_no_match.group(1)
430
- # except:
431
- # passport_details['national_number'] = ''
432
-
433
- # ## ELIMINATE DUPLICATED FIELDS AND KEEP ONLY ONE THAT HAS VALUE
434
- # try:
435
- # ### 1. Remove passport number and keep only passport number from mrz
436
- # if not passport_details.get('passport_number_mrz', '') and passport_details.get('passport_number', ''):
437
- # passport_details['passport_number_mrz'] = passport_details['passport_number']
438
-
439
- # if passport_details.get('passport_number', ''):
440
- # passport_details.pop('passport_number')
441
-
442
- # ### 2. Remove dob from mrz and keep only dob from generic
443
- # if not passport_details.get('dob', '') and passport_details.get('dob_mrz', ''):
444
- # passport_details['dob'] = passport_details['dob_mrz']
445
-
446
- # if passport_details.get('dob_mrz', ''):
447
- # passport_details.pop('dob_mrz')
448
-
449
- # ### 3. Remove expiry from mrz and keep only expiry from generic
450
- # if not passport_details.get('expiry_date', '') and passport_details.get('expiry_date_mrz', ''):
451
- # passport_details['expiry_date'] = passport_details['expiry_date_mrz']
452
-
453
- # if passport_details.get('expiry_date_mrz'):
454
- # passport_details.pop('expiry_date_mrz')
455
-
456
- # if passport_details.get('passport_number_mrz', ''):
457
- # passport_details['id_number'] = passport_details['passport_number_mrz']
458
-
459
- # ### 4. Remove name from mrz and keep only name from generic
460
- # except:
461
- # pass
462
-
463
- # ## HANDLE PLACE OF BIRTH AND PLACE OF ISSUE HERE
464
- # # print(f"mrz: {passport_details.get('dob_mrz')}, dob: {passport_details.get('dob')}")
465
- # dob_for_match = passport_details.get('dob', passport_details.get('dob_mrz')).replace('/', '-')
466
- # pattern = re.compile(rf"{dob_for_match}(.*?)PCSDN|{dob_for_match}(.*?)SDN[A-Z]{{4,}}", re.DOTALL)
467
- # match = pattern.search(passport_text)
468
-
469
- # if not match:
470
- # pattern_phsdn = re.compile(rf"{dob_for_match}(.*?)PHSDN", re.DOTALL)
471
- # match = pattern_phsdn.search(passport_text)
472
-
473
- # if match:
474
- # substring = match.group(1) if match.group(1) is not None else match.group(2)
475
- # name_list = passport_details.get('full_name', '').split(' ')
476
- # capital_letters = re.findall(r'[A-Z]{2,}', substring)
477
- # capital_letters = [re.sub(r'\d+', '', i) for i in capital_letters]
478
- # capital_letters = [i for i in capital_letters if
479
- # not (len(i) <= 2 or i == 'SDN' or i == '' or i in name_list) or i == 'AL']
480
- # # print(f'CAPS: {capital_letters}')
481
- # # for item in ['SDN', 'MI', 'MY', 'MA', 'SS', 'MS', 'ME', 'SU']:
482
- # # if item in capital_letters:
483
- # # capital_letters.remove(item)
484
-
485
- # try:
486
- # if len(capital_letters) > 2 and ('AL' in capital_letters or 'NEW' in capital_letters) and (
487
- # capital_letters[0] == 'AL' or capital_letters[0] == 'NEW'):
488
- # place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
489
- # place_of_issue = capital_letters[2]
490
- # elif len(capital_letters) <= 3:
491
- # if len(capital_letters) > 2:
492
- # place_of_birth = capital_letters[0]
493
- # place_of_issue = capital_letters[1] + ' ' + capital_letters[2]
494
-
495
- # else:
496
- # place_of_birth = capital_letters[0]
497
- # place_of_issue = capital_letters[1]
498
- # else:
499
- # place_of_birth = capital_letters[0] + ' ' + capital_letters[1]
500
- # place_of_issue = capital_letters[2] + ' ' + capital_letters[3]
501
- # except:
502
- # place_of_birth, place_of_issue = extract_pob_and_poi(passport_text, dob_for_match,
503
- # passport_details.get('passport_number_mrz', ''))
504
-
505
- # passport_details['place_of_birth'] = place_of_birth
506
- # passport_details['place_of_issue'] = place_of_issue
507
- # else:
508
- # try:
509
- # place_of_birth, place_of_issue = extract_pob_and_poi(passport_text, dob_for_match,
510
- # passport_details.get('passport_number_mrz', ''))
511
- # passport_details['place_of_birth'] = place_of_birth
512
- # passport_details['place_of_issue'] = place_of_issue
513
- # except:
514
- # passport_details['place_of_birth'] = ''
515
- # passport_details['place_of_issue'] = ''
516
-
517
- # ## HANDLE ARABIC NAME FROM PASSPORT
518
- # pattern = re.compile(rf"SDN(.*?){passport_details.get('passport_number_mrz', '')}", re.DOTALL)
519
- # match = re.findall(pattern, passport_text)
520
-
521
- # if match:
522
- # substring = match[0]
523
- # arabic_regex = re.compile(r'^[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]+$')
524
- # result = []
525
-
526
- # for line in substring.split('\n'):
527
- # if arabic_regex.match(line) and len(line.split()) >= 4:
528
- # result.append(line)
529
-
530
- # name_ar = ''
531
- # if result:
532
- # name_ar = ' '.join(result)
533
- # passport_details['name_ar'] = name_ar
534
-
535
- # # print(f'\nARABIC NAME 1: {name_ar}\n')
536
-
537
- # if not passport_details.get('name_ar', ''):
538
- # name_keywords = passport_details.get('full_name', '').split(' ')
539
- # results = find_names_with_context(passport_text, name_keywords)
540
- # if results:
541
- # passport_details['name_ar'] = results[0][1]
542
- # # print(f'\nARABIC NAME 2: {results[0][1]}\n')
543
- # else:
544
- # name_keywords = ['Full', 'Name']
545
- # results = find_names_with_context(passport_text, name_keywords)
546
- # if results:
547
- # passport_details['name_ar'] = results[0][1]
548
- # # print(f'\nARABIC NAME 3: {results[0][1]}\n')
549
- # else:
550
- # name_keywords = ['الاسم']
551
- # results = find_names_with_context(passport_text, name_keywords)
552
- # if results:
553
- # passport_details['name_ar'] = results[0][1]
554
- # # print(f'\nARABIC NAME 4: {results[0][1]}\n')
555
-
556
- # ## HANDLE MRZ1 IF NOT COMPLETE
557
- # if len(mrz1) < 40:
558
- # mrz1_pattern = r'PC[A-Z]{3}[A-Z0-9<]{5,44}'
559
- # match = re.search(mrz1_pattern, passport_text.replace(" ", ""))
560
- # if match:
561
- # mrz1 = match.group(0)
562
-
563
- # if not mrz1:
564
- # mrz1 = ''
565
-
566
- # if mrz1 and len(mrz1) < 44:
567
- # mrz1 = mrz1 = f"{mrz1}{'<' * (44 - len(mrz1))}"
568
-
569
- # ## HANDLE MRZ2 IF NOT COMPLETE
570
- # if not mrz2:
571
- # try:
572
- # mrz2 = passport_details.get('passport_number_mrz', '') + passport_details.get('nationality',
573
- # '') + convert_to_mrz_date(
574
- # passport_details.get('dob_mrz', '')) + passport_details.get('gender', '') + convert_to_mrz_date(
575
- # passport_details.get('expiry_date_mrz', ''))
576
- # except:
577
- # mrz2 = ''
578
-
579
- # if len(mrz2) >= 28 and len(mrz2) < 40:
580
- # mrz2 = mrz2 = f"{mrz2}{'<' * (44 - len(mrz2))}"
581
-
582
- # passport_details['mrz'] = mrz1 + mrz2
583
- # passport_details['mrz1'] = mrz1
584
- # passport_details['mrz2'] = mrz2
585
-
586
- # ## EXTRACT ENGLISH NAME FROM PASSPORT HERE
587
-
588
- # # print(f"PASSPORT DETAILS HERE: {passport_details}")
589
-
590
- # try:
591
- # pattern_1 = re.compile(r'\b(SDN|THE|SUDAN|PC|OF|REPUBLIC|SON|TOKAR|AP|CT|PUR|\w)\b', re.IGNORECASE)
592
- # pattern = re.compile(r'^[A-Z\s]{3,}[A-Z\s]{3,}[A-Z\s]{3,}$', re.MULTILINE)
593
- # matches = pattern.findall(pattern_1.sub('', passport_text))
594
-
595
- # if matches:
596
- # # print(f'MATCHES: {matches}')
597
- # # filtered_matches = [
598
- # # match for match in matches
599
- # # if ('REPUBLIC OF THE SUDAN' not in match.upper() and 'SUDAN' not in match.upper() and 'THE REPUBLIC' not in match.upper() and 'THE' not in match.upper() and 'PASSPORTS' not in match.upper())
600
- # # and (len(match.replace('\n', ' ').strip().split(' ')) >= 3)
601
- # # ]
602
- # excluded_keywords = {'republic of the sudan', 'sudan', 'the republic', 'passports', 'republic of'}
603
- # filtered_matches = [
604
- # match for match in matches
605
- # if not any(keyword.upper() in match.upper() for keyword in excluded_keywords)
606
- # and len(match.replace('\n', ' ').strip().split()) >= 3
607
- # ]
608
-
609
- # if filtered_matches:
610
- # def get_long_string(lst):
611
- # if len(lst) > 1:
612
- # return max(lst, key=len)
613
- # else:
614
- # return lst[0]
615
- # return None
616
-
617
- # result = get_long_string(filtered_matches)
618
- # full_name_generic = result.strip().replace('\n', ' ')
619
- # # full_name_generic = filtered_matches[0].strip().replace('\n', ' ')
620
- # else:
621
- # full_name_generic = ''
622
-
623
- # passport_details['full_name_generic'] = full_name_generic
624
- # else:
625
- # passport_details['full_name_generic'] = passport_details.get('full_name', '')
626
- # except:
627
- # passport_details['full_name_generic'] = passport_details.get('full_name', '')
628
-
629
- # if passport_details.get('full_name_generic', ''):
630
- # passport_details['name'] = passport_details['full_name_generic']
631
- # name_split = passport_details['full_name_generic'].split(' ')
632
- # passport_details['first_name'] = name_split[0]
633
- # passport_details['last_name'] = name_split[-1]
634
- # passport_details['middle_name'] = ' '.join(name_split[1:-1])
635
- # else:
636
- # if passport_details.get('full_name', ''):
637
- # name_split = passport_details['full_name'].split(' ')
638
- # passport_details['first_name'] = name_split[0]
639
- # passport_details['last_name'] = name_split[-1]
640
- # passport_details['middle_name'] = ' '.join(name_split[1:-1])
641
- # else:
642
- # passport_details['first_name'] = ''
643
- # passport_details['last_name'] = ''
644
- # passport_details['middle_name'] = ''
645
-
646
- # passport_details['issuing_country'] = 'SDN'
647
-
648
- # if "gender" in passport_details:
649
- # gender = passport_details["gender"].strip().upper()
650
- # if gender == "F":
651
- # passport_details["gender"] = "FEMALE"
652
- # elif gender == "M":
653
- # passport_details["gender"] = "MALE"
654
-
655
- # if 'gender' in passport_details:
656
- # passport_details["gender"] = passport_details["gender"].strip().upper()
657
-
658
- # passport_details_genai = extract_passport_details_genai(passport_text)
659
-
660
- # if passport_details_genai and passport_details_genai.get('name_ar', ''):
661
- # passport_details['name_ar'] = passport_details_genai.get('name_ar', '')
662
-
663
- # if passport_details_genai and passport_details_genai.get('name_en', ''):
664
- # passport_details['name_en'] = passport_details_genai.get('name_en', '')
665
- # passport_details['full_name_generic'] = passport_details_genai.get('name_en', '')
666
- # passport_details['name'] = passport_details_genai.get('name_en', '')
667
-
668
- # if passport_details_genai and passport_details_genai.get('place_of_birth', ''):
669
- # passport_details['place_of_birth'] = passport_details_genai.get('place_of_birth', '')
670
-
671
- # if passport_details_genai and passport_details_genai.get('place_of_issue', ''):
672
- # passport_details['place_of_issue'] = passport_details_genai.get('place_of_issue', '')
673
-
674
- # full_name_generic_2 = passport_details.get('full_name_generic', '')
675
- # try:
676
- # name_list = full_name_generic_2.split(' ')
677
- # passport_details['first_name'] = name_list[0]
678
- # passport_details['last_name'] = name_list[-1]
679
- # passport_details['middle_name'] = ' '.join(name_list[1:-1])
680
-
681
- # except Exception as e:
682
- # if passport_details_genai and passport_details_genai.get('first_name', ''):
683
- # passport_details['first_name'] = passport_details_genai.get('first_name', '')
684
-
685
- # if passport_details_genai and passport_details_genai.get('middle_name', ''):
686
- # passport_details['middle_name'] = passport_details_genai.get('middle_name', '')
687
-
688
- # if passport_details_genai and passport_details_genai.get('last_name', ''):
689
- # passport_details['last_name'] = passport_details_genai.get('last_name', '')
690
-
691
- # full_name_generic_ar = passport_details.get('name_ar', '')
692
- # try:
693
- # name_parts = full_name_generic_ar.split(' ')
694
-
695
- # # Handle compound names
696
- # compound_prefixes = ['عبد', 'عبدال', 'فضل', 'بمسك']
697
- # compound_suffixes = ['الدين', 'الله', 'الرحمن', 'الجنه']
698
-
699
- # # Process first name
700
- # if len(name_parts) >= 2:
701
- # # Check for specific compound first names
702
- # if name_parts[0] == 'عبد' or (name_parts[0] == 'فضل' and name_parts[1] == 'الله'):
703
- # passport_details['first_name_ar'] = name_parts[0] + ' ' + name_parts[1]
704
- # first_name_end_idx = 2
705
- # # Handle the specific case of "بمسك الجنه"
706
- # elif name_parts[0] == 'بمسك' and len(name_parts) >= 2 and name_parts[1] == 'الجنه':
707
- # passport_details['first_name_ar'] = name_parts[0] + ' ' + name_parts[1]
708
- # first_name_end_idx = 2
709
- # else:
710
- # # Regular first name
711
- # passport_details['first_name_ar'] = name_parts[0]
712
- # first_name_end_idx = 1
713
- # else:
714
- # passport_details['first_name_ar'] = name_parts[0] if name_parts else ''
715
- # first_name_end_idx = 1
716
-
717
- # # Process last name - check if last parts form a compound name
718
- # if len(name_parts) >= 2:
719
- # # Check for repeating patterns at the end (like "محمد خير")
720
- # if len(name_parts) >= 4 and name_parts[-2] == name_parts[-4] and name_parts[-1] == name_parts[-3]:
721
- # # We have a repeating two-word pattern at the end
722
- # passport_details['last_name_ar'] = name_parts[-2] + ' ' + name_parts[-1]
723
- # last_name_start_idx = len(name_parts) - 2
724
- # # Check for compound last names
725
- # elif len(name_parts) >= 3:
726
- # # Check for عبد + something
727
- # if name_parts[-2] == 'عبد':
728
- # passport_details['last_name_ar'] = 'عبد ' + name_parts[-1]
729
- # last_name_start_idx = len(name_parts) - 2
730
- # # Check for something + الدين/الله/etc.
731
- # elif name_parts[-1] in compound_suffixes:
732
- # passport_details['last_name_ar'] = name_parts[-2] + ' ' + name_parts[-1]
733
- # last_name_start_idx = len(name_parts) - 2
734
- # else:
735
- # passport_details['last_name_ar'] = name_parts[-1]
736
- # last_name_start_idx = len(name_parts) - 1
737
- # else:
738
- # passport_details['last_name_ar'] = name_parts[-1]
739
- # last_name_start_idx = len(name_parts) - 1
740
- # else:
741
- # passport_details['last_name_ar'] = ''
742
- # last_name_start_idx = len(name_parts)
743
-
744
- # # Middle name is everything between first and last name
745
- # if first_name_end_idx < last_name_start_idx:
746
- # passport_details['middle_name_ar'] = ' '.join(name_parts[first_name_end_idx:last_name_start_idx])
747
- # else:
748
- # passport_details['middle_name_ar'] = ''
749
- # except Exception as e:
750
- # if passport_details_genai and passport_details_genai.get('first_name_ar', ''):
751
- # passport_details['first_name_ar'] = passport_details_genai.get('first_name_ar', '')
752
-
753
- # if passport_details_genai and passport_details_genai.get('last_name_ar', ''):
754
- # passport_details['last_name_ar'] = passport_details_genai.get('last_name_ar', '')
755
-
756
- # if passport_details_genai and passport_details_genai.get('middle_name_ar', ''):
757
- # passport_details['middle_name_ar'] = passport_details_genai.get('middle_name_ar', '')
758
-
759
- # print(f"passport details: {passport_details}")
760
-
761
- # if passport_details_genai and passport_details_genai.get('issue_date', ''):
762
- # passport_details['issue_date'] = passport_details_genai.get('issue_date', '')
763
-
764
- # if passport_details_genai and passport_details_genai.get('nationality', ''):
765
- # if passport_details.get('nationality', '') != passport_details_genai.get('nationality', ''):
766
- # passport_details['nationality'] = passport_details_genai.get('nationality', '')
767
-
768
- # if passport_details_genai and passport_details_genai.get('mrz1', ''):
769
- # passport_details['mrz1'] = passport_details_genai.get('mrz1', '')
770
-
771
- # if passport_details_genai and passport_details_genai.get('mrz2', ''):
772
- # passport_details['mrz2'] = passport_details_genai.get('mrz2', '')
773
-
774
- # if passport_details_genai and passport_details_genai.get('mrz', ''):
775
- # passport_details['mrz'] = passport_details_genai.get('mrz', '')
776
-
777
- # if passport_details_genai and passport_details_genai.get('id_number', ''):
778
- # passport_details['id_number'] = passport_details_genai.get('id_number', '')
779
-
780
- # if passport_details_genai and passport_details_genai.get('passport_number_mrz', ''):
781
- # if passport_details.get('passport_number_mrz', '') != passport_details_genai.get('id_number',
782
- # passport_details_genai.get(
783
- # 'passport_number',
784
- # '')):
785
- # passport_details['passport_number_mrz'] = passport_details_genai.get('id_number',
786
- # passport_details_genai.get(
787
- # 'passport_number', ''))
788
-
789
- # if passport_details_genai and passport_details_genai.get('gender', ''):
790
- # if passport_details.get('gender', '') != passport_details_genai.get('gender', ''):
791
- # passport_details['gender'] = passport_details_genai.get('gender', '')
792
- # # try:
793
- # # full_name_generic = ''
794
- # # passport_details['full_name_generic'] = full_name_generic
795
- # # except:
796
- # # passport_details['full_name_generic'] = ''
797
-
798
- # return passport_details
799
-
800
-
801
- # def make_api_request_with_retries(prompt: str, max_retries: int = 3, delay_seconds: float = 2):
802
- # """
803
- # Helper function to make API requests with retry logic using OpenAI
804
- # """
805
- # for attempt in range(max_retries):
806
- # try:
807
- # response = openai.ChatCompletion.create(
808
- # model="gpt-4o",
809
- # temperature=0.4,
810
- # max_tokens=2000,
811
- # messages=[
812
- # {
813
- # "role": "user",
814
- # "content": prompt
815
- # }
816
- # ]
817
- # )
818
- # result = response.choices[0].message.content
819
-
820
- # try:
821
- # return json.loads(result)
822
- # except json.JSONDecodeError:
823
- # try:
824
- # json_match = re.search(r'```(json|python|plaintext)?\s*(.*?)\s*```|\s*({.*?})', result, re.DOTALL)
825
- # if json_match:
826
- # json_str = json_match.group(2) or json_match.group(3)
827
- # try:
828
- # return json.loads(json_str)
829
- # except:
830
- # return eval(json_str.replace("'", '"'))
831
- # except:
832
- # pass
833
-
834
- # return json.loads(result)
835
-
836
- # except Exception as e:
837
- # print(f"Error during API request (attempt {attempt + 1} of {max_retries}): {str(e)}")
838
- # if attempt < max_retries - 1:
839
- # time.sleep(delay_seconds)
840
- # else:
841
- # raise Exception(f"Max retries exceeded. Last error: {str(e)}")
842
-
843
-
844
- # def extract_passport_details_genai(passport_data):
845
- # """
846
- # Function to extract passport details using OpenAI API
847
- # """
848
- # try:
849
- # prompt = f"""From the attached text, please extract the data in a structured format. The response should be a dictionary containing:
850
- # - name_ar (Arabic name if available)
851
- # - name_en (English name)
852
- # - place_of_birth (English place of birth)
853
- # - place_of_issue (English place of issue)
854
- # - gender (FEMALE or MALE)
855
- # - mrz1 (first line of MRZ)
856
- # - mrz2 (second line of MRZ)
857
- # - passport_number (should be in format: letter followed by 8 digits)
858
- # - dob (in format dd/mm/yyyy)
859
- # - issue_date (in format dd/mm/yyyy)
860
- # - expiry_date (in format dd/mm/yyyy)
861
- # - nationality (ISO 3166-1 alpha-3 country code)
862
- # - first_name (from English name)
863
- # - middle_name (from English name)
864
- # - last_name (from English name)
865
- # - first_name_ar (Arabic first name)
866
- # - last_name_ar (Arabic last name)
867
- # - middle_name_ar (Arabic middle name)
868
-
869
- # Make sure to extract the correct names from both Arabic and English text.
870
- # Important NOTE: If the number of words in name_en and name_ar are not equal, translate the English name (name_en) into Arabic and update name_ar with the translated text.
871
-
872
- # The MRZ lines should be complete.
873
- # The response should only contain a dictionary with these fields.
874
-
875
- # Here's the text: {passport_data}"""
876
-
877
- # back_data = make_api_request_with_retries(prompt)
878
-
879
- # if back_data:
880
- # try:
881
- # if back_data.get('passport_number', ''):
882
- # back_data['id_number'] = back_data.pop('passport_number', '')
883
- # except:
884
- # pass
885
-
886
- # try:
887
- # if back_data.get('mrz1', '') and back_data.get('mrz2', ''):
888
- # back_data['mrz'] = back_data.get('mrz1', '') + back_data.get('mrz2', '')
889
- # except:
890
- # pass
891
-
892
- # back_data['issuing_country'] = 'SDN'
893
-
894
- # try:
895
- # if "gender" in back_data:
896
- # gender = back_data["gender"].strip().upper()
897
- # if gender == "F":
898
- # back_data["gender"] = "FEMALE"
899
- # elif gender == "M":
900
- # back_data["gender"] = "MALE"
901
- # elif gender in ['MALE', 'FEMALE']:
902
- # back_data["gender"] = gender.upper()
903
-
904
- # except:
905
- # pass
906
- # except Exception as e:
907
- # print(f"Error in processing the extracted data: {e}")
908
- # back_data = {
909
- # 'name_ar': '',
910
- # 'name_en': '',
911
- # 'first_name': '',
912
- # 'middle_name': '',
913
- # 'last_name': '',
914
- # 'first_name_ar': '',
915
- # 'last_name_ar': '',
916
- # 'middle_name_ar': '',
917
- # 'dob': '',
918
- # 'issue_date': '',
919
- # 'expiry_date': '',
920
- # 'place_of_birth': '',
921
- # 'place_of_issue': '',
922
- # 'nationality': '',
923
- # 'mrz1': '',
924
- # 'mrz2': '',
925
- # 'mrz': ''
926
- # }
927
-
928
- # return back_data
929
1
 
930
2
  import base64
931
3
  import time