idvpackage 3.0.11-py3-none-any.whl → 3.0.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idvpackage/common.py +4 -962
- idvpackage/iraq_id_extraction_withopenai.py +374 -893
- idvpackage/jor_passport_extraction.py +1 -6
- idvpackage/liveness_spoofing_v2.py +2 -45
- idvpackage/ocr.py +1011 -2427
- idvpackage/ocr_utils.py +144 -486
- idvpackage/pse_passport_extraction.py +18 -292
- idvpackage/qatar_id_extraction.py +4 -956
- idvpackage/sudan_passport_extraction.py +0 -928
- idvpackage/syr_passport_extraction.py +27 -402
- idvpackage/uae_id_extraction.py +87 -151
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.12.dist-info}/METADATA +1 -1
- idvpackage-3.0.12.dist-info/RECORD +34 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.12.dist-info}/WHEEL +1 -1
- idvpackage/ekyc.py +0 -78
- idvpackage/genai_utils.py +0 -309
- idvpackage/iraq_id_extraction.py +0 -992
- idvpackage/iraq_passport_extraction.py +0 -588
- idvpackage/lazy_imports.py +0 -44
- idvpackage/lebanon_passport_extraction.py +0 -161
- idvpackage/sau_id_extraction.py +0 -248
- idvpackage/sudan_id_extraction.py +0 -764
- idvpackage-3.0.11.dist-info/RECORD +0 -42
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.12.dist-info}/licenses/LICENSE +0 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.12.dist-info}/top_level.txt +0 -0
@@ -1,764 +0,0 @@
-from datetime import datetime, timedelta
-import re
-from google.cloud import vision_v1
-from googletrans import Translator
-from deep_translator import GoogleTranslator
-import io
-from PIL import Image
-import json
-import openai
-import time
-
-import json
-import openai
-import time
-translator = Translator()
-import base64
-
-def find_gender_from_back(text):
-    gender = ""
-    gender_pattern = r"(\d)([A-Za-z])(\d)"
-    gender_match = re.search(gender_pattern, text)
-    if gender_match:
-        gender = gender_match.group(2)
-
-    if not gender:
-        gender_pattern = r"(\d)([MFmf])(\d)"
-        gender_match = re.search(gender_pattern, text)
-        if gender_match:
-            gender = gender_match.group(2)
-
-    return gender
-
-
-def func_common_dates(extract_no_space):
-    dob = ""
-    expiry_date = ""
-    try:
-        matches = re.findall(r"\d{2}/\d{2}/\d{4}", extract_no_space)
-        y1 = matches[0][-4:]
-        y2 = matches[1][-4:]
-        if int(y1) < int(y2):
-            dob = matches[0]
-            expiry_date = matches[1]
-        else:
-            dob = matches[1]
-            expiry_date = matches[0]
-    except:
-        dob = ""
-        expiry_date = ""
-
-    return dob, expiry_date
-
-
-def convert_dob(input_date):
-    day = input_date[4:6]
-    month = input_date[2:4]
-    year = input_date[0:2]
-
-    current_year = datetime.now().year
-    current_century = current_year // 100
-    current_year_last_two_digits = current_year % 100
-
-    century = current_century
-    # If the given year is greater than the last two digits of the current year, assume last century
-    if int(year) > current_year_last_two_digits:
-        century = current_century - 1
-
-    final_date = f"{day}/{month}/{century}{year}"
-
-    return final_date
-
-
-def func_expiry_date(extract):
-    extract_no_space = extract.replace(" ", "")
-    dob, expiry_date = func_common_dates(extract_no_space)
-    if expiry_date == "":
-        match_doe = re.findall(r"\d{7}[A-Z]{2,3}", extract_no_space)
-        for i in match_doe:
-            raw_doe = i[0:6]
-            print(raw_doe)
-            expiry_date = raw_doe[4:6] + "/" + raw_doe[2:4] + "/20" + raw_doe[0:2]
-            try:
-                dt_obj = datetime.strptime(expiry_date, "%d/%m/%Y")
-                break
-            except:
-                expiry_date = ""
-
-    return expiry_date
-
-
-def convert_expiry_date(input_date):
-    day = input_date[4:6]
-    month = input_date[2:4]
-    year = input_date[0:2]
-
-    current_year = datetime.now().year
-    current_century = current_year // 100
-    current_year_last_two_digits = current_year % 100
-    century = current_century
-
-    if int(year) <= current_year_last_two_digits:
-        century = current_century
-    else:
-        century = current_century
-    final_date = f"{day}/{month}/{century}{year}"
-
-    return final_date
-
-
-def func_dob(extract):
-    extract_no_space = extract.replace(" ", "")
-    dob, expiry_date = func_common_dates(extract_no_space)
-    if dob == "":
-        match_dob = re.findall(r"\d{7}(?:M|F)\d", extract_no_space)
-        for i in match_dob:
-            # print(i)
-            raw_dob = i[0:6]
-            # print(raw_dob)
-            year = str(datetime.today().year)[2:4]
-            temp = "19"
-            if int(raw_dob[0:2]) > int(year):
-                temp = "19"
-            else:
-                temp = "20"
-            dob = raw_dob[4:6] + "/" + raw_dob[2:4] + "/" + temp + raw_dob[0:2]
-            try:
-                dt_obj = datetime.strptime(dob, "%d/%m/%Y")
-                break
-            except:
-                # print(f'invalid date {dob}')
-                dob = ""
-    else:
-        pattern = r"\b(\d{14}).*?\b"
-
-        new_dob_match = re.search(pattern, extract_no_space)
-
-        if new_dob_match:
-            new_dob = new_dob_match.group(1)
-            new_dob = new_dob[:7]
-            dob = convert_dob(new_dob)
-
-    return dob
-
-
-def remove_special_characters_mrz2(string):
-    # This pattern matches any character that is not a letter, digit, or space
-    pattern = r"[^a-zA-Z0-9\s]"
-    return re.sub(pattern, "", string)
-
-
-def count_digits(element):
-    digits = [char for char in element if char.isdigit()]
-    return len(digits)
-
-
-def sdn_back_id_extraction(back_id_data):
-    mrz_pattern = (
-        r"(IDSDN.*\n*.*\n*.*\n*.*|IDSDN.*\n*.*\n*.*\n*.*|IDSDN.*\n*.*\n*.*\n*.*)"
-    )
-    nationality_pattern = r"([A-Z]+)<<"
-
-    mrz1, mrz2, mrz3 = "", "", ""
-
-    try:
-        mrz = re.findall(
-            mrz_pattern, back_id_data.replace(" ", "").strip(), re.MULTILINE
-        )
-        mrz_str = mrz[0].replace(" ", "")
-    except:
-        mrz_str = ""
-
-    if mrz_str:
-        mrz_list = mrz_str.replace(" ", "").split("\n")
-        try:
-            mrz1 = mrz_list[0]
-            if len(mrz_list) == 3:
-                mrz1, mrz2, mrz3 = mrz_list[0], mrz_list[1], mrz_list[2]
-        except:
-            mrz1 = ""
-
-        try:
-            mrz2 = [
-                ele
-                for ele in [ele for ele in mrz_list if ele not in [mrz1, mrz3]]
-                if remove_special_characters_mrz2(ele) != ""
-            ]
-            if len(mrz2) > 1:
-                mrz2 = (
-                    max(mrz2, key=count_digits)
-                    + [ele for ele in mrz2 if ele != max(mrz2, key=count_digits)][0]
-                )
-
-                pattern = r"\d{7}[MF]\d{7}[\S]{3}<+?\d"
-                mrz2_temp = re.search(pattern, mrz2.replace(">", ""))
-                if mrz2_temp:
-                    mrz2 = mrz2_temp.group(0)
-
-                mrz2 = mrz2.split("<")[0] + "<<<<<<<<<<" + mrz2.split("<")[-1]
-
-                # mrz2=mrz2[0].split('<')[0]+'<<<<<<<<<<'+mrz2[-1][-1]
-            else:
-                mrz2 = mrz2[0].split("<")[0] + "<<<<<<<<<<" + mrz2[0][-1]
-        except:
-            mrz2 = ""
-
-    ## condition to replace O with 0
-    try:
-        pattern = r"(IDSDN[A-Z]{1,2})O(?=[0-9])"
-        replacement = lambda m: m.group(1) + "0"
-        mrz1 = re.sub(pattern, replacement, mrz1)
-    except:
-        pass
-
-    ## condition to replace '>' with 7
-    if mrz2 and mrz2.endswith(">"):
-        mrz2 = mrz2.split("<")[0] + "<<<<<<<<<<" + "7"
-
-    if not mrz3 or (mrz3.startswith(">") or mrz3.startswith("<")):
-        pattern = r"^[A-Za-z]+<+[A-Za-z]+.*$"
-        matches = re.findall(pattern, mrz_str, re.MULTILINE)
-        try:
-            mrz3 = list(filter(None, matches))[0]
-        except:
-            try:
-                matches = re.findall(pattern, back_id_data, re.MULTILINE)
-                mrz3 = list(filter(None, matches))[0]
-            except:
-                mrz3 = ""
-
-    ## condition to add filler to mrz3, making it total length of 30 chars
-    if len(mrz3) < 30:
-        mrz3 = mrz3.ljust(30, "<")
-
-    try:
-        dob = func_dob(mrz2)
-    except:
-        dob = ""
-
-    if not dob:
-        matches = re.findall(r"\d{4}/\d{2}/\d{2}", back_id_data)
-        sorted_dates = sorted(matches)
-        dob = sorted_dates[0]
-
-    expiry = func_expiry_date(mrz_str)
-    if not expiry:
-        matches = re.findall(r"\d{4}/\d{2}/\d{2}", back_id_data)
-        sorted_dates = sorted(matches)
-        expiry = sorted_dates[-1]
-
-    # issue date
-    issue_date = ""  # Initialize with default value
-    try:
-        matches = re.findall(r"\d{4}/\d{2}/\d{2}", back_id_data)
-        sorted_dates = sorted(matches)
-        if len(sorted_dates) > 2:
-            issue_date = sorted_dates[1]
-    except:
-        pass
-
-    try:
-        nationality = mrz2.split("<")[0][-3:]
-    except:
-        nationality = ""
-
-    if mrz3:
-        full_name_mrz = mrz3.replace("<", " ").replace(">", " ").strip()
-
-    else:
-        full_name_mrz = ""
-
-    try:
-        pattern = r"(?<=Name: )\w+(?: \w+)*|(?<=Name )\w+(?: \w+)*"
-
-        match = re.search(pattern, back_id_data, re.IGNORECASE)
-        name = match.group(0) or match.group(1)
-    except:
-        try:
-            pattern = r"(?<=NAME):*[ \n]*([A-Z ]+)"
-
-            match = re.search(pattern, back_id_data, re.IGNORECASE)
-            if match:
-                name = match.group(1).strip().replace(":", "")
-            else:
-                name = ""
-        except:
-            name = ""
-
-    if full_name_mrz and not name:
-        name = (
-            " ".join(full_name_mrz.split(" ")[1:]) + " " + full_name_mrz.split(" ")[0]
-            if full_name_mrz
-            else ""
-        )
-        name = name.strip()
-
-    if name:
-        first_name = name.split(" ")[0]
-        last_name = name.split(" ")[-1]
-        middle_name = " ".join(name.split(" ")[1:-1])
-    else:
-        first_name, last_name, middle_name = "", "", ""
-
-    if "issue_date" not in locals():
-        issue_date = ""
-
-    try:
-        dob_pattern = r"(\d+)[MF]"
-        dob_match = re.search(dob_pattern, mrz2)
-        dob_mrz = convert_dob(dob_match.group(1)) if dob_match else ""
-
-        doe_pattern = r"[MF](\d+)"
-        doe_match = re.search(doe_pattern, mrz2)
-        expiry_date_mrz = convert_expiry_date(doe_match.group(1)) if doe_match else ""
-    except:
-        dob_mrz, expiry_date_mrz = "", ""
-
-    gender = ""
-    try:
-        gender = find_gender_from_back(mrz2)
-    except:
-        gender = find_gender_from_back(back_id_data)
-
-    mrz_str = f"{mrz1}\n{mrz2}\n{mrz3}"
-
-    try:
-        if expiry and not expiry_date_mrz:
-            expiry_date_mrz = expiry
-
-        if dob and not dob_mrz:
-            dob_mrz = dob
-    except:
-        pass
-
-    if issue_date == "":
-        print(f"Calculating issue date....")
-        from dateutil.relativedelta import relativedelta
-
-        try:
-            exp = datetime.strptime(expiry_date_mrz, "%d/%m/%Y")
-        except:
-            exp = datetime.strptime(expiry_date_mrz, "%d-%m-%Y")
-
-        issue_date = exp - relativedelta(years=5) + timedelta(days=1)
-        issue_date = issue_date.strftime("%d/%m/%Y")
-
-    back_data_dict = {
-        "mrz": [mrz_str],
-        "mrz1": mrz1.replace("*", "<"),
-        "mrz2": mrz2,
-        "mrz3": mrz3,
-        # "dob_generic": dob,
-        # "full_name_mrz": full_name_mrz,
-        "full_name_generic": name,
-        "first_name": first_name,
-        "middle_name": middle_name,
-        "last_name": last_name,
-        "issuing_country": "SDN",
-        # "expiry_date_generic": expiry,
-        "nationality": nationality,
-        "dob_back": dob_mrz,
-        "issue_date": issue_date,
-        "expiry_date": expiry_date_mrz,
-        "gender": gender,
-    }
-    if "gender" in back_data_dict:
-        gender = back_data_dict["gender"].strip().upper()
-        if gender == "F":
-            back_data_dict["gender"] = "FEMALE"
-        elif gender == "M":
-            back_data_dict["gender"] = "MALE"
-
-    return back_data_dict
-
-
-def crop_second_part(img):
-    width, height = img.size
-    half_width = width // 2
-    second_part = img.crop((half_width, 0, width, height))
-    return second_part
-
-
-def extract_text_from_image_data(client, image):
-    """Detects text in the file."""
-
-    with io.BytesIO() as output:
-        image.save(output, format="PNG")
-        content = output.getvalue()
-
-    image = vision_v1.types.Image(content=content)
-
-    response = client.text_detection(image=image)
-    texts = response.text_annotations
-
-    return texts[0].description
-
-
-def detect_id_card(client, image_data, id_text, part=None):
-    if id_text:
-        vertices = id_text[0].bounding_poly.vertices
-        left = vertices[0].x
-        top = vertices[0].y
-        right = vertices[2].x
-        bottom = vertices[2].y
-
-        padding = 30
-        left -= padding
-        top -= padding
-        right += padding
-        bottom += padding
-
-        # img = image_data
-
-        with Image.open(io.BytesIO(image_data)) as img:
-            id_card = img.crop((max(0, left), max(0, top), right, bottom))
-            width, height = id_card.size
-            if width < height:
-                id_card = id_card.rotate(90, expand=True)
-
-            part_text = id_text[0].description
-            part_img = crop_second_part(id_card)
-            part_text = extract_text_from_image_data(client, part_img)
-
-    return part_text
-
-
-def extract_occupation(text):
-    match = re.search(r"المهنة\s*([^\n]+)", text)
-    if match:
-        return match.group(1).strip().replace(":", "")
-    else:
-        match = re.search(r"المهن[ةــ]*\s*\n\s*([^\n]+)", text)
-        if match:
-            return (
-                match.group(1)
-                .replace(":", "")
-                .replace("ــة", "")
-                .replace("ة.", "")
-                .replace("ة", "")
-                .replace("العقــ", "")
-                .replace("ـ", "")
-                .strip()
-            )
-        else:
-            return None
-
-
-def extract_occupation_from_text(part_text):
-    lines = part_text.split("\n")
-
-    arabic_number_pattern = re.compile(r"[\u0660-\u0669]+")
-    final_occupation = ""
-
-    for i in range(len(lines) - 1, 0, -1):
-        if arabic_number_pattern.search(lines[i]):
-            occupation = lines[i - 1].strip()
-            occupation = re.sub(r"\d+", "", occupation)
-            occupation = re.sub(r"[A-Za-z]+", "", occupation)
-            if occupation and occupation in [
-                "العنــ",
-                "الغد",
-                "المهنــ",
-                "العنوان",
-                "العلب",
-                "العنب",
-                "ـوان",
-                "العيد",
-                "العز",
-                "العن",
-                "العد",
-            ]:
-                search_key = lines.index(occupation)
-                final_occupation = lines[search_key - 1]
-                break
-            else:
-                final_occupation = occupation
-
-    return (
-        final_occupation.replace(":", "")
-        .replace("المهنة", "")
-        .replace("ــة", "")
-        .replace("ة.", "")
-        .replace("ة", "")
-        .replace("العقــ", "")
-        .replace("ـ", "")
-        .strip()
-        if final_occupation
-        else ""
-    )
-
-
-def extract_occupation_v2(client, image_data, texts):
-    part_text = detect_id_card(client, image_data, texts)
-    occupation_res = extract_occupation_from_text(part_text)
-    if not occupation_res:
-        occupation_res = extract_occupation_from_text(texts[0].description)
-
-    if occupation_res in ["الرقم الوطني"] or not occupation_res:
-        occupation_res = extract_occupation(part_text)
-        if not occupation_res:
-            occupation_res = extract_occupation(texts[0].description)
-
-    return occupation_res
-
-
-def extract_place_of_birth(text):
-    match = re.search(r"مكان الميلاد\s*([^\n]+)|مكان الميادد\s*([^\n]+)", text)
-    if match:
-        return (
-            match.group(1).strip().replace(":", "")
-            if match.group(1) is not None
-            else match.group(2).strip().replace(":", "")
-        )
-    return None
-
-
-def extract_dob(text):
-    dob = ""
-    extract_no_space = text.replace(" ", "")
-    try:
-        matches = re.findall(r"\d{4}/\d{2}/\d{2}", extract_no_space)
-        dob = matches[0]
-    except:
-        dob = ""
-    return dob
-
-
-def extract_name_from_front(text, dob):
-    name = ""
-    lines = text.split("\n")
-    for line in lines:
-        if dob in line:
-            search_key = lines.index(line)
-            part_name = lines[search_key - 1]
-            if part_name:
-                part_name = re.sub(r"\d+", "", part_name)
-                part_name = re.sub(r"[A-Za-z]+", "", part_name)
-                name = (
-                    part_name.replace("الإســــــــــــم", "")
-                    .replace("الاسم", "")
-                    .replace("الإسم", "")
-                    .replace("ـم", "")
-                    .replace(":", "")
-                    .strip()
-                )
-                if (
-                    name
-                    in [
-                        "تاريخ الميلاد",
-                        "الإســ",
-                        "الإسـ",
-                        "الإس",
-                        "الإصـ",
-                        "تاریخ",
-                        "الرقم الوطني",
-                    ]
-                    or len(name.split(" ")) <= 2
-                ):
-                    name = lines[search_key - 2]
-                    if name:
-                        name = (
-                            name.replace("الإســــــــــــم", "")
-                            .replace("الاسم", "")
-                            .replace("الإسم", "")
-                            .replace("ـم", "")
-                            .replace(":", "")
-                            .strip()
-                        )
-                        name = re.sub(r"\d+", "", name)
-                        name = re.sub(r"[A-Za-z]+", "", name)
-            break
-    return name.strip()
-
-
-def sdn_front_id_extraction(
-    client, ar_front_id_data, image_data, texts, compressed_image
-):
-    try:
-        front_id_data = translator.translate(ar_front_id_data, src="ar", dest="en").text
-    except Exception as e:
-        front_id_data = GoogleTranslator("ar", "en").translate(ar_front_id_data)
-
-    id_number_pattern = r"\b\d{11}\b"
-
-    id_number_match = re.search(
-        id_number_pattern, front_id_data.replace(" ", ""), re.IGNORECASE
-    )
-    if id_number_match:
-        id_number = id_number_match.group(0)
-    else:
-        try:
-            id_number_match = re.search(
-                id_number_pattern, ar_front_id_data.replace(" ", ""), re.IGNORECASE
-            )
-            id_number = id_number_match.group(0)
-        except:
-            id_number = ""
-
-    try:
-        occupation = extract_occupation_v2(client, image_data, texts)
-        if occupation:
-            try:
-                occupation_en = GoogleTranslator(dest="en").translate(occupation)
-            except:
-                occupation_en = ""
-        else:
-            occupation, occupation_en = "", ""
-    except:
-        occupation, occupation_en = "", ""
-
-    try:
-        place_of_birth = extract_place_of_birth(ar_front_id_data)
-        if place_of_birth:
-            try:
-                place_of_birth_en = GoogleTranslator(dest="en").translate(
-                    place_of_birth
-                )
-            except:
-                place_of_birth_en = ""
-        else:
-            place_of_birth, place_of_birth_en = "", ""
-    except:
-        place_of_birth, place_of_birth_en = "", ""
-
-    try:
-        dob = extract_dob(ar_front_id_data)
-    except:
-        dob = ""
-
-    try:
-        full_name = extract_name_from_front(ar_front_id_data, dob)
-        if full_name:
-            if "مكان الميلاد" in full_name:
-                lines = ar_front_id_data.split("\n")
-                search_pos = lines.index(full_name)
-                full_name = lines[search_pos - 1]
-        else:
-            full_name
-    except:
-        full_name
-
-    front_data_dict = {
-        "id_number": id_number,
-        "occupation_ar": occupation,
-        "occupation_en": occupation_en,
-        "occupation": occupation_en,
-        "place_of_birth": place_of_birth,
-        "place_of_birth_en": place_of_birth_en,
-        "dob": dob,
-        "name_ar": full_name,
-    }
-
-    empty_string_keys = [key for key, value in front_data_dict.items() if value == ""]
-    if empty_string_keys:
-        prompt = (
-            """
-        From the provided text: " %s ", extract and structure the following fields as a dictionary:
-
-        - 'id_number': The ID number (e.g., national ID, passport number, etc.)
-        - 'occupation_ar': The occupation in Arabic
-        - 'occupation_en': The occupation in English
-        - 'place_of_birth': The place of birth in Arabic
-        - 'place_of_birth_en': The place of birth in English
-        - 'dob': The date of birth (in the format YYYY-MM-DD or any standard date format provided)
-        - 'name_ar': The full name in Arabic
-
-        The response should STRICTLY follow this format:
-        {
-            "id_number": "<value>",
-            "occupation_ar": "<value>",
-            "occupation_en": "<value>",
-            "place_of_birth": "<value>",
-            "place_of_birth_en": "<value>",
-            "dob": "<value>",
-            "name_ar": "<value>"
-        }
-        Ensure that all values are accurately extracted and formatted. If a value is missing, return `null` for that field.
-
-        Example:
-        {
-            "id_number": "12345678901",
-            "occupation_ar": "مهندس",
-            "occupation_en": "Engineer",
-            "place_of_birth": "الرياض",
-            "place_of_birth_en": "Riyadh",
-            "dob": "1990-05-15",
-            "name_ar": "محمد بن أحمد"
-        }
-        """
-            % front_id_data
-        )
-
-        start = time.time()
-        front_data_dict = get_openai_response_with_retries(
-            prompt=prompt, compressed_image=compressed_image
-        )
-        end = time.time() - start
-        print(f"Openai api call took an additional {end}s")
-        front_data_dict["occupation"] = front_data_dict["occupation_en"]
-
-    return front_data_dict
-
-
-def get_openai_response_with_retries(
-    max_retries=3, prompt="", delay_seconds: float = 2, compressed_image=""
-):
-    img_bytes = compressed_image.getvalue()
-
-    # Encode the bytes to base64
-    img_base64_bytes = base64.b64encode(img_bytes).decode("utf-8")
-    for attempt in range(max_retries):
-        try:
-            response = openai.ChatCompletion.create(
-                model="gpt-4.1-nano",
-                temperature=0.4,
-                max_tokens=2000,
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "text", "text": prompt},
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/jpeg;base64,{img_base64_bytes}",
-                                },
-                            },
-                        ],
-                    }
-                ],
-            )
-
-            result = response.choices[0].message.content
-
-            try:
-                return json.loads(result)
-            except json.JSONDecodeError:
-                try:
-                    json_match = re.search(
-                        r"```(json|python|plaintext)?\s*(.*?)\s*```|\s*({.*?})",
-                        result,
-                        re.DOTALL,
-                    )
-                    if json_match:
-                        json_str = json_match.group(2) or json_match.group(3)
-                        try:
-                            return json.loads(json_str)
-                        except:
-                            return eval(json_str.replace("'", '"'))
-                except Exception as e:
-                    return {
-                        "error": "GPT's response incorrectly formatted.",
-                        "error_details": e,
-                    }
-
-        except Exception as e:
-            print(
-                f"Error during API request (attempt {attempt + 1} of {max_retries}): {str(e)}"
-            )
-
-            if attempt < max_retries - 1:
-                time.sleep(delay_seconds)
-
-            else:
-                raise Exception(f"Max retries exceeded. Last error: {str(e)}")