idvpackage 3.0.11__py3-none-any.whl → 3.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idvpackage/common.py +8 -966
- idvpackage/iraq_id_extraction_withopenai.py +374 -893
- idvpackage/jor_passport_extraction.py +1 -6
- idvpackage/liveness_spoofing_v2.py +2 -45
- idvpackage/ocr.py +1016 -2430
- idvpackage/ocr_utils.py +148 -489
- idvpackage/pse_passport_extraction.py +18 -292
- idvpackage/qatar_id_extraction.py +4 -956
- idvpackage/sudan_passport_extraction.py +0 -928
- idvpackage/syr_passport_extraction.py +27 -402
- idvpackage/uae_id_extraction.py +87 -151
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/METADATA +1 -1
- idvpackage-3.0.13.dist-info/RECORD +34 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/WHEEL +1 -1
- idvpackage/ekyc.py +0 -78
- idvpackage/genai_utils.py +0 -309
- idvpackage/iraq_id_extraction.py +0 -992
- idvpackage/iraq_passport_extraction.py +0 -588
- idvpackage/lazy_imports.py +0 -44
- idvpackage/lebanon_passport_extraction.py +0 -161
- idvpackage/sau_id_extraction.py +0 -248
- idvpackage/sudan_id_extraction.py +0 -764
- idvpackage-3.0.11.dist-info/RECORD +0 -42
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/licenses/LICENSE +0 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/top_level.txt +0 -0
idvpackage/qatar_id_extraction.py
@@ -1,961 +1,9 @@
-# from PIL import Image
-# from deep_translator import GoogleTranslator
-# import pycountry
-# from rapidfuzz import process, fuzz
-# from idvpackage.common import extract_text_from_image_data
-# from io import BytesIO
-# import re
-# import time
-# import datetime
-# from langchain.tools import tool
-# from langchain.prompts import ChatPromptTemplate
-# from langchain.chat_models import ChatOpenAI
-# from pydantic import BaseModel, Field, validator
-# from langchain.utils.openai_functions import convert_pydantic_to_openai_function
-# from typing import Optional, Literal
-# from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-# from datetime import datetime, timedelta
-# from langchain.schema.agent import AgentFinish
-# import openai
-# import json
-
-
-# class QatarIDInfo(BaseModel):
-# """
-# Extract info from ocr-extracted text from a Qatar ID
-# """
-# name: str = Field(..., description="Full name in English")
-# name_ar: str = Field(..., description="Full name in Arabic")
-# nationality: str = Field(...,
-# description="Nationality in ISO 3166-1 alpha-3 format (e.g., 'PAK' 'QAT', 'SYR', 'PHL')",
-# example="SYR")
-# id_number: str = Field(..., description="National ID number")
-# dob: str = Field(..., description="Date of birth")
-# expiry_date: str = Field(..., description="Card expiry date")
-# occupation: str = Field(..., description="Occupation in Arabic")
-# occupation_en: str = Field(..., description="Occupation, translated from Arabic to English")
-
-
-# # @tool(args_schema=QatarIDInfo)
-# # def verify_qatar_id_info(name='', name_ar='', nationality='', id_number='', dob='', expiry_date='', occupation='',
-# # occupation_en=''):
-# # if occupation_en == '':
-# # occupation_en = GoogleTranslator('ar', 'en').translate(occupation)
-# #
-# # return {**locals()}
-# #
-# #
-# # def route(result):
-# # if isinstance(result, AgentFinish):
-# # return result.return_values['output']
-# # else:
-# # tools = {
-# # "verify_qatar_id_info": verify_qatar_id_info
-# # }
-# # return tools[result.tool].run(result.tool_input)
-# #
-
-# def qatar_id_info_chain(ocr_text, openai_key):
-# gpt_model = 'gpt-4o'
-
-# prompt = ChatPromptTemplate.from_messages([
-# ("system",
-# "Extract the relevant information, if not explicitly provided do not guess, leave empty string. Extract partial info. Translate where explicity stated."
-# ),
-# ("user", "{ocr_text}")
-# ])
-
-# model = ChatOpenAI(model=gpt_model, temperature=0,
-# openai_api_key=openai_key)
-# functions = [convert_pydantic_to_openai_function(QatarIDInfo)]
-# verification_model = model.bind(functions=functions)
-# verification_chain = prompt | verification_model | JsonOutputFunctionsParser()
-
-# result = verification_chain.invoke({"ocr_text": ocr_text})
-# return result
-
-
-# def extract_name_line(ocr_text):
-# """Try to extract the English name line explicitly from OCR."""
-# match = re.search(r'(?i)\bname\b\s*[:\-]?\s*([A-Z][A-Z\s]+)', ocr_text)
-# return match.group(1).strip() if match else None
-
-
-# # ISO3166 nationality mapping
-# ISO3166_nationality_mapping = {
-# "004": "AFG", "008": "ALB", "012": "DZA", "016": "ASM", "020": "AND", "024": "AGO", "660": "AIA",
-# "010": "ATA", "028": "ATG", "032": "ARG", "051": "ARM", "533": "ABW", "036": "AUS", "040": "AUT",
-# "031": "AZE", "044": "BHS", "048": "BHR", "050": "BGD", "052": "BRB", "112": "BLR", "056": "BEL",
-# "084": "BLZ", "204": "BEN", "060": "BMU", "064": "BTN", "068": "BOL", "535": "BES", "070": "BIH",
-# "072": "BWA", "074": "BVT", "076": "BRA", "086": "IOT", "096": "BRN", "100": "BGR", "854": "BFA",
-# "108": "BDI", "132": "CPV", "116": "KHM", "120": "CMR", "124": "CAN", "136": "CYM", "140": "CAF",
-# "148": "TCD", "152": "CHL", "156": "CHN", "162": "CXR", "166": "CCK", "170": "COL", "174": "COM",
-# "180": "COD", "178": "COG", "184": "COK", "188": "CRI", "191": "HRV", "192": "CUB", "531": "CUW",
-# "196": "CYP", "203": "CZE", "384": "CIV", "208": "DNK", "262": "DJI", "212": "DMA", "214": "DOM",
-# "218": "ECU", "818": "EGY", "222": "SLV", "226": "GNQ", "232": "ERI", "080": "ERI", "233": "EST",
-# "748": "SWZ", "231": "ETH", "238": "FLK", "234": "FRO", "242": "FJI", "246": "FIN", "250": "FRA",
-# "254": "GUF", "258": "PYF", "260": "ATF", "266": "GAB", "270": "GMB", "268": "GEO", "276": "DEU",
-# "288": "GHA", "292": "GIB", "300": "GRC", "304": "GRL", "308": "GRD", "312": "GLP", "316": "GUM",
-# "320": "GTM", "831": "GGY", "324": "GIN", "624": "GNB", "328": "GUY", "332": "HTI", "334": "HMD",
-# "336": "VAT", "340": "HND", "344": "HKG", "348": "HUN", "352": "ISL", "356": "IND", "360": "IDN",
-# "364": "IRN", "368": "IRQ", "372": "IRL", "833": "IMN", "376": "ISR", "380": "ITA", "388": "JAM",
-# "392": "JPN", "832": "JEY", "400": "JOR", "398": "KAZ", "404": "KEN", "296": "KIR", "408": "PRK",
-# "410": "KOR", "414": "KWT", "417": "KGZ", "418": "LAO", "428": "LVA", "422": "LBN", "426": "LSO",
-# "430": "LBR", "434": "LBY", "438": "LIE", "440": "LTU", "442": "LUX", "446": "MAC", "450": "MDG",
-# "454": "MWI", "458": "MYS", "462": "MDV", "466": "MLI", "470": "MLT", "584": "MHL", "474": "MTQ",
-# "478": "MRT", "480": "MUS", "175": "MYT", "484": "MEX", "583": "FSM", "498": "MDA", "492": "MCO",
-# "496": "MNG", "499": "MNE", "500": "MSR", "504": "MAR", "508": "MOZ", "104": "MMR", "516": "NAM",
-# "520": "NRU", "524": "NPL", "528": "NLD", "540": "NCL", "554": "NZL", "558": "NIC", "562": "NER",
-# "566": "NGA", "570": "NIU", "574": "NFK", "580": "MNP", "578": "NOR", "512": "OMN", "586": "PAK",
-# "585": "PLW", "275": "PSE", "591": "PAN", "598": "PNG", "600": "PRY", "604": "PER", "608": "PHL",
-# "612": "PCN", "616": "POL", "620": "PRT", "630": "PRI", "634": "QAT", "807": "MKD", "642": "ROU",
-# "643": "RUS", "646": "RWA", "638": "REU", "652": "BLM", "654": "SHN", "659": "KNA", "662": "LCA",
-# "663": "MAF", "666": "SPM", "670": "VCT", "882": "WSM", "674": "SMR", "678": "STP", "682": "SAU",
-# "686": "SEN", "688": "SRB", "690": "SYC", "694": "SLE", "702": "SGP", "534": "SXM", "703": "SVK",
-# "705": "SVN", "090": "SLB", "706": "SOM", "710": "ZAF", "239": "SGS", "728": "SSD", "724": "ESP",
-# "144": "LKA", "736": "SDN", "740": "SUR", "744": "SJM", "752": "SWE", "756": "CHE", "760": "SYR",
-# "158": "TWN", "762": "TJK", "834": "TZA", "764": "THA", "626": "TLS", "768": "TGO", "772": "TKL",
-# "776": "TON", "780": "TTO", "788": "TUN", "792": "TUR", "795": "TKM", "796": "TCA", "798": "TUV",
-# "800": "UGA", "804": "UKR", "784": "ARE", "826": "GBR", "581": "UMI", "840": "USA", "858": "URY",
-# "860": "UZB", "548": "VUT", "862": "VEN", "704": "VNM", "092": "VGB", "850": "VIR", "876": "WLF",
-# "732": "ESH", "887": "YEM", "894": "ZMB", "716": "ZWE", "248": "ALA", "999": "PSE", "544": "BIH",
-# "230": "ETH", "886": "YEM", "901": "TWN"
-# }
-
-
-# def crop_second_part(img):
-# width, height = img.size
-# half_width = width // 2
-# second_part = img.crop((half_width, 0, width, height))
-# return second_part
-
-
-# def crop_third_part(img):
-# width, height = img.size
-# part_height = height // 6
-# third_part = img.crop((0, 3.7 * part_height, width, height))
-# return third_part
-
-
-# def detect_id_card(client, image_data, id_text, part=None):
-# if id_text:
-# vertices = id_text[0].bounding_poly.vertices
-# left = vertices[0].x
-# top = vertices[0].y
-# right = vertices[2].x
-# bottom = vertices[2].y
-
-# padding = 40
-# left -= padding
-# top -= padding
-# right += padding
-# bottom += padding
-
-# # img = image_data
-# # with Image.open(io.BytesIO(image_data)) as img:
-# # id_card = img.crop((max(0, left), max(0, top), right, bottom))
-
-# pil_image = Image.open(BytesIO(image_data))
-# compressed_image = BytesIO()
-# pil_image.save(compressed_image, format="JPEG", quality=50, optimize=True)
-# compressed_image_data = compressed_image.getvalue()
-# compressed_pil_image = Image.open(BytesIO(compressed_image_data))
-# id_card = compressed_pil_image.crop((max(0, left), max(0, top), right, bottom))
-
-# width, height = id_card.size
-# if width < height:
-# id_card = id_card.rotate(90, expand=True)
-
-# if part == 'second':
-# part_img = crop_second_part(id_card)
-# if part == 'third':
-# part_img = crop_third_part(id_card)
-
-# # 2nd call to vision AI
-# part_text = extract_text_from_image_data(client, part_img)
-
-# return id_card, part_img, part_text
-# else:
-# print('No text found in the image.')
-
-
-# def is_arabic(word):
-# return re.search(r'[\u0600-\u06FF]', word) is not None
-
-
-# def extract_name_ar(text):
-# # patterns = [
-# # r"(?:الاسم|الإسم):\s*([^\n]+)",
-# # r"الاسم\s+([^\n]+)"
-# # ]
-
-# patterns = [
-# r"(?:الإسم|الاسم):\s*([^\n]+)",
-# r"(?:الإسم|الاسم)\s+([^\n]+)",
-# ]
-
-# for pattern in patterns:
-# regex = re.compile(pattern, re.MULTILINE)
-# match = regex.search(text)
-# if match:
-# return match.group(1).strip()
-
-# return None
-
-
-# def extract_name_fields_from_cropped_part(text):
-# pattern = r"Name:\s*([A-Z\s-]+)"
-# name_dict = {}
-# match = re.search(pattern, text)
-
-# if match:
-# extracted_name = match.group(1).strip()
-# extracted_name = extracted_name.replace("\n", " ")
-# unnecessary_words = ['OF', 'THE']
-# extracted_name = [word for word in extracted_name.split() if word.upper() not in unnecessary_words]
-# if len(extracted_name[-1]) <= 2:
-# extracted_name = extracted_name[:-1]
-
-# extracted_name = ' '.join(extracted_name)
-
-# name_dict["name"] = extracted_name.strip()
-# name_parts = extracted_name.split()
-
-# first_name = name_parts[0].upper()
-# last_name = name_parts[-1].upper()
-
-# name_dict["first_name"] = first_name
-# name_dict["last_name"] = last_name
-# return name_dict
-
-
-# def identify_front(text):
-# front_id_keywords = ["State of Qatar"]
-# pattern = '|'.join(map(re.escape, front_id_keywords))
-
-# try:
-# if re.search(pattern, text, re.IGNORECASE):
-# return True
-# else:
-# return False
-# except:
-# return 'error'
-
-
-# def sort_dates_by_datetime(dates):
-# return sorted(dates, key=lambda x: datetime.strptime(x, '%d/%m/%Y'))
-
-
-# def extract_and_check_country(words):
-# for word in words:
-# try:
-# country = pycountry.countries.lookup(word)
-# if country:
-# return country.name.upper()
-# except LookupError:
-# pass
-
-# return ''
-
-
-# def extract_and_check_country_normalized(words):
-# normalized_words = [re.sub(r'\s+|-', '', word).lower() for word in words]
-
-# for country in pycountry.countries:
-# common_name_normalized = re.sub(r'\s+|-', '', country.name).lower()
-# official_name_normalized = re.sub(r'\s+|-', '', getattr(country, 'official_name', '')).lower()
-
-# if common_name_normalized in normalized_words or official_name_normalized in normalized_words:
-# return country.name.upper()
-
-# return ''
-
-
-# def extract_name_after_nationality(word_list, nationality):
-# nationality_index = word_list.index(nationality) if nationality in word_list else -1
-
-# if nationality_index != -1 and nationality_index < len(word_list) - 1:
-# words_after_nationality = word_list[nationality_index + 1:]
-# return words_after_nationality
-# else:
-# return []
-
-
-# def get_fuzzy_match_score(line, patterns, threshold=80):
-# result = process.extractOne(line, patterns, scorer=fuzz.WRatio)
-# if result and result[1] > threshold:
-# return result[1]
-# return None
-
-
-# def extract_occupation_in_empty_case(text):
-# pattern = re.compile(r'المهنة\s*[:]*\s*(\S*)', re.IGNORECASE)
-# lines = text.split('\n')
-
-# for i, line in enumerate(lines):
-# match = pattern.search(line)
-# if match:
-# if match.group(1):
-# return match.group(1).strip()
-# if i + 1 < len(lines):
-# return lines[i + 1].strip()
-
-# return ''
-
-
-# def extract_occupation_in_empty_case_v2(text):
-# pattern = re.compile(r'occupation\s*[:]*\s*(\S*)', re.IGNORECASE)
-# lines = text.split('\n')
-
-# for i, line in enumerate(lines):
-# match = pattern.search(line)
-# if match:
-# if match.group(1):
-# return match.group(1).strip()
-# if i + 1 < len(lines):
-# return lines[i + 1].strip()
-
-# return ''
-
-
-# def genAI(ar_front_data, model):
-# query = f"Please extract the nationality from the following text and provide the corresponding ISO 3166-1 alpha-3 country code for that nationality: {ar_front_data}"
-# response = model.generate_content(query)
-# nationality_ai = re.findall(r'\*\*(.*?)\*\*', response.text)[1]
-# return nationality_ai
-
-
-# def genAI_for_occupation(dct, model):
-# query = f"""
-# You are provided with the following front_data: {dct}.
-
-# Check if 'occupation_en' information is valid and correct. Please review this broadly without focusing on the specifics.
-# for example if (doctor teacher employee and etc it is occupation as well)
-# If 'occupation_en' match the expected values, respond with 'correct'.
-# If it is incorrect, respond with 'not_correct', if you are not able to determine then respond with 'undetermined'.
-# as a response give me 'not_correct','undetermined' or 'correct' nothing else
-# """
-# response = model.generate_content(query)
-# value = response.candidates[0].content.parts[0].text.strip()
-
-# return value
-
-
-# def genAI_for_occupation_correct(passport_details, model):
-# query = f"""
-# Please extract the occupation from the following text and provide it in this format:
-# - English: **occupation**
-# - Arabic: **occupation** return only these 2 nothing else.
-# So you will get occupation in arabic and translate it into english and send it
-# if no info about occupation then 'not_provided', for both English and Arabic: {passport_details}
-# """
-# response = model.generate_content(query)
-# occupation_ai = re.findall(r'\*\*(.*?)\*\*', response.text)
-
-# return occupation_ai
-
-
-# def genAI_for_expiry_date(ar_front_data, model):
-# query = f"""
-# Please extract the expiry date from the following text and provide it in this format(dd/mm/yyyy):
-# - expiry_date, return only this 1 variable, nothing else.
-# if no info about expiry_date found then return 'expiry_date': 'not_provided': {ar_front_data}
-# """
-# response = model.generate_content(query)
-# expiry_ai = re.findall(r'\*\*(.*?)\*\*', response.text)[1]
-
-# return expiry_ai
-
-
-# def make_api_request_with_retries(prompt: str, max_retries: int = 3, delay_seconds: float = 2):
-# """
-# Helper function to make API requests with retry logic using OpenAI
-# """
-# for attempt in range(max_retries):
-# try:
-# response = openai.ChatCompletion.create(
-# model="gpt-4o-mini",
-# temperature=0.4,
-# max_tokens=2000,
-# messages=[
-# {
-# "role": "user",
-# "content": prompt
-# }
-# ]
-# )
-# result = response.choices[0].message.content
-
-# try:
-# return json.loads(result)
-# except json.JSONDecodeError:
-# try:
-# json_match = re.search(r'```(json|python|plaintext)?\s*(.*?)\s*```|\s*({.*?})', result, re.DOTALL)
-# if json_match:
-# json_str = json_match.group(2) or json_match.group(3)
-# try:
-# return json.loads(json_str)
-# except:
-# return eval(json_str.replace("'", '"'))
-# except:
-# pass
-
-# return json.loads(result)
-
-# except Exception as e:
-# print(f"Error during API request (attempt {attempt + 1} of {max_retries}): {str(e)}")
-# if attempt < max_retries - 1:
-# time.sleep(delay_seconds)
-# else:
-# raise Exception(f"Max retries exceeded. Last error: {str(e)}")
-
-
-# def extract_numeric_fields_from_raw(ar_front_data, third_part_text, name_extracted, extract_names=False):
-# front_data = GoogleTranslator(dest='en').translate(ar_front_data)
-# id_number_pattern = r"\b\d{11}\b"
-
-# words = re.findall(r'\b[A-Z]{4,}\b', ar_front_data)
-# nationality = extract_and_check_country(words)
-
-# nationality_iso = ''
-# if not nationality:
-# nationality = extract_and_check_country_normalized(words)
-
-# if nationality:
-# try:
-# country = pycountry.countries.lookup(nationality)
-# nationality_iso = country.alpha_3
-# except:
-# nationality_iso = ''
-
-# print(f'------------Nationality from OCR: {nationality_iso}')
-# # Extract nationality from ID number
-# id_number_match = re.search(id_number_pattern, ar_front_data, re.IGNORECASE)
-# if id_number_match:
-# id_number = id_number_match.group(0)
-# # Extract nationality code from ID number (digits 4-6)
-# if len(id_number) >= 6:
-# nationality_code = id_number[3:6] # 0-based indexing, so 3:6 gives us digits 4-6
-# nationality_from_id = ISO3166_nationality_mapping.get(nationality_code, '')
-# print(f'------------Nationality from ID number code: {nationality_from_id}')
-
-# # Use nationality from ID if OCR nationality is empty, invalid, or different
-# if nationality_from_id:
-# if not nationality_iso or len(nationality_iso) != 3:
-# nationality_iso = nationality_from_id
-# elif nationality_iso != nationality_from_id:
-# nationality_iso = nationality_from_id
-# else:
-# try:
-# id_number_match = re.search(id_number_pattern, ar_front_data, re.IGNORECASE)
-# id_number = id_number_match.group(0)
-# except:
-# id_number = ''
-
-# names_list = extract_name_after_nationality(words, nationality)
-# name = ' '.join(names_list)
-# if not name:
-# name = name_extracted
-
-# dates = sort_dates_by_datetime(re.findall(r'\d{2}/\d{2}/\d{4}', ar_front_data))
-# combined_back_pattern = r'(Director General of the General Department|Directorate of Passports|Passport Number|Passport Expiry)'
-# back_match = re.search(combined_back_pattern, ar_front_data, re.IGNORECASE)
-
-# try:
-# if back_match:
-# if 'Passport' in ar_front_data:
-# ar_front_data = ar_front_data.split("Name")[0]
-
-# dates = sort_dates_by_datetime(re.findall(r'\d{2}/\d{2}/\d{4}', ar_front_data))
-
-# if len(dates) > 2:
-# dob = dates[0]
-# expiry = dates[1]
-# elif len(dates) <= 2:
-# dob = dates[0]
-# expiry = dates[-1]
-# else:
-# dob = dates[0]
-# expiry = dates[-1]
-# except:
-# try:
-# dob = dates[0]
-# expiry = dates[-1]
-# except:
-# dob = ''
-# expiry = ''
-
-# if 'Passport' in ar_front_data:
-# ar_front_data = ar_front_data.split("Name")[0]
-
-# ar_front_data_filtered = [
-# re.sub(r'\b[a-zA-Z0-9]+\b', '',
-# line.replace(':', '').replace('/', '').replace('.', '').replace('المهنة', '').replace('تاريخ الميلاد',
-# '').replace(
-# 'دولة قطر', '').replace('الرقم الشخصي', '').replace('الصلاحية', '').replace('الجنسية', '').replace(
-# 'رخصة إقامة', '').replace('الرقم', '').replace('اللى', '').replace('طو', '').replace('دولة',
-# '').replace(
-# 'الهند', '').replace('بطاقة', '').replace('إثبات', '').replace('شخصية', '').replace('ہے',
-# '').replace('۔',
-# ''))
-# for line in ar_front_data.split('\n')
-# ]
-
-# cleaned_lines = [line for line in ar_front_data_filtered if line.strip()]
-
-# patterns_to_remove = [
-# r"State Of Qatar", r"Residency Permit", r"ID\.No:", r"D\.O\.B\.:", r"D\.O\.B:",
-# r"Expiry:", r"Nationality:", r"\d{9}", r"\d{2}/\d{2}/\d{4}", r"بنغلاديش", r"الهند",
-# r"on", r"الرقم الشخصي:", r"تاريخ الميلاد:", r"الصلاحية:",
-# r"الجنسية:", r"دولة قطر", r"رخصة إقامة", r"المهنة:", r"الاسم:", r"Name:"
-# ]
-
-# if nationality:
-# patterns_to_remove.append(re.escape(nationality))
-
-# if name:
-# patterns_to_remove.append(re.escape(name))
-
-# compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in patterns_to_remove]
-
-# countries_list = ['أفغانستان', 'جزر أولاند', 'ألبانيا', 'یمنی', 'الجزائر', 'ساموا الأمريكية', 'مغربي', 'أندورا',
-# 'أنغولا', 'أنغويلا', 'القارة القطبية الجنوبية', 'أنتيغوا وبربودا', 'الأرجنتين', 'أرمينيا',
-# 'أروبا', 'أستراليا', 'النمسا', 'أذربيجان', 'باهاماس', 'البحرين', 'بنغلاديش', 'بربادوس',
-# 'بيلاروسيا', 'بلجيكا', 'بليز', 'بنين', 'برمودا', 'بوتان', 'بوليفيا', 'البوسنة والهرسك',
-# 'بوتسوانا', 'جزيرة بوفيه', 'البرازيل', 'إقليم المحيط الهندي البريطاني', 'جزر العذراء البريطانية',
-# 'بروناي', 'بلغاريا', 'بوركينا فاسو', 'بوروندي', 'كابو فيردي', 'كمبوديا', 'الكاميرون', 'كندا',
-# 'الجزر الكاريبية الهولندية', 'جزر كايمان', 'جمهورية أفريقيا الوسطى', 'تشاد', 'تشيلي', 'الصين',
-# 'جزيرة الكريسماس', 'جزر كوكوس', 'كولومبيا', 'جزر القمر', 'جمهورية الكونغو', 'جزر كوك',
-# 'كوستاريكا', 'كرواتيا', 'كوبا', 'كوراساو', 'قبرص', 'التشيك', 'الدنمارك', 'جيبوتي', 'دومينيكا',
-# 'جمهورية الدومينيكان', 'جمهورية الكونغو الديمقراطية', 'الاكوادور', 'السلفادور',
-# 'غينيا الاستوائية', 'إريتريا', 'إستونيا', 'إسواتيني', 'إثيوبيا', 'جزر فوكلاند', 'جزر فارو',
-# 'فيجي', 'فنلندا', 'فرنسا', 'غويانا الفرنسية', 'بولينزيا الفرنسية', 'أراض فرنسية جنوبية',
-# 'الجابون', 'غامبيا', '\u202bجورجيا', 'ألمانيا', 'غانا', 'جبل طارق', 'اليونان', 'جرينلاند',
-# 'غرينادا', 'غوادلوب', 'غوام', 'غواتيمالا', 'غيرنزي', 'غينيا', 'غينيا بيساو', 'غيانا', 'هايتي',
-# 'جزيرة هيرد وجزر ماكدونالد', 'هندوراس', 'هونج كونج', 'هنجاريا', 'آيسلندا', 'الهند', 'أندونيسيا',
-# 'إيران', 'العراق', 'أيرلندا', 'جزيرة مان', 'إيطاليا', 'ساحل العاج', 'جامايكا', 'اليابان', 'جيرسي',
-# 'الأردن', 'كازاخستان', 'كينيا', 'كيريباتي', 'كوسوفو', 'الكويت', 'قيرغيزستان', 'لاوس', 'لاتفيا',
-# 'لبنان', 'ليسوتو', 'ليبيريا', 'ليبيا', 'ليختنشتاين', 'ليتوانيا', 'لوكسمبورغ', 'ماكاو', 'مدغشقر',
-# 'مالاوي', 'ماليزيا', 'المالديف', 'مالي', 'مالطا', 'جزر مارشال', 'مارتينيك', 'موريتانيا',
-# 'موريشيوس', 'مايوت', 'المكسيك', 'ولايات ميكرونيسيا المتحدة', 'مولدوفا', 'موناكو', 'منغوليا',
-# 'مونتينيغرو', 'مونتسرات', 'المغرب', 'موزمبيق', 'ميانمار', 'ناميبيا', 'ناورو', 'نيبال', 'هولندا',
-# 'جزر الأنتيل الهولندية', 'كاليدونيا الجديدة', 'نيوزيلندا', 'نيكاراغوا', 'النيجر', 'نيجيريا',
-# 'نييوي', 'جزيرة نورفولك', 'كوريا الشمالية', 'مقدونيا الشمالية', 'جزر ماريانا الشمالية', 'النرويج',
-# 'سلطنة عمان', 'باكستان', 'بالاو', 'فلسطين', 'بنما', 'بابوا غينيا الجديدة', 'باراغواي', 'بيرو',
-# 'الفلبين', 'جزر بيتكيرن', 'بولندا', 'البرتغال', 'بورتوريكو', 'قطر', 'ريونيون', 'رومانيا', 'روسيا',
-# 'رواندا', 'سان بارتيلمي', 'سانت هيلينا', 'سانت كيتس ونيفيس', 'سانت لوسيا', 'سانت مارتن',
-# 'سان بيير وميكلون', 'سانت فينسنت والغرينادين', 'ساموا', 'سان مارينو', 'ساو تومي وبرينسيب',
-# 'السعودية', 'السنغال', 'صربيا', 'سيشل', 'سيراليون', 'سنغافورة', 'سانت مارتن', 'سلوفاكيا',
-# 'سلوفينيا', 'جزر سليمان', 'الصومال', 'جنوب أفريقيا', 'جورجيا الجنوبية وجزر ساندويتش الجنوبية',
-# 'كوريا الجنوبية', 'جنوب السودان', 'إسبانيا', 'سريلانكا', 'السودان', 'سورينام',
-# 'سفالبارد ويان ماين', 'السويد', 'سويسرا', 'سوريا', 'تايوان', 'طاجيكستان', 'تنزانيا', 'تايلاند',
-# 'تيمور الشرقية', 'توجو', 'توكيلاو', 'تونغا', 'ترينيداد وتوباغو', 'تونس', 'تركيا', 'تركمانستان',
-# 'جزر توركس وكايكوس', 'توفالو', 'جزر الولايات المتحدة الصغيرة النائية', 'جزر العذراء الأمريكية',
-# 'أوغندا', 'أوكرانيا', 'الإمارات العربية المتحدة', 'المملكة المتحدة', 'الولايات المتحدة الأمريكية',
-# 'أوروغواي', 'أوزبكستان', 'فانواتو', 'مدينة الفاتيكان', 'فنزويلا', 'فيتنام', 'واليس وفوتونا',
-# 'الصحراء الغربية', 'اليمن', 'زامبيا', 'زيمبابوي', 'اردني', 'اردنی', 'سریلانکا', 'پاکستان',
-# 'بيكور', 'ایران', 'المهلة']
-
-# arabic_keywords_to_remove = [
-# "الرقم الشخصي", "تاريخ الميلاد", "الصلاحية", "لدولة", "الجنسية", "دولة قطر", "رخصة إقامة", "المهنة", "الإسم",
-# "بطاقة", "إثبات", "شخصية", "ـلـة قـ", "ـة", "سلاحية"
-# ]
-
-# filtered_lines = []
-# for line in cleaned_lines:
-# match_score = get_fuzzy_match_score(line, arabic_keywords_to_remove)
-# match_score1 = get_fuzzy_match_score(line, countries_list)
-
-# if match_score or match_score1:
-# score = match_score if match_score else match_score1
-# elif not any(pattern.search(line) for pattern in compiled_patterns):
-# filtered_lines.append(line)
-
-# occupation, occupation_en = '', ''
-
-# front_data = {
-# "nationality": nationality_iso,
-# "id_number": id_number,
-# "dob": dob,
-# "expiry_date": expiry,
-# "occupation": occupation,
-# "occupation_en": occupation_en
-# }
-
-# try:
-# if extract_names:
-# prompt = f"""Please extract the following information from the text and provide it in a structured dictionary format: {{'occupation': 'abc', 'occupation_en': 'abc', 'nationality': 'XXX', 'name': 'FULL NAME', 'first_name': 'FIRST', 'last_name': 'LAST', 'name_ar': 'ARABIC NAME'}}
-# For the name fields:
-# - Extract the full name in English and split it into first and last name
-# - Extract the full name in Arabic (name_ar)
-# For occupation:
-# - Extract in both Arabic and English
-# For nationality:
-# - Provide the ISO 3166-1 alpha-3 country code
-# Here's the text: {ar_front_data}"""
-# else:
-# prompt = f"""Please extract the occupation and nationality(ISO 3166-1 alpha-3 country code) from the following text and provide it in a structured dictionary format: {{'occupation': 'abc', 'occupation_en': 'abc', 'nationality': 'XXX'}}
-# So you will get occupation in arabic and translate it into english as well and send it as part of your response. The results should always be a dictionary with only 3 keys as mentioned above and nothing else. Here's the text for your task: {ar_front_data}"""
-
-# response = make_api_request_with_retries(prompt)
-
-# if response.get('occupation', ''):
-# front_data['occupation'] = response['occupation']
-
-# if response.get('occupation_en', ''):
-# front_data['occupation_en'] = response['occupation_en']
-
-# if extract_names:
-# if response.get('name', ''):
-# front_data['name'] = response['name']
-# if response.get('first_name', ''):
-# front_data['first_name'] = response['first_name']
-# if response.get('last_name', ''):
-# front_data['last_name'] = response['last_name']
-# if response.get('name_ar', ''):
-# front_data['name_ar'] = response['name_ar']
-
-# if front_data.get('occupation_en', ''):
-# if front_data['occupation_en'].lower() in ['not available', 'unspecified', 'not specified',
-# 'not provided'] or front_data[
-# 'occupation_en'].lower().startswith('director of nationality'):
-# front_data['occupation'], front_data['occupation_en'] = '', ''
-
-# except Exception as e:
-# print(f"Error in processing the extracted data: {e}")
-# front_data['occupation'], front_data['occupation_en'] = '', ''
-
-# return front_data
-
-
-# def qatar_front_id_extraction(client, image_data, front_id_text, front_id_text_description, openai_key):
-# # cropped_id_card, third_part, third_part_text = detect_id_card(client, image_data, front_id_text, part='third')
-# # front_data = extract_name_fields_from_cropped_part(third_part_text.replace("\n", ""))
-# try:
-# english_name_raw = extract_name_line(front_id_text_description)
-# if not english_name_raw:
-# return {'error': 'covered_photo', 'error_details': 'English name not found in OCR'}
-
-
-# result = qatar_id_info_chain(front_id_text_description, openai_key)
-
-# from idvpackage.genai_utils import is_age_less_than_100, is_age_18_above
-# age_check = is_age_less_than_100(result.get('dob', ''))
-# if not age_check:
-# return {'error': 'dob_glare'}
-# if age_check == 'invalid_format':
-# return {'error':'dob_glare'}
-
-# age_check_2 = is_age_18_above(result.get('dob', ''))
-# if age_check_2=='invalid_format':
-# return {'error':'dob_glare'}
-
-
-# name = result.get("name", "")
-# name_parts = name.split()
-# first_name = name_parts[0]
-# last_name = name_parts[-1]
-
-# front_data = {
-# 'name': name,
-# 'first_name': first_name,
-# 'last_name': last_name,
-# 'name_ar': result.get('name_ar', ''),
-# 'nationality': result.get('nationality', ''),
-# 'id_number': result.get('id_number', ''),
-# 'dob': result.get('dob', ''),
-# 'expiry_date': result.get('expiry_date', ''),
-# 'occupation': result.get('occupation', ''),
-# 'occupation_en': result.get('occupation_en', '')
-# }
-
-
-# except Exception as e:
-# return {'error': 'covered_photo', 'error_details': f'Exception Thrown {e}'}
-# # if 'error' in front_data.keys():
-# # return front_data
-# # if not front_data.get('name', '') or not front_data.get('first_name', '') or not front_data.get('last_name', '') or len(front_data.get('name', '').split(' ')) < 2:
-# # front_data_temp = extract_name_fields_from_cropped_part(front_id_text_description)
-# # front_data['name'] = front_data_temp.get('name', '')
-# # front_data['first_name'] = front_data_temp.get('first_name', '')
-# # front_data['last_name'] = front_data_temp.get('last_name', '') if len(front_data_temp.get('last_name', ''))>1 else ''
-# #
-# # name_ar = extract_name_ar(front_id_text_description)
-# # if name_ar:
-# # front_data["name_ar"] = name_ar
-# # else:
-# # front_data["name_ar"] = ''
-
-# # # Check if we need to extract names using GPT
-# # need_name_extraction = not front_data.get('name', '') or not front_data.get('first_name', '') or not front_data.get('last_name', '') or not front_data.get('name_ar', '') or len(front_data.get('name', '').split(' ')) < 2
-# #
-# # numeric_fields = extract_numeric_fields_from_raw(front_id_text_description, third_part_text, front_data.get('name', ''), extract_names=need_name_extraction)
-# #
-# # #If names were extracted via GPT, update front_data with the new values
-# # if need_name_extraction:
-# # if numeric_fields.get('name', ''):
-# # front_data['name'] = numeric_fields['name']
-# # if numeric_fields.get('first_name', ''):
-# # front_data['first_name'] = numeric_fields['first_name']
-# # if numeric_fields.get('last_name', ''):
-# # front_data['last_name'] = numeric_fields['last_name']
-# # if numeric_fields.get('name_ar', ''):
-# # front_data['name_ar'] = numeric_fields['name_ar']
-# #
-# # #Update the rest of the fields
-# # front_data.update({k: v for k, v in numeric_fields.items() if k not in ['name', 'first_name', 'last_name', 'name_ar']})
-
-# if not front_data.get('expiry_date', ''):
-# try:
-# # Find all dates in dd-mm-yyyy format
-# date_pattern = r'\d{2}-\d{2}-\d{4}'
-# dates = re.findall(date_pattern, front_id_text_description)
-
-# if dates:
-# # Convert strings to datetime objects
-# date_objects = []
-# for date_str in dates:
-# try:
-# date_obj = datetime.strptime(date_str, '%d-%m-%Y')
-# date_objects.append(date_obj)
-# except ValueError:
-# continue
-
-# if date_objects:
-# # Get the latest date as expiry
-# max_date = max(date_objects)
-# front_data['expiry_date'] = max_date.strftime('%d-%m-%Y')
-# else:
-# front_data['expiry_date'] = ''
-# else:
-# front_data['expiry_date'] = ''
-# except Exception as e:
-# print(f"Error extracting expiry date: {e}")
-# front_data['expiry_date'] = ''
-# return front_data
-
-
-# def qatar_front_id_extraction_old(client, image_data, front_id_text, front_id_text_description):
-# cropped_id_card, third_part, third_part_text = detect_id_card(client, image_data, front_id_text, part='third')
-# front_data = extract_name_fields_from_cropped_part(third_part_text.replace("\n", ""))
-# if not front_data.get('name', '') or not front_data.get('first_name', '') or not front_data.get('last_name',
-# '') or len(
-# front_data.get('name', '').split(' ')) < 2:
-# front_data_temp = extract_name_fields_from_cropped_part(front_id_text_description)
-# front_data['name'] = front_data_temp.get('name', '')
-# front_data['first_name'] = front_data_temp.get('first_name', '')
-# front_data['last_name'] = front_data_temp.get('last_name', '') if len(
-# front_data_temp.get('last_name', '')) > 1 else ''
-
-# name_ar = extract_name_ar(front_id_text_description)
-# if name_ar:
-# front_data["name_ar"] = name_ar
-# else:
-# front_data["name_ar"] = ''
-
-# # Check if we need to extract names using GPT
-# need_name_extraction = not front_data.get('name', '') or not front_data.get('first_name', '') or not front_data.get(
-# 'last_name', '') or not front_data.get('name_ar', '') or len(front_data.get('name', '').split(' ')) < 2
-
-# numeric_fields = extract_numeric_fields_from_raw(front_id_text_description, third_part_text,
-# front_data.get('name', ''), extract_names=need_name_extraction)
-
-# # If names were extracted via GPT, update front_data with the new values
-# if need_name_extraction:
-# if numeric_fields.get('name', ''):
-# front_data['name'] = numeric_fields['name']
-# if numeric_fields.get('first_name', ''):
-# front_data['first_name'] = numeric_fields['first_name']
-# if numeric_fields.get('last_name', ''):
-# front_data['last_name'] = numeric_fields['last_name']
-# if numeric_fields.get('name_ar', ''):
-# front_data['name_ar'] = numeric_fields['name_ar']
-
-# # Update the rest of the fields
-# front_data.update(
-# {k: v for k, v in numeric_fields.items() if k not in ['name', 'first_name', 'last_name', 'name_ar']})
-
-# if not front_data.get('expiry_date', ''):
-# try:
-# # Find all dates in dd-mm-yyyy format
-# date_pattern = r'\d{2}-\d{2}-\d{4}'
-# dates = re.findall(date_pattern, front_id_text_description)
-
-# if dates:
-# # Convert strings to datetime objects
-# date_objects = []
-# for date_str in dates:
-# try:
-# date_obj = datetime.strptime(date_str, '%d-%m-%Y')
-# date_objects.append(date_obj)
-# except ValueError:
-# continue
-
-# if date_objects:
-# # Get the latest date as expiry
-# max_date = max(date_objects)
-# front_data['expiry_date'] = max_date.strftime('%d-%m-%Y')
-# else:
-# front_data['expiry_date'] = ''
-# else:
-# front_data['expiry_date'] = ''
-# except Exception as e:
-# print(f"Error extracting expiry date: {e}")
-# front_data['expiry_date'] = ''
-
-# return front_data
-
-
-# def extract_employer_from_back(data, passport_number, passport_date, serial_no):
-# patterns_to_remove = [r"\b[a-zA-Z0-9]+\b",
-# r"توقع حامل البطاقة",
-# r"مدير عام الجنسية والمنافذ وشؤون الوالدين",
-# r"المستقدم", r"توقع", r"حامل", r"البطاقة", r"مدير", r"عام", r"الإدارة",
-# r"الجوازات", r"مدير عام الجنسية والمناقة وشؤون الوافدين",
-# r"صل", r"تاريخ النهاء الجواز", r"تاريخ", r"الجواز", r"البطاقة", r"توقع حامل البطاقة",
-# r"رق[ـم]* ج[ـوا]*ز السفر", r"تاريخ انتهاء ?الجواز", r"الرقم المسلسل",
-# r"ن[ـو]*ع الرخص[ـة]*", r"مدير عام الإدارة العامة( للجوازات| الجورت)?",
-# r"عمل",
-# r"الارة البا",
-# r"وزارة الله",
-# r"مدير عام الجنسية والمنافذ وشؤون الوافدين",
-# r"مدير إدارة الجنسية و وثائق السفر",
-# r"العنوان منطقة",
-# r"General Director of Nationality",
-# r"Borders & Expatriates Affairs",
-# r"Passport expiry date",
-# r"تاریخ انتهاء الجواز",
-# r"Drectorate of Passports",
-# r"Directorate of Passports",
-# r"Holder's Signature",
-# r"Authority's signature",
-# r"Residericy Type",
-# r"ترفيع حامل البطاقة", r"توقيع حامل البطاقة", r"passport_number|passport_date|serial_no",
-# r"Holder's signature", r"Passport Number", r"Passport Expiry",
-# r"Serial No", r"Residency Type", r"Employer", r"Directorate of Passports",
-# r"General Director of the General", re.escape(passport_number),
-# re.escape(passport_date), re.escape(serial_no), r":",
-# ]
-
-# if 'employer' not in data.lower() or 'passport' not in data.lower():
-# employer = ''
-# return employer
-
-# # compiled_patterns = [re.compile(pattern) for pattern in patterns_to_remove]
-# compiled_patterns = [re.compile(pattern) for pattern in patterns_to_remove if pattern.strip()]
-# data = data.replace("Employer", "").replace("Employe", "").replace("المستقدم :", "").replace("المستقدم", "")
-
-# address_keywords = ["العنوان", "منطقة", "شارع"]
-# lines = [
-# line.strip() for line in data.split("\n")
-# if line.strip() and not any(keyword in line for keyword in address_keywords)
-# ]
-
-# filtered_lines = []
-# for line in lines:
-# matched = False
-# for pattern in compiled_patterns:
-# if pattern.search(line):
-# # print(f'Pattern: {pattern.pattern} matched line: {line}')
-# matched = True
-# break
-
-# if not matched:
-# filtered_lines.append(line)
-
-# # print(f'FILTERED LINES: {filtered_lines}\n')
-
-# lines = [re.sub(r'[A-Za-z0-9]', '', i) for i in filtered_lines]
-
-# # print(f'FILTERED LINES 2: {lines}\n')
-
-# try:
-# employer = max(lines, key=len)
-# except:
-# employer = ''
-
-# if employer:
-# employer.strip().replace("'", '')
-# else:
-# employer = ''
-
-# return employer
-
-
-# def qatar_back_id_extraction(back_id_text_description):
-# serial_no_pattern = r"\b\d{14}\b|\b[A-Za-z0-9]{13,16}\b"
-# passport_no_pattern = r"([A-Za-z]\d{8}|[A-Za-z]{2}\d{7}|[A-Za-z]\d{7}|[A-Za-z]\d{6})"
-# # emp_pattern = r'Employer:\s*([\w\s.]+.)\n\b'
-
-# serial_no_match = re.search(serial_no_pattern, back_id_text_description, re.IGNORECASE)
-
-# try:
-# if serial_no_match:
-# serial_no = serial_no_match.group(0)
-# else:
-# serial_no = serial_no_match.group(1)
-# except:
-# serial_no = ''
-
-# passport_no_match = re.search(passport_no_pattern, back_id_text_description, re.IGNORECASE)
-# if passport_no_match:
-# passport_no = passport_no_match.group(0)
-# else:
-# passport_no = ''
-
-# dates = sort_dates_by_datetime(re.findall(r'\d{2}/\d{2}/\d{4}', back_id_text_description))
-# passport_expiry = dates[0] if dates else ''
-
-# try:
-# back_id_text_description_original = back_id_text_description
-# if 'Name' in back_id_text_description:
-# back_id_text_description = back_id_text_description.split("Serial")[1]
-
-# employer = extract_employer_from_back(back_id_text_description, passport_no, passport_expiry, serial_no)
-# # print(f'Employer here 1: {employer}\n')
-
-# if employer is None or employer == '':
-# back_id_text_description_splitted_2 = back_id_text_description_original.split("Name")[1]
-# employer = extract_employer_from_back(back_id_text_description_splitted_2, passport_no, passport_expiry,
-# serial_no)
-# # print(f'Employer here 2: {employer}\n')
-
-# if not is_arabic(employer):
-# employer = extract_employer_from_back(back_id_text_description, passport_no, passport_expiry, serial_no)
-# # print(f'Employer here 3: {employer}\n')
-# except:
-# try:
-# employer = extract_employer_from_back(back_id_text_description, passport_no, passport_expiry, serial_no)
-# # print(f'Employer here 4: {employer}\n')
-# except:
-# employer = ''
-
-# employer_en = ''
-# if employer:
-# try:
-# employer_en = GoogleTranslator(dest='en').translate(employer)
-# if employer_en and (employer_en.startswith('Director of the Nationality') or employer_en.startswith(
-# 'Director of Nationality') or employer_en.startswith('Director General')) or employer_en == None:
-# employer, employer_en = '', ''
-# except:
-# pass
-
-# back_data = {
-# "passport_number": passport_no,
-# "passport_expiry": passport_expiry,
-# "card_number": serial_no,
-# "employer": str(employer),
-# "employer_en": employer_en,
-# "issuing_country": "QAT"
-# }
-
-# return back_data
 
 
 import base64
 import time
 from io import BytesIO
-
+
 import cv2
 
 from openai import OpenAI
@@ -1016,7 +64,7 @@ Instructions:
 
 class QatarFront(BaseModel):
 
-id_number: str = Field(...,
+id_number: str = Field(...,min_length=9, max_length=11,
 description = "The ID number exactly as shown on the card (preserve original format)",
 )
 
@@ -1075,7 +123,7 @@ class QatarBack(BaseModel):
 )
 
 passport_number: str = Field(...,
-description = "Passport number extract exactly as written on the card ex: EA0605652"
+description = "Passport number extract exactly as written on the card ex: EA0605652."
 )
 
 passport_expiry: str = Field(...,
@@ -1166,7 +214,7 @@ def _image_to_jpeg_bytesio(image) -> BytesIO:
 
 def get_response_from_openai_qat(image, side, country, openai_key):
 
-logging.info("Processing image for Qatari
+logging.info("Processing image for Qatari NID extraction OPENAI......")
 logging.info(f" and type: {type(image)}")
 try:
 image = _image_to_jpeg_bytesio(image)